From d5aa7c46692474376a3c31704cfc4783c86338f2 Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Fri, 5 Feb 2016 12:08:35 -0800 Subject: [PATCH 001/210] Initial Commit --- README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 00000000000..e2dc7471c20 --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +arrow From cbc56bf8ac423c585c782d5eda5c517ea8df8e3c Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Tue, 16 Feb 2016 21:35:38 -0800 Subject: [PATCH 002/210] Update readme and add license in root. --- LICENSE.txt | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 14 +++- 2 files changed, 215 insertions(+), 1 deletion(-) create mode 100644 LICENSE.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
index e2dc7471c20..4423a913513 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,13 @@
-arrow
+## Apache Arrow
+
+#### Powering Columnar In-Memory Analytics
+
+Arrow is a set of technologies that enable big-data systems to process and move data fast.
+
+Initial implementations include:
+
+ - [The Arrow Format](https://github.com/apache/arrow/tree/master/format)
+ - [Arrow Structures and APIs in C++](https://github.com/apache/arrow/tree/master/cpp)
+ - [Arrow Structures and APIs in Java](https://github.com/apache/arrow/tree/master/java)
+
+Arrow is an [Apache Software Foundation](http://www.apache.org) project. More info can be found at [arrow.apache.org](http://arrow.apache.org).
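The next patch (ARROW-1) introduces the Java memory-management layer: ArrowBuf, a reference-counted slice of direct memory, plus an allocator hierarchy that enforces limits. For orientation, here is a minimal usage sketch of how those pieces fit together. The RootAllocator(long) constructor and allocator.close() are assumptions about API not fully shown in this excerpt; buffer(), setLong()/getLong(), and release() appear verbatim in the patch below.

    import io.netty.buffer.ArrowBuf;
    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;

    public class AllocatorSketch {
      public static void main(String[] args) {
        // Assumed constructor: a root allocator capped at 1 MiB of direct memory.
        BufferAllocator allocator = new RootAllocator(1024 * 1024);
        ArrowBuf buf = allocator.buffer(256); // 256 bytes of direct, little-endian memory
        buf.setLong(0, 42L);                  // absolute write at byte offset 0
        long v = buf.getLong(0);              // absolute read; v == 42
        buf.release();                        // refcount reaches zero; memory returns to the allocator
        allocator.close();                    // assumed to verify no buffers remain outstanding
      }
    }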
From fa5f0299f046c46e1b2f671e5e3b4f1956522711 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Wed, 17 Feb 2016 04:37:53 -0800 Subject: [PATCH 003/210] ARROW-1: Initial Arrow Code Commit --- java/.gitignore | 22 + java/memory/pom.xml | 50 + .../main/java/io/netty/buffer/ArrowBuf.java | 863 ++++++++++++++++++ .../io/netty/buffer/ExpandableByteBuf.java | 55 ++ .../java/io/netty/buffer/LargeBuffer.java | 59 ++ .../netty/buffer/MutableWrappedByteBuf.java | 336 +++++++ .../netty/buffer/PooledByteBufAllocatorL.java | 272 ++++++ .../buffer/UnsafeDirectLittleEndian.java | 270 ++++++ .../org/apache/arrow/memory/Accountant.java | 272 ++++++ .../arrow/memory/AllocationManager.java | 433 +++++++++ .../arrow/memory/AllocationReservation.java | 86 ++ .../memory/AllocatorClosedException.java | 31 + .../apache/arrow/memory/BaseAllocator.java | 781 ++++++++++++++++ .../apache/arrow/memory/BoundsChecking.java | 35 + .../apache/arrow/memory/BufferAllocator.java | 151 +++ .../apache/arrow/memory/BufferManager.java | 66 ++ .../apache/arrow/memory/ChildAllocator.java | 53 ++ .../arrow/memory/DrillByteBufAllocator.java | 141 +++ .../arrow/memory/OutOfMemoryException.java | 50 + .../java/org/apache/arrow/memory/README.md | 121 +++ .../apache/arrow/memory/RootAllocator.java | 39 + .../org/apache/arrow/memory/package-info.java | 24 + .../arrow/memory/util/AssertionUtil.java | 37 + .../arrow/memory/util/AutoCloseableLock.java | 43 + .../arrow/memory/util/HistoricalLog.java | 185 ++++ .../org/apache/arrow/memory/util/Metrics.java | 40 + .../org/apache/arrow/memory/util/Pointer.java | 28 + .../apache/arrow/memory/util/StackTrace.java | 70 ++ .../src/main/resources/drill-module.conf | 25 + .../apache/arrow/memory/TestAccountant.java | 164 ++++ .../arrow/memory/TestBaseAllocator.java | 648 +++++++++++++ .../apache/arrow/memory/TestEndianess.java | 43 + java/pom.xml | 470 ++++++++++ java/vector/pom.xml | 165 ++++ java/vector/src/main/codegen/config.fmpp | 24 + .../main/codegen/data/ValueVectorTypes.tdd | 168 ++++ .../src/main/codegen/includes/license.ftl | 18 + .../src/main/codegen/includes/vv_imports.ftl | 62 ++ .../templates/AbstractFieldReader.java | 124 +++ .../templates/AbstractFieldWriter.java | 147 +++ .../AbstractPromotableFieldWriter.java | 142 +++ .../main/codegen/templates/BaseReader.java | 73 ++ .../main/codegen/templates/BaseWriter.java | 117 +++ .../codegen/templates/BasicTypeHelper.java | 538 +++++++++++ .../main/codegen/templates/ComplexCopier.java | 133 +++ .../codegen/templates/ComplexReaders.java | 183 ++++ .../codegen/templates/ComplexWriters.java | 151 +++ .../codegen/templates/FixedValueVectors.java | 813 +++++++++++++++++ .../codegen/templates/HolderReaderImpl.java | 290 ++++++ .../main/codegen/templates/ListWriters.java | 234 +++++ .../main/codegen/templates/MapWriters.java | 240 +++++ .../main/codegen/templates/NullReader.java | 138 +++ .../templates/NullableValueVectors.java | 630 +++++++++++++ .../templates/RepeatedValueVectors.java | 421 +++++++++ .../codegen/templates/UnionListWriter.java | 185 ++++ .../main/codegen/templates/UnionReader.java | 194 ++++ .../main/codegen/templates/UnionVector.java | 467 ++++++++++ .../main/codegen/templates/UnionWriter.java | 228 +++++ .../main/codegen/templates/ValueHolders.java | 116 +++ .../templates/VariableLengthVectors.java | 644 +++++++++++++ .../apache/arrow/vector/AddOrGetResult.java | 38 + .../apache/arrow/vector/AllocationHelper.java | 61 ++ .../arrow/vector/BaseDataValueVector.java | 91 ++ .../apache/arrow/vector/BaseValueVector.java | 125 +++ 
.../org/apache/arrow/vector/BitVector.java | 450 +++++++++ .../apache/arrow/vector/FixedWidthVector.java | 35 + .../apache/arrow/vector/NullableVector.java | 23 + .../NullableVectorDefinitionSetter.java | 23 + .../org/apache/arrow/vector/ObjectVector.java | 220 +++++ .../arrow/vector/SchemaChangeCallBack.java | 52 ++ .../arrow/vector/ValueHolderHelper.java | 203 ++++ .../org/apache/arrow/vector/ValueVector.java | 222 +++++ .../arrow/vector/VariableWidthVector.java | 51 ++ .../apache/arrow/vector/VectorDescriptor.java | 83 ++ .../apache/arrow/vector/VectorTrimmer.java | 33 + .../org/apache/arrow/vector/ZeroVector.java | 181 ++++ .../complex/AbstractContainerVector.java | 143 +++ .../vector/complex/AbstractMapVector.java | 278 ++++++ .../complex/BaseRepeatedValueVector.java | 260 ++++++ .../vector/complex/ContainerVectorLike.java | 43 + .../vector/complex/EmptyValuePopulator.java | 54 ++ .../arrow/vector/complex/ListVector.java | 321 +++++++ .../arrow/vector/complex/MapVector.java | 374 ++++++++ .../arrow/vector/complex/Positionable.java | 22 + .../complex/RepeatedFixedWidthVectorLike.java | 40 + .../vector/complex/RepeatedListVector.java | 428 +++++++++ .../vector/complex/RepeatedMapVector.java | 584 ++++++++++++ .../vector/complex/RepeatedValueVector.java | 85 ++ .../RepeatedVariableWidthVectorLike.java | 35 + .../arrow/vector/complex/StateTool.java | 34 + .../vector/complex/VectorWithOrdinal.java | 30 + .../complex/impl/AbstractBaseReader.java | 100 ++ .../complex/impl/AbstractBaseWriter.java | 59 ++ .../complex/impl/ComplexWriterImpl.java | 193 ++++ .../complex/impl/MapOrListWriterImpl.java | 112 +++ .../vector/complex/impl/PromotableWriter.java | 196 ++++ .../complex/impl/RepeatedListReaderImpl.java | 145 +++ .../complex/impl/RepeatedMapReaderImpl.java | 192 ++++ .../impl/SingleLikeRepeatedMapReaderImpl.java | 89 ++ .../complex/impl/SingleListReaderImpl.java | 88 ++ .../complex/impl/SingleMapReaderImpl.java | 108 +++ .../vector/complex/impl/UnionListReader.java | 98 ++ .../vector/complex/reader/FieldReader.java | 29 + .../vector/complex/writer/FieldWriter.java | 27 + .../arrow/vector/holders/ComplexHolder.java | 25 + .../arrow/vector/holders/ObjectHolder.java | 38 + .../vector/holders/RepeatedListHolder.java | 23 + .../vector/holders/RepeatedMapHolder.java | 23 + .../arrow/vector/holders/UnionHolder.java | 37 + .../arrow/vector/holders/ValueHolder.java | 31 + .../arrow/vector/types/MaterializedField.java | 217 +++++ .../org/apache/arrow/vector/types/Types.java | 132 +++ .../vector/util/ByteFunctionHelpers.java | 233 +++++ .../apache/arrow/vector/util/CallBack.java | 23 + .../arrow/vector/util/CoreDecimalUtility.java | 91 ++ .../apache/arrow/vector/util/DateUtility.java | 682 ++++++++++++++ .../arrow/vector/util/DecimalUtility.java | 737 +++++++++++++++ .../vector/util/JsonStringArrayList.java | 57 ++ .../arrow/vector/util/JsonStringHashMap.java | 76 ++ .../arrow/vector/util/MapWithOrdinal.java | 248 +++++ .../util/OversizedAllocationException.java | 49 + .../util/SchemaChangeRuntimeException.java | 41 + .../org/apache/arrow/vector/util/Text.java | 621 +++++++++++++ .../arrow/vector/util/TransferPair.java | 27 + 124 files changed, 22077 insertions(+) create mode 100644 java/.gitignore create mode 100644 java/memory/pom.xml create mode 100644 java/memory/src/main/java/io/netty/buffer/ArrowBuf.java create mode 100644 java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java create mode 100644 java/memory/src/main/java/io/netty/buffer/LargeBuffer.java create mode 100644 
java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java create mode 100644 java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java create mode 100644 java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/Accountant.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/AllocationReservation.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/BoundsChecking.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/OutOfMemoryException.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/README.md create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/RootAllocator.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/package-info.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/AssertionUtil.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/AutoCloseableLock.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/Metrics.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/Pointer.java create mode 100644 java/memory/src/main/java/org/apache/arrow/memory/util/StackTrace.java create mode 100644 java/memory/src/main/resources/drill-module.conf create mode 100644 java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java create mode 100644 java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java create mode 100644 java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java create mode 100644 java/pom.xml create mode 100644 java/vector/pom.xml create mode 100644 java/vector/src/main/codegen/config.fmpp create mode 100644 java/vector/src/main/codegen/data/ValueVectorTypes.tdd create mode 100644 java/vector/src/main/codegen/includes/license.ftl create mode 100644 java/vector/src/main/codegen/includes/vv_imports.ftl create mode 100644 java/vector/src/main/codegen/templates/AbstractFieldReader.java create mode 100644 java/vector/src/main/codegen/templates/AbstractFieldWriter.java create mode 100644 java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java create mode 100644 java/vector/src/main/codegen/templates/BaseReader.java create mode 100644 java/vector/src/main/codegen/templates/BaseWriter.java create mode 100644 java/vector/src/main/codegen/templates/BasicTypeHelper.java create mode 100644 java/vector/src/main/codegen/templates/ComplexCopier.java create mode 100644 java/vector/src/main/codegen/templates/ComplexReaders.java create mode 100644 java/vector/src/main/codegen/templates/ComplexWriters.java create mode 100644 java/vector/src/main/codegen/templates/FixedValueVectors.java create mode 
100644 java/vector/src/main/codegen/templates/HolderReaderImpl.java create mode 100644 java/vector/src/main/codegen/templates/ListWriters.java create mode 100644 java/vector/src/main/codegen/templates/MapWriters.java create mode 100644 java/vector/src/main/codegen/templates/NullReader.java create mode 100644 java/vector/src/main/codegen/templates/NullableValueVectors.java create mode 100644 java/vector/src/main/codegen/templates/RepeatedValueVectors.java create mode 100644 java/vector/src/main/codegen/templates/UnionListWriter.java create mode 100644 java/vector/src/main/codegen/templates/UnionReader.java create mode 100644 java/vector/src/main/codegen/templates/UnionVector.java create mode 100644 java/vector/src/main/codegen/templates/UnionWriter.java create mode 100644 java/vector/src/main/codegen/templates/ValueHolders.java create mode 100644 java/vector/src/main/codegen/templates/VariableLengthVectors.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BitVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java create mode 100644 
java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/types/Types.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java create mode 100644 
java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java
diff --git a/java/.gitignore b/java/.gitignore
new file mode 100644
index 00000000000..73c1be49122
--- /dev/null
+++ b/java/.gitignore
@@ -0,0 +1,22 @@
+.project
+.buildpath
+.classpath
+.checkstyle
+.settings/
+.idea/
+TAGS
+*.log
+*.lck
+*.iml
+target/
+*.DS_Store
+*.patch
+*~
+git.properties
+contrib/native/client/build/
+contrib/native/client/build/*
+CMakeCache.txt
+CMakeFiles
+Makefile
+cmake_install.cmake
+install_manifest.txt
diff --git a/java/memory/pom.xml b/java/memory/pom.xml
new file mode 100644
index 00000000000..44332f5ed14
--- /dev/null
+++ b/java/memory/pom.xml
@@ -0,0 +1,50 @@
+<?xml version="1.0"?>
+<!--
+  Licensed under the Apache License, Version 2.0 (the "License"); you may not
+  use this file except in compliance with the License. You may obtain a copy
+  of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+  WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+  License for the specific language governing permissions and limitations under
+  the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.arrow</groupId>
+    <artifactId>arrow-java-root</artifactId>
+    <version>0.1-SNAPSHOT</version>
+  </parent>
+  <artifactId>arrow-memory</artifactId>
+  <name>arrow-memory</name>
+
+  <dependencies>
+    <dependency>
+      <groupId>com.codahale.metrics</groupId>
+      <artifactId>metrics-core</artifactId>
+      <version>3.0.1</version>
+    </dependency>
+    <dependency>
+      <groupId>com.google.code.findbugs</groupId>
+      <artifactId>jsr305</artifactId>
+      <version>3.0.1</version>
+    </dependency>
+    <dependency>
+      <groupId>com.carrotsearch</groupId>
+      <artifactId>hppc</artifactId>
+      <version>0.7.1</version>
+    </dependency>
+  </dependencies>
+
+</project>
diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java
new file mode 100644
index 00000000000..f033ba6538e
--- /dev/null
+++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java
@@ -0,0 +1,863 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.netty.buffer;
+
+import io.netty.util.internal.PlatformDependent;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.GatheringByteChannel;
+import java.nio.channels.ScatteringByteChannel;
+import java.nio.charset.Charset;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.arrow.memory.BaseAllocator;
+import org.apache.arrow.memory.BoundsChecking;
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.BufferManager;
+import org.apache.arrow.memory.AllocationManager.BufferLedger;
+import org.apache.arrow.memory.BaseAllocator.Verbosity;
+import org.apache.arrow.memory.util.HistoricalLog;
+
+import com.google.common.base.Preconditions;
+
+public final class ArrowBuf extends AbstractByteBuf implements AutoCloseable {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ArrowBuf.class);
+
+  private static final AtomicLong idGenerator = new AtomicLong(0);
+
+  private final long id = idGenerator.incrementAndGet();
+  private final AtomicInteger refCnt;
+  private final UnsafeDirectLittleEndian udle;
+  private final long addr;
+  private final int offset;
+  private final BufferLedger ledger;
+  private final BufferManager bufManager;
+  private final ByteBufAllocator alloc;
+  private final boolean isEmpty;
+  private volatile int length;
+  private final HistoricalLog historicalLog = BaseAllocator.DEBUG ?
+      new HistoricalLog(BaseAllocator.DEBUG_LOG_LENGTH, "DrillBuf[%d]", id) : null;
+
+  public ArrowBuf(
+      final AtomicInteger refCnt,
+      final BufferLedger ledger,
+      final UnsafeDirectLittleEndian byteBuf,
+      final BufferManager manager,
+      final ByteBufAllocator alloc,
+      final int offset,
+      final int length,
+      boolean isEmpty) {
+    super(byteBuf.maxCapacity());
+    this.refCnt = refCnt;
+    this.udle = byteBuf;
+    this.isEmpty = isEmpty;
+    this.bufManager = manager;
+    this.alloc = alloc;
+    this.addr = byteBuf.memoryAddress() + offset;
+    this.ledger = ledger;
+    this.length = length;
+    this.offset = offset;
+
+    if (BaseAllocator.DEBUG) {
+      historicalLog.recordEvent("create()");
+    }
+  }
+
+  public ArrowBuf reallocIfNeeded(final int size) {
+    Preconditions.checkArgument(size >= 0, "reallocation size must be non-negative");
+
+    if (this.capacity() >= size) {
+      return this;
+    }
+
+    if (bufManager != null) {
+      return bufManager.replace(this, size);
+    } else {
+      throw new UnsupportedOperationException("Realloc is only available in the context of an operator's UDFs");
+    }
+  }
+
+  @Override
+  public int refCnt() {
+    if (isEmpty) {
+      return 1;
+    } else {
+      return refCnt.get();
+    }
+  }
+
+  private long addr(int index) {
+    return addr + index;
+  }
+
+  private final void checkIndexD(int index, int fieldLength) {
+    ensureAccessible();
+    if (fieldLength < 0) {
+      throw new IllegalArgumentException("length: " + fieldLength + " (expected: >= 0)");
+    }
+    if (index < 0 || index > capacity() - fieldLength) {
+      if (BaseAllocator.DEBUG) {
+        historicalLog.logHistory(logger);
+      }
+      throw new IndexOutOfBoundsException(String.format(
+          "index: %d, length: %d (expected: range(0, %d))", index, fieldLength, capacity()));
+    }
+  }
+
+  /**
+   * Allows a caller to check whether reading a particular range of bytes is valid.
+   *
+   * Will throw an exception if the memory is not readable for some reason. The check is only performed when
+   * BoundsChecking.BOUNDS_CHECKING_ENABLED is true.
+   *
+   * @param start
+   *          The starting position of the bytes to be read.
+   * @param end
+   *          The exclusive endpoint of the bytes to be read.
+   */
+  public void checkBytes(int start, int end) {
+    if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
+      checkIndexD(start, end - start);
+    }
+  }
+
+  private void chk(int index, int width) {
+    if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
+      checkIndexD(index, width);
+    }
+  }
+
+  private void ensure(int width) {
+    if (BoundsChecking.BOUNDS_CHECKING_ENABLED) {
+      ensureWritable(width);
+    }
+  }
+
+  /**
+   * Create a new ArrowBuf that is associated with an alternative allocator for the purposes of memory ownership and
+   * accounting. This has no impact on the reference counting for the current ArrowBuf except in the situation where
+   * the passed-in allocator is the same as the current buffer's allocator.
+   *
+   * The newly created ArrowBuf will either have a
+   * reference count of 1 (in the case that this is the first time this memory is being associated with the new
+   * allocator) or the current value of the reference count + 1 for the other AllocationManager/BufferLedger combination
+   * in the case that the provided allocator already had an association to this underlying memory.
+   *
+   * @param target
+   *          The target allocator to create an association with.
+   * @return A new ArrowBuf which shares the same underlying memory as this ArrowBuf.
+   */
+  public ArrowBuf retain(BufferAllocator target) {
+
+    if (isEmpty) {
+      return this;
+    }
+
+    if (BaseAllocator.DEBUG) {
+      historicalLog.recordEvent("retain(%s)", target.getName());
+    }
+    final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target);
+    return otherLedger.newDrillBuf(offset, length, null);
+  }
+
+  /**
+   * Transfer the memory accounting ownership of this ArrowBuf to another allocator. This will generate a new ArrowBuf
+   * that carries an association with the underlying memory of this ArrowBuf. If this ArrowBuf is connected to the
+   * owning BufferLedger of this memory, that memory ownership/accounting will be transferred to the target allocator.
+   * If this ArrowBuf does not currently own the memory underlying it (and is only associated with it), this does not
+   * transfer any ownership to the newly created ArrowBuf.
+   *
+   * This operation has no impact on the reference count of this ArrowBuf. The newly created ArrowBuf will either have
+   * a reference count of 1 (in the case that this is the first time this memory is being associated with the new
+   * allocator) or the current value of the reference count for the other AllocationManager/BufferLedger combination in
+   * the case that the provided allocator already had an association to this underlying memory.
+   *
+   * Transfers will always succeed, even if that puts the other allocator into an overlimit situation. This is possible
+   * due to the fact that the original owning allocator may have allocated this memory out of a local reservation
+   * whereas the target allocator may need to allocate new memory from a parent or RootAllocator. This operation is done
+   * in a mostly-lockless but consistent manner. As such, the overlimit==true situation could occur slightly prematurely
+   * to an actual overlimit==true condition. This is simply conservative behavior which means we may return overlimit
+   * slightly sooner than is necessary.
+   *
+   * @param target
+   *          The allocator to transfer ownership to.
+ * @return A new transfer result with the impact of the transfer (whether it was overlimit) as well as the newly + * created DrillBuf. + */ + public TransferResult transferOwnership(BufferAllocator target) { + + if (isEmpty) { + return new TransferResult(true, this); + } + + final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target); + final ArrowBuf newBuf = otherLedger.newDrillBuf(offset, length, null); + final boolean allocationFit = this.ledger.transferBalance(otherLedger); + return new TransferResult(allocationFit, newBuf); + } + + /** + * The outcome of a Transfer. + */ + public class TransferResult { + + /** + * Whether this transfer fit within the target allocator's capacity. + */ + public final boolean allocationFit; + + /** + * The newly created buffer associated with the target allocator. + */ + public final ArrowBuf buffer; + + private TransferResult(boolean allocationFit, ArrowBuf buffer) { + this.allocationFit = allocationFit; + this.buffer = buffer; + } + + } + + @Override + public boolean release() { + return release(1); + } + + /** + * Release the provided number of reference counts. + */ + @Override + public boolean release(int decrement) { + + if (isEmpty) { + return false; + } + + if (decrement < 1) { + throw new IllegalStateException(String.format("release(%d) argument is not positive. Buffer Info: %s", + decrement, toVerboseString())); + } + + final int refCnt = ledger.decrement(decrement); + + if (BaseAllocator.DEBUG) { + historicalLog.recordEvent("release(%d). original value: %d", decrement, refCnt + decrement); + } + + if (refCnt < 0) { + throw new IllegalStateException( + String.format("DrillBuf[%d] refCnt has gone negative. Buffer Info: %s", id, toVerboseString())); + } + + return refCnt == 0; + + } + + @Override + public int capacity() { + return length; + } + + @Override + public synchronized ArrowBuf capacity(int newCapacity) { + + if (newCapacity == length) { + return this; + } + + Preconditions.checkArgument(newCapacity >= 0); + + if (newCapacity < length) { + length = newCapacity; + return this; + } + + throw new UnsupportedOperationException("Buffers don't support resizing that increases the size."); + } + + @Override + public ByteBufAllocator alloc() { + return udle.alloc(); + } + + @Override + public ByteOrder order() { + return ByteOrder.LITTLE_ENDIAN; + } + + @Override + public ByteBuf order(ByteOrder endianness) { + return this; + } + + @Override + public ByteBuf unwrap() { + return udle; + } + + @Override + public boolean isDirect() { + return true; + } + + @Override + public ByteBuf readBytes(int length) { + throw new UnsupportedOperationException(); + } + + @Override + public ByteBuf readSlice(int length) { + final ByteBuf slice = slice(readerIndex(), length); + readerIndex(readerIndex() + length); + return slice; + } + + @Override + public ByteBuf copy() { + throw new UnsupportedOperationException(); + } + + @Override + public ByteBuf copy(int index, int length) { + throw new UnsupportedOperationException(); + } + + @Override + public ByteBuf slice() { + return slice(readerIndex(), readableBytes()); + } + + public static String bufferState(final ByteBuf buf) { + final int cap = buf.capacity(); + final int mcap = buf.maxCapacity(); + final int ri = buf.readerIndex(); + final int rb = buf.readableBytes(); + final int wi = buf.writerIndex(); + final int wb = buf.writableBytes(); + return String.format("cap/max: %d/%d, ri: %d, rb: %d, wi: %d, wb: %d", + cap, mcap, ri, rb, wi, wb); + } + + @Override + public ArrowBuf slice(int index, 
int length) { + + if (isEmpty) { + return this; + } + + /* + * Re the behavior of reference counting, see http://netty.io/wiki/reference-counted-objects.html#wiki-h3-5, which + * explains that derived buffers share their reference count with their parent + */ + final ArrowBuf newBuf = ledger.newDrillBuf(offset + index, length); + newBuf.writerIndex(length); + return newBuf; + } + + @Override + public ArrowBuf duplicate() { + return slice(0, length); + } + + @Override + public int nioBufferCount() { + return 1; + } + + @Override + public ByteBuffer nioBuffer() { + return nioBuffer(readerIndex(), readableBytes()); + } + + @Override + public ByteBuffer nioBuffer(int index, int length) { + return udle.nioBuffer(offset + index, length); + } + + @Override + public ByteBuffer internalNioBuffer(int index, int length) { + return udle.internalNioBuffer(offset + index, length); + } + + @Override + public ByteBuffer[] nioBuffers() { + return new ByteBuffer[] { nioBuffer() }; + } + + @Override + public ByteBuffer[] nioBuffers(int index, int length) { + return new ByteBuffer[] { nioBuffer(index, length) }; + } + + @Override + public boolean hasArray() { + return udle.hasArray(); + } + + @Override + public byte[] array() { + return udle.array(); + } + + @Override + public int arrayOffset() { + return udle.arrayOffset(); + } + + @Override + public boolean hasMemoryAddress() { + return true; + } + + @Override + public long memoryAddress() { + return this.addr; + } + + @Override + public String toString() { + return String.format("DrillBuf[%d], udle: [%d %d..%d]", id, udle.id, offset, offset + capacity()); + } + + @Override + public String toString(Charset charset) { + return toString(readerIndex, readableBytes(), charset); + } + + @Override + public String toString(int index, int length, Charset charset) { + + if (length == 0) { + return ""; + } + + return ByteBufUtil.decodeString(nioBuffer(index, length), charset); + } + + @Override + public int hashCode() { + return System.identityHashCode(this); + } + + @Override + public boolean equals(Object obj) { + // identity equals only. 
+ return this == obj; + } + + @Override + public ByteBuf retain(int increment) { + Preconditions.checkArgument(increment > 0, "retain(%d) argument is not positive", increment); + + if (isEmpty) { + return this; + } + + if (BaseAllocator.DEBUG) { + historicalLog.recordEvent("retain(%d)", increment); + } + + final int originalReferenceCount = refCnt.getAndAdd(increment); + Preconditions.checkArgument(originalReferenceCount > 0); + return this; + } + + @Override + public ByteBuf retain() { + return retain(1); + } + + @Override + public long getLong(int index) { + chk(index, 8); + final long v = PlatformDependent.getLong(addr(index)); + return v; + } + + @Override + public float getFloat(int index) { + return Float.intBitsToFloat(getInt(index)); + } + + @Override + public double getDouble(int index) { + return Double.longBitsToDouble(getLong(index)); + } + + @Override + public char getChar(int index) { + return (char) getShort(index); + } + + @Override + public long getUnsignedInt(int index) { + return getInt(index) & 0xFFFFFFFFL; + } + + @Override + public int getInt(int index) { + chk(index, 4); + final int v = PlatformDependent.getInt(addr(index)); + return v; + } + + @Override + public int getUnsignedShort(int index) { + return getShort(index) & 0xFFFF; + } + + @Override + public short getShort(int index) { + chk(index, 2); + short v = PlatformDependent.getShort(addr(index)); + return v; + } + + @Override + public ByteBuf setShort(int index, int value) { + chk(index, 2); + PlatformDependent.putShort(addr(index), (short) value); + return this; + } + + @Override + public ByteBuf setInt(int index, int value) { + chk(index, 4); + PlatformDependent.putInt(addr(index), value); + return this; + } + + @Override + public ByteBuf setLong(int index, long value) { + chk(index, 8); + PlatformDependent.putLong(addr(index), value); + return this; + } + + @Override + public ByteBuf setChar(int index, int value) { + chk(index, 2); + PlatformDependent.putShort(addr(index), (short) value); + return this; + } + + @Override + public ByteBuf setFloat(int index, float value) { + chk(index, 4); + PlatformDependent.putInt(addr(index), Float.floatToRawIntBits(value)); + return this; + } + + @Override + public ByteBuf setDouble(int index, double value) { + chk(index, 8); + PlatformDependent.putLong(addr(index), Double.doubleToRawLongBits(value)); + return this; + } + + @Override + public ByteBuf writeShort(int value) { + ensure(2); + PlatformDependent.putShort(addr(writerIndex), (short) value); + writerIndex += 2; + return this; + } + + @Override + public ByteBuf writeInt(int value) { + ensure(4); + PlatformDependent.putInt(addr(writerIndex), value); + writerIndex += 4; + return this; + } + + @Override + public ByteBuf writeLong(long value) { + ensure(8); + PlatformDependent.putLong(addr(writerIndex), value); + writerIndex += 8; + return this; + } + + @Override + public ByteBuf writeChar(int value) { + ensure(2); + PlatformDependent.putShort(addr(writerIndex), (short) value); + writerIndex += 2; + return this; + } + + @Override + public ByteBuf writeFloat(float value) { + ensure(4); + PlatformDependent.putInt(addr(writerIndex), Float.floatToRawIntBits(value)); + writerIndex += 4; + return this; + } + + @Override + public ByteBuf writeDouble(double value) { + ensure(8); + PlatformDependent.putLong(addr(writerIndex), Double.doubleToRawLongBits(value)); + writerIndex += 8; + return this; + } + + @Override + public ByteBuf getBytes(int index, byte[] dst, int dstIndex, int length) { + udle.getBytes(index + offset, dst, 
dstIndex, length);
+    return this;
+  }
+
+  @Override
+  public ByteBuf getBytes(int index, ByteBuffer dst) {
+    udle.getBytes(index + offset, dst);
+    return this;
+  }
+
+  @Override
+  public ByteBuf setByte(int index, int value) {
+    chk(index, 1);
+    PlatformDependent.putByte(addr(index), (byte) value);
+    return this;
+  }
+
+  public void setByte(int index, byte b) {
+    chk(index, 1);
+    PlatformDependent.putByte(addr(index), b);
+  }
+
+  public void writeByteUnsafe(byte b) {
+    // write at the writer index, not the reader index
+    PlatformDependent.putByte(addr(writerIndex), b);
+    writerIndex++;
+  }
+
+  @Override
+  protected byte _getByte(int index) {
+    return getByte(index);
+  }
+
+  @Override
+  protected short _getShort(int index) {
+    return getShort(index);
+  }
+
+  @Override
+  protected int _getInt(int index) {
+    return getInt(index);
+  }
+
+  @Override
+  protected long _getLong(int index) {
+    return getLong(index);
+  }
+
+  @Override
+  protected void _setByte(int index, int value) {
+    setByte(index, value);
+  }
+
+  @Override
+  protected void _setShort(int index, int value) {
+    setShort(index, value);
+  }
+
+  @Override
+  protected void _setMedium(int index, int value) {
+    setMedium(index, value);
+  }
+
+  @Override
+  protected void _setInt(int index, int value) {
+    setInt(index, value);
+  }
+
+  @Override
+  protected void _setLong(int index, long value) {
+    setLong(index, value);
+  }
+
+  @Override
+  public ByteBuf getBytes(int index, ByteBuf dst, int dstIndex, int length) {
+    udle.getBytes(index + offset, dst, dstIndex, length);
+    return this;
+  }
+
+  @Override
+  public ByteBuf getBytes(int index, OutputStream out, int length) throws IOException {
+    udle.getBytes(index + offset, out, length);
+    return this;
+  }
+
+  @Override
+  protected int _getUnsignedMedium(int index) {
+    final long addr = addr(index);
+    return (PlatformDependent.getByte(addr) & 0xff) << 16 |
+        (PlatformDependent.getByte(addr + 1) & 0xff) << 8 |
+        PlatformDependent.getByte(addr + 2) & 0xff;
+  }
+
+  @Override
+  public int getBytes(int index, GatheringByteChannel out, int length) throws IOException {
+    return udle.getBytes(index + offset, out, length);
+  }
+
+  @Override
+  public ByteBuf setBytes(int index, ByteBuf src, int srcIndex, int length) {
+    udle.setBytes(index + offset, src, srcIndex, length);
+    return this;
+  }
+
+  public ByteBuf setBytes(int index, ByteBuffer src, int srcIndex, int length) {
+    if (src.isDirect()) {
+      checkIndex(index, length);
+      PlatformDependent.copyMemory(PlatformDependent.directBufferAddress(src) + srcIndex, this.memoryAddress() + index,
+          length);
+    } else {
+      if (srcIndex == 0 && src.capacity() == length) {
+        udle.setBytes(index + offset, src);
+      } else {
+        ByteBuffer newBuf = src.duplicate();
+        newBuf.position(srcIndex);
+        newBuf.limit(srcIndex + length);
+        // copy from the bounded duplicate rather than the original buffer
+        udle.setBytes(index + offset, newBuf);
+      }
+    }
+
+    return this;
+  }
+
+  @Override
+  public ByteBuf setBytes(int index, byte[] src, int srcIndex, int length) {
+    udle.setBytes(index + offset, src, srcIndex, length);
+    return this;
+  }
+
+  @Override
+  public ByteBuf setBytes(int index, ByteBuffer src) {
+    udle.setBytes(index + offset, src);
+    return this;
+  }
+
+  @Override
+  public int setBytes(int index, InputStream in, int length) throws IOException {
+    return udle.setBytes(index + offset, in, length);
+  }
+
+  @Override
+  public int setBytes(int index, ScatteringByteChannel in, int length) throws IOException {
+    return udle.setBytes(index + offset, in, length);
+  }
+
+  @Override
+  public byte getByte(int index) {
+    chk(index, 1);
+    return PlatformDependent.getByte(addr(index));
+  }
+
+  @Override
+  public void close() {
+    release();
+  }
+
+  /**
+   * Returns the possible memory consumed by this ArrowBuf in the worst-case scenario (not shared, connected to a
+   * larger underlying buffer of allocated memory).
+   *
+   * @return Size in bytes.
+   */
+  public int getPossibleMemoryConsumed() {
+    return ledger.getSize();
+  }
+
+  /**
+   * Return the memory accounted for by this buffer (and its potentially shared siblings within the context of the
+   * associated allocator).
+   *
+   * @return Size in bytes.
+   */
+  public int getActualMemoryConsumed() {
+    return ledger.getAccountedSize();
+  }
+
+  private final static int LOG_BYTES_PER_ROW = 10;
+
+  /**
+   * Return the buffer's byte contents in the form of a hex dump.
+   *
+   * @param start
+   *          the starting byte index
+   * @param length
+   *          how many bytes to log
+   * @return A hex dump in a String.
+   */
+  public String toHexString(final int start, final int length) {
+    final int roundedStart = (start / LOG_BYTES_PER_ROW) * LOG_BYTES_PER_ROW;
+
+    final StringBuilder sb = new StringBuilder("buffer byte dump\n");
+    int index = roundedStart;
+    for (int nLogged = 0; nLogged < length; nLogged += LOG_BYTES_PER_ROW) {
+      sb.append(String.format(" [%05d-%05d]", index, index + LOG_BYTES_PER_ROW - 1));
+      for (int i = 0; i < LOG_BYTES_PER_ROW; ++i) {
+        try {
+          final byte b = getByte(index++);
+          sb.append(String.format(" 0x%02x", b));
+        } catch (IndexOutOfBoundsException ioob) {
+          sb.append(" <ioob>");
+        }
+      }
+      sb.append('\n');
+    }
+    return sb.toString();
+  }
+
+  /**
+   * Get the id assigned to this ArrowBuf for debugging purposes.
+   *
+   * @return the id
+   */
+  public long getId() {
+    return id;
+  }
+
+  public String toVerboseString() {
+    if (isEmpty) {
+      return toString();
+    }
+
+    StringBuilder sb = new StringBuilder();
+    ledger.print(sb, 0, Verbosity.LOG_WITH_STACKTRACE);
+    return sb.toString();
+  }
+
+  public void print(StringBuilder sb, int indent, Verbosity verbosity) {
+    BaseAllocator.indent(sb, indent).append(toString());
+
+    if (BaseAllocator.DEBUG && !isEmpty && verbosity.includeHistoricalLog) {
+      sb.append("\n");
+      historicalLog.buildHistory(sb, indent + 1, verbosity.includeStackTraces);
+    }
+  }
+
+}
diff --git a/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java b/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java
new file mode 100644
index 00000000000..59886474923
--- /dev/null
+++ b/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.netty.buffer;
+
+import org.apache.arrow.memory.BufferAllocator;
+
+/**
+ * Allows us to decorate ArrowBuf to make it expandable so that we can use it in the context of the Netty framework
+ * (thus supporting RPC level memory accounting).
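+ *
+ * <p>A minimal usage sketch, illustrative only; it assumes a {@code BufferAllocator} named {@code allocator} is in
+ * scope, while the behavior described in the comments matches the {@code capacity(int)} override below:
+ *
+ * <pre>
+ * ByteBuf growable = new ExpandableByteBuf(allocator.buffer(64), allocator);
+ * growable.capacity(256); // allocates a 256-byte replacement from the allocator,
+ *                         // copies the old bytes, preserves reader/writer indices,
+ *                         // and releases the original buffer
+ * </pre>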
+ */ +public class ExpandableByteBuf extends MutableWrappedByteBuf { + + private final BufferAllocator allocator; + + public ExpandableByteBuf(ByteBuf buffer, BufferAllocator allocator) { + super(buffer); + this.allocator = allocator; + } + + @Override + public ByteBuf copy(int index, int length) { + return new ExpandableByteBuf(buffer.copy(index, length), allocator); + } + + @Override + public ByteBuf capacity(int newCapacity) { + if (newCapacity > capacity()) { + ByteBuf newBuf = allocator.buffer(newCapacity); + newBuf.writeBytes(buffer, 0, buffer.capacity()); + newBuf.readerIndex(buffer.readerIndex()); + newBuf.writerIndex(buffer.writerIndex()); + buffer.release(); + buffer = newBuf; + return newBuf; + } else { + return super.capacity(newCapacity); + } + } + +} diff --git a/java/memory/src/main/java/io/netty/buffer/LargeBuffer.java b/java/memory/src/main/java/io/netty/buffer/LargeBuffer.java new file mode 100644 index 00000000000..5f5e904fb04 --- /dev/null +++ b/java/memory/src/main/java/io/netty/buffer/LargeBuffer.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.netty.buffer; + +import java.util.concurrent.atomic.AtomicLong; + +/** + * A MutableWrappedByteBuf that also maintains a metric of the number of huge buffer bytes and counts. + */ +public class LargeBuffer extends MutableWrappedByteBuf { + + private final AtomicLong hugeBufferSize; + private final AtomicLong hugeBufferCount; + + private final int initCap; + + public LargeBuffer(ByteBuf buffer, AtomicLong hugeBufferSize, AtomicLong hugeBufferCount) { + super(buffer); + initCap = buffer.capacity(); + this.hugeBufferCount = hugeBufferCount; + this.hugeBufferSize = hugeBufferSize; + } + + @Override + public ByteBuf copy(int index, int length) { + return new LargeBuffer(buffer.copy(index, length), hugeBufferSize, hugeBufferCount); + } + + @Override + public boolean release() { + return release(1); + } + + @Override + public boolean release(int decrement) { + boolean released = unwrap().release(decrement); + if (released) { + hugeBufferSize.addAndGet(-initCap); + hugeBufferCount.decrementAndGet(); + } + return released; + } + +} diff --git a/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java b/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java new file mode 100644 index 00000000000..5709473135e --- /dev/null +++ b/java/memory/src/main/java/io/netty/buffer/MutableWrappedByteBuf.java @@ -0,0 +1,336 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.netty.buffer; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.GatheringByteChannel; +import java.nio.channels.ScatteringByteChannel; + +/** + * This is basically a complete copy of DuplicatedByteBuf. We copy because we want to override some behaviors and make + * buffer mutable. + */ +abstract class MutableWrappedByteBuf extends AbstractByteBuf { + + @Override + public ByteBuffer nioBuffer(int index, int length) { + return unwrap().nioBuffer(index, length); + } + + ByteBuf buffer; + + public MutableWrappedByteBuf(ByteBuf buffer) { + super(buffer.maxCapacity()); + + if (buffer instanceof MutableWrappedByteBuf) { + this.buffer = ((MutableWrappedByteBuf) buffer).buffer; + } else { + this.buffer = buffer; + } + + setIndex(buffer.readerIndex(), buffer.writerIndex()); + } + + @Override + public ByteBuf unwrap() { + return buffer; + } + + @Override + public ByteBufAllocator alloc() { + return buffer.alloc(); + } + + @Override + public ByteOrder order() { + return buffer.order(); + } + + @Override + public boolean isDirect() { + return buffer.isDirect(); + } + + @Override + public int capacity() { + return buffer.capacity(); + } + + @Override + public ByteBuf capacity(int newCapacity) { + buffer.capacity(newCapacity); + return this; + } + + @Override + public boolean hasArray() { + return buffer.hasArray(); + } + + @Override + public byte[] array() { + return buffer.array(); + } + + @Override + public int arrayOffset() { + return buffer.arrayOffset(); + } + + @Override + public boolean hasMemoryAddress() { + return buffer.hasMemoryAddress(); + } + + @Override + public long memoryAddress() { + return buffer.memoryAddress(); + } + + @Override + public byte getByte(int index) { + return _getByte(index); + } + + @Override + protected byte _getByte(int index) { + return buffer.getByte(index); + } + + @Override + public short getShort(int index) { + return _getShort(index); + } + + @Override + protected short _getShort(int index) { + return buffer.getShort(index); + } + + @Override + public int getUnsignedMedium(int index) { + return _getUnsignedMedium(index); + } + + @Override + protected int _getUnsignedMedium(int index) { + return buffer.getUnsignedMedium(index); + } + + @Override + public int getInt(int index) { + return _getInt(index); + } + + @Override + protected int _getInt(int index) { + return buffer.getInt(index); + } + + @Override + public long getLong(int index) { + return _getLong(index); + } + + @Override + protected long _getLong(int index) { + return buffer.getLong(index); + } + + @Override + public abstract ByteBuf copy(int index, int length); + + @Override + public ByteBuf slice(int index, int length) { + return new SlicedByteBuf(this, index, length); + } + + @Override + public ByteBuf getBytes(int index, ByteBuf dst, int dstIndex, int length) { + buffer.getBytes(index, 
dst, dstIndex, length); + return this; + } + + @Override + public ByteBuf getBytes(int index, byte[] dst, int dstIndex, int length) { + buffer.getBytes(index, dst, dstIndex, length); + return this; + } + + @Override + public ByteBuf getBytes(int index, ByteBuffer dst) { + buffer.getBytes(index, dst); + return this; + } + + @Override + public ByteBuf setByte(int index, int value) { + _setByte(index, value); + return this; + } + + @Override + protected void _setByte(int index, int value) { + buffer.setByte(index, value); + } + + @Override + public ByteBuf setShort(int index, int value) { + _setShort(index, value); + return this; + } + + @Override + protected void _setShort(int index, int value) { + buffer.setShort(index, value); + } + + @Override + public ByteBuf setMedium(int index, int value) { + _setMedium(index, value); + return this; + } + + @Override + protected void _setMedium(int index, int value) { + buffer.setMedium(index, value); + } + + @Override + public ByteBuf setInt(int index, int value) { + _setInt(index, value); + return this; + } + + @Override + protected void _setInt(int index, int value) { + buffer.setInt(index, value); + } + + @Override + public ByteBuf setLong(int index, long value) { + _setLong(index, value); + return this; + } + + @Override + protected void _setLong(int index, long value) { + buffer.setLong(index, value); + } + + @Override + public ByteBuf setBytes(int index, byte[] src, int srcIndex, int length) { + buffer.setBytes(index, src, srcIndex, length); + return this; + } + + @Override + public ByteBuf setBytes(int index, ByteBuf src, int srcIndex, int length) { + buffer.setBytes(index, src, srcIndex, length); + return this; + } + + @Override + public ByteBuf setBytes(int index, ByteBuffer src) { + buffer.setBytes(index, src); + return this; + } + + @Override + public ByteBuf getBytes(int index, OutputStream out, int length) + throws IOException { + buffer.getBytes(index, out, length); + return this; + } + + @Override + public int getBytes(int index, GatheringByteChannel out, int length) + throws IOException { + return buffer.getBytes(index, out, length); + } + + @Override + public int setBytes(int index, InputStream in, int length) + throws IOException { + return buffer.setBytes(index, in, length); + } + + @Override + public int setBytes(int index, ScatteringByteChannel in, int length) + throws IOException { + return buffer.setBytes(index, in, length); + } + + @Override + public int nioBufferCount() { + return buffer.nioBufferCount(); + } + + @Override + public ByteBuffer[] nioBuffers(int index, int length) { + return buffer.nioBuffers(index, length); + } + + @Override + public ByteBuffer internalNioBuffer(int index, int length) { + return nioBuffer(index, length); + } + + @Override + public int forEachByte(int index, int length, ByteBufProcessor processor) { + return buffer.forEachByte(index, length, processor); + } + + @Override + public int forEachByteDesc(int index, int length, ByteBufProcessor processor) { + return buffer.forEachByteDesc(index, length, processor); + } + + @Override + public final int refCnt() { + return unwrap().refCnt(); + } + + @Override + public final ByteBuf retain() { + unwrap().retain(); + return this; + } + + @Override + public final ByteBuf retain(int increment) { + unwrap().retain(increment); + return this; + } + + @Override + public boolean release() { + return release(1); + } + + @Override + public boolean release(int decrement) { + boolean released = unwrap().release(decrement); + return released; + } + +} diff --git 
a/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java new file mode 100644 index 00000000000..1610028df9d --- /dev/null +++ b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package io.netty.buffer; + +import io.netty.util.internal.StringUtil; + +import java.lang.reflect.Field; +import java.nio.ByteBuffer; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.arrow.memory.OutOfMemoryException; + +import com.codahale.metrics.Gauge; +import com.codahale.metrics.Histogram; +import com.codahale.metrics.Metric; +import com.codahale.metrics.MetricFilter; +import com.codahale.metrics.MetricRegistry; + +/** + * The base allocator that we use for all of Drill's memory management. Returns UnsafeDirectLittleEndian buffers. + */ +public class PooledByteBufAllocatorL { + private static final org.slf4j.Logger memoryLogger = org.slf4j.LoggerFactory.getLogger("drill.allocator"); + + private static final int MEMORY_LOGGER_FREQUENCY_SECONDS = 60; + + + public static final String METRIC_PREFIX = "drill.allocator."; + + private final MetricRegistry registry; + private final AtomicLong hugeBufferSize = new AtomicLong(0); + private final AtomicLong hugeBufferCount = new AtomicLong(0); + private final AtomicLong normalBufferSize = new AtomicLong(0); + private final AtomicLong normalBufferCount = new AtomicLong(0); + + private final InnerAllocator allocator; + public final UnsafeDirectLittleEndian empty; + + public PooledByteBufAllocatorL(MetricRegistry registry) { + this.registry = registry; + allocator = new InnerAllocator(); + empty = new UnsafeDirectLittleEndian(new DuplicatedByteBuf(Unpooled.EMPTY_BUFFER)); + } + + public UnsafeDirectLittleEndian allocate(int size) { + try { + return allocator.directBuffer(size, Integer.MAX_VALUE); + } catch (OutOfMemoryError e) { + throw new OutOfMemoryException("Failure allocating buffer.", e); + } + + } + + public int getChunkSize() { + return allocator.chunkSize; + } + + private class InnerAllocator extends PooledByteBufAllocator { + + + private final PoolArena[] directArenas; + private final MemoryStatusThread statusThread; + private final Histogram largeBuffersHist; + private final Histogram normalBuffersHist; + private final int chunkSize; + + public InnerAllocator() { + super(true); + + try { + Field f = PooledByteBufAllocator.class.getDeclaredField("directArenas"); + f.setAccessible(true); + this.directArenas = (PoolArena[]) f.get(this); + } catch (Exception e) { + throw new RuntimeException("Failure while initializing allocator. 
Unable to retrieve direct arenas field.", e); + } + + this.chunkSize = directArenas[0].chunkSize; + + if (memoryLogger.isTraceEnabled()) { + statusThread = new MemoryStatusThread(); + statusThread.start(); + } else { + statusThread = null; + } + removeOldMetrics(); + + registry.register(METRIC_PREFIX + "normal.size", new Gauge() { + @Override + public Long getValue() { + return normalBufferSize.get(); + } + }); + + registry.register(METRIC_PREFIX + "normal.count", new Gauge() { + @Override + public Long getValue() { + return normalBufferCount.get(); + } + }); + + registry.register(METRIC_PREFIX + "huge.size", new Gauge() { + @Override + public Long getValue() { + return hugeBufferSize.get(); + } + }); + + registry.register(METRIC_PREFIX + "huge.count", new Gauge() { + @Override + public Long getValue() { + return hugeBufferCount.get(); + } + }); + + largeBuffersHist = registry.histogram(METRIC_PREFIX + "huge.hist"); + normalBuffersHist = registry.histogram(METRIC_PREFIX + "normal.hist"); + + } + + + private synchronized void removeOldMetrics() { + registry.removeMatching(new MetricFilter() { + @Override + public boolean matches(String name, Metric metric) { + return name.startsWith("drill.allocator."); + } + + }); + } + + private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) { + PoolThreadCache cache = threadCache.get(); + PoolArena directArena = cache.directArena; + + if (directArena != null) { + + if (initialCapacity > directArena.chunkSize) { + // This is beyond chunk size so we'll allocate separately. + ByteBuf buf = UnpooledByteBufAllocator.DEFAULT.directBuffer(initialCapacity, maxCapacity); + + hugeBufferCount.incrementAndGet(); + hugeBufferSize.addAndGet(buf.capacity()); + largeBuffersHist.update(buf.capacity()); + // logger.debug("Allocating huge buffer of size {}", initialCapacity, new Exception()); + return new UnsafeDirectLittleEndian(new LargeBuffer(buf, hugeBufferSize, hugeBufferCount)); + + } else { + // within chunk, use arena. + ByteBuf buf = directArena.allocate(cache, initialCapacity, maxCapacity); + if (!(buf instanceof PooledUnsafeDirectByteBuf)) { + fail(); + } + + normalBuffersHist.update(buf.capacity()); + if (ASSERT_ENABLED) { + normalBufferSize.addAndGet(buf.capacity()); + normalBufferCount.incrementAndGet(); + } + + return new UnsafeDirectLittleEndian((PooledUnsafeDirectByteBuf) buf, normalBufferCount, + normalBufferSize); + } + + } else { + throw fail(); + } + } + + private UnsupportedOperationException fail() { + return new UnsupportedOperationException( + "Drill requries that the JVM used supports access sun.misc.Unsafe. 
This platform didn't provide that functionality."); + } + + public UnsafeDirectLittleEndian directBuffer(int initialCapacity, int maxCapacity) { + if (initialCapacity == 0 && maxCapacity == 0) { + newDirectBuffer(initialCapacity, maxCapacity); + } + validate(initialCapacity, maxCapacity); + return newDirectBufferL(initialCapacity, maxCapacity); + } + + @Override + public ByteBuf heapBuffer(int initialCapacity, int maxCapacity) { + throw new UnsupportedOperationException("Drill doesn't support using heap buffers."); + } + + + private void validate(int initialCapacity, int maxCapacity) { + if (initialCapacity < 0) { + throw new IllegalArgumentException("initialCapacity: " + initialCapacity + " (expectd: 0+)"); + } + if (initialCapacity > maxCapacity) { + throw new IllegalArgumentException(String.format( + "initialCapacity: %d (expected: not greater than maxCapacity(%d)", + initialCapacity, maxCapacity)); + } + } + + private class MemoryStatusThread extends Thread { + + public MemoryStatusThread() { + super("memory-status-logger"); + this.setDaemon(true); + this.setName("allocation.logger"); + } + + @Override + public void run() { + while (true) { + memoryLogger.trace("Memory Usage: \n{}", PooledByteBufAllocatorL.this.toString()); + try { + Thread.sleep(MEMORY_LOGGER_FREQUENCY_SECONDS * 1000); + } catch (InterruptedException e) { + return; + } + + } + } + + } + + public String toString() { + StringBuilder buf = new StringBuilder(); + buf.append(directArenas.length); + buf.append(" direct arena(s):"); + buf.append(StringUtil.NEWLINE); + for (PoolArena a : directArenas) { + buf.append(a); + } + + buf.append("Large buffers outstanding: "); + buf.append(hugeBufferCount.get()); + buf.append(" totaling "); + buf.append(hugeBufferSize.get()); + buf.append(" bytes."); + buf.append('\n'); + buf.append("Normal buffers outstanding: "); + buf.append(normalBufferCount.get()); + buf.append(" totaling "); + buf.append(normalBufferSize.get()); + buf.append(" bytes."); + return buf.toString(); + } + + + } + + public static final boolean ASSERT_ENABLED; + + static { + boolean isAssertEnabled = false; + assert isAssertEnabled = true; + ASSERT_ENABLED = isAssertEnabled; + } + +} diff --git a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java new file mode 100644 index 00000000000..6495d5d371e --- /dev/null +++ b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java @@ -0,0 +1,270 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package io.netty.buffer; + +import io.netty.util.internal.PlatformDependent; + +import java.nio.ByteOrder; +import java.util.concurrent.atomic.AtomicLong; + +/** + * The underlying class we use for little-endian access to memory. Is used underneath DrillBufs to abstract away the + * Netty classes and underlying Netty memory management. + */ +public final class UnsafeDirectLittleEndian extends WrappedByteBuf { + private static final boolean NATIVE_ORDER = ByteOrder.nativeOrder() == ByteOrder.LITTLE_ENDIAN; + private static final AtomicLong ID_GENERATOR = new AtomicLong(0); + + public final long id = ID_GENERATOR.incrementAndGet(); + private final AbstractByteBuf wrapped; + private final long memoryAddress; + + private final AtomicLong bufferCount; + private final AtomicLong bufferSize; + private final long initCap; + + UnsafeDirectLittleEndian(DuplicatedByteBuf buf) { + this(buf, true, null, null); + } + + UnsafeDirectLittleEndian(LargeBuffer buf) { + this(buf, true, null, null); + } + + UnsafeDirectLittleEndian(PooledUnsafeDirectByteBuf buf, AtomicLong bufferCount, AtomicLong bufferSize) { + this(buf, true, bufferCount, bufferSize); + + } + + private UnsafeDirectLittleEndian(AbstractByteBuf buf, boolean fake, AtomicLong bufferCount, AtomicLong bufferSize) { + super(buf); + if (!NATIVE_ORDER || buf.order() != ByteOrder.BIG_ENDIAN) { + throw new IllegalStateException("Drill only runs on LittleEndian systems."); + } + + this.bufferCount = bufferCount; + this.bufferSize = bufferSize; + + // initCap is used if we're tracking memory release. If we're in non-debug mode, we'll skip this. + this.initCap = ASSERT_ENABLED ? buf.capacity() : -1; + + this.wrapped = buf; + this.memoryAddress = buf.memoryAddress(); + } + private long addr(int index) { + return memoryAddress + index; + } + + @Override + public long getLong(int index) { +// wrapped.checkIndex(index, 8); + long v = PlatformDependent.getLong(addr(index)); + return v; + } + + @Override + public float getFloat(int index) { + return Float.intBitsToFloat(getInt(index)); + } + + @Override + public ByteBuf slice() { + return slice(this.readerIndex(), readableBytes()); + } + + @Override + public ByteBuf slice(int index, int length) { + return new SlicedByteBuf(this, index, length); + } + + @Override + public ByteOrder order() { + return ByteOrder.LITTLE_ENDIAN; + } + + @Override + public ByteBuf order(ByteOrder endianness) { + return this; + } + + @Override + public double getDouble(int index) { + return Double.longBitsToDouble(getLong(index)); + } + + @Override + public char getChar(int index) { + return (char) getShort(index); + } + + @Override + public long getUnsignedInt(int index) { + return getInt(index) & 0xFFFFFFFFL; + } + + @Override + public int getInt(int index) { + int v = PlatformDependent.getInt(addr(index)); + return v; + } + + @Override + public int getUnsignedShort(int index) { + return getShort(index) & 0xFFFF; + } + + @Override + public short getShort(int index) { + short v = PlatformDependent.getShort(addr(index)); + return v; + } + + @Override + public ByteBuf setShort(int index, int value) { + wrapped.checkIndex(index, 2); + _setShort(index, value); + return this; + } + + @Override + public ByteBuf setInt(int index, int value) { + wrapped.checkIndex(index, 4); + _setInt(index, value); + return this; + } + + @Override + public ByteBuf setLong(int index, long value) { + wrapped.checkIndex(index, 8); + _setLong(index, value); + return this; + } + + @Override + public ByteBuf setChar(int index, int value) { + 
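+ // a char is written as its two-byte value; delegate to setShort, which bounds-checks and then
+ // writes through the unsafe little-endian path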
setShort(index, value); + return this; + } + + @Override + public ByteBuf setFloat(int index, float value) { + setInt(index, Float.floatToRawIntBits(value)); + return this; + } + + @Override + public ByteBuf setDouble(int index, double value) { + setLong(index, Double.doubleToRawLongBits(value)); + return this; + } + + @Override + public ByteBuf writeShort(int value) { + wrapped.ensureWritable(2); + _setShort(wrapped.writerIndex, value); + wrapped.writerIndex += 2; + return this; + } + + @Override + public ByteBuf writeInt(int value) { + wrapped.ensureWritable(4); + _setInt(wrapped.writerIndex, value); + wrapped.writerIndex += 4; + return this; + } + + @Override + public ByteBuf writeLong(long value) { + wrapped.ensureWritable(8); + _setLong(wrapped.writerIndex, value); + wrapped.writerIndex += 8; + return this; + } + + @Override + public ByteBuf writeChar(int value) { + writeShort(value); + return this; + } + + @Override + public ByteBuf writeFloat(float value) { + writeInt(Float.floatToRawIntBits(value)); + return this; + } + + @Override + public ByteBuf writeDouble(double value) { + writeLong(Double.doubleToRawLongBits(value)); + return this; + } + + private void _setShort(int index, int value) { + PlatformDependent.putShort(addr(index), (short) value); + } + + private void _setInt(int index, int value) { + PlatformDependent.putInt(addr(index), value); + } + + private void _setLong(int index, long value) { + PlatformDependent.putLong(addr(index), value); + } + + @Override + public byte getByte(int index) { + return PlatformDependent.getByte(addr(index)); + } + + @Override + public ByteBuf setByte(int index, int value) { + PlatformDependent.putByte(addr(index), (byte) value); + return this; + } + + @Override + public boolean release() { + return release(1); + } + + @Override + public boolean release(int decrement) { + final boolean released = super.release(decrement); + if (ASSERT_ENABLED && released && bufferCount != null && bufferSize != null) { + bufferCount.decrementAndGet(); + bufferSize.addAndGet(-initCap); + } + return released; + } + + @Override + public int hashCode() { + return System.identityHashCode(this); + } + + public static final boolean ASSERT_ENABLED; + + static { + boolean isAssertEnabled = false; + assert isAssertEnabled = true; + ASSERT_ENABLED = isAssertEnabled; + } + +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java new file mode 100644 index 00000000000..dc75e5d7231 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.memory; + +import java.util.concurrent.atomic.AtomicLong; + +import javax.annotation.concurrent.ThreadSafe; + +import com.google.common.base.Preconditions; + +/** + * Provides a concurrent way to manage account for memory usage without locking. Used as basis for Allocators. All + * operations are threadsafe (except for close). + */ +@ThreadSafe +class Accountant implements AutoCloseable { + // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Accountant.class); + + /** + * The parent allocator + */ + protected final Accountant parent; + + /** + * The amount of memory reserved for this allocator. Releases below this amount of memory will not be returned to the + * parent Accountant until this Accountant is closed. + */ + protected final long reservation; + + private final AtomicLong peakAllocation = new AtomicLong(); + + /** + * Maximum local memory that can be held. This can be externally updated. Changing it won't cause past memory to + * change but will change responses to future allocation efforts + */ + private final AtomicLong allocationLimit = new AtomicLong(); + + /** + * Currently allocated amount of memory; + */ + private final AtomicLong locallyHeldMemory = new AtomicLong(); + + public Accountant(Accountant parent, long reservation, long maxAllocation) { + Preconditions.checkArgument(reservation >= 0, "The initial reservation size must be non-negative."); + Preconditions.checkArgument(maxAllocation >= 0, "The maximum allocation limit must be non-negative."); + Preconditions.checkArgument(reservation <= maxAllocation, + "The initial reservation size must be <= the maximum allocation."); + Preconditions.checkArgument(reservation == 0 || parent != null, "The root accountant can't reserve memory."); + + this.parent = parent; + this.reservation = reservation; + this.allocationLimit.set(maxAllocation); + + if (reservation != 0) { + // we will allocate a reservation from our parent. + final AllocationOutcome outcome = parent.allocateBytes(reservation); + if (!outcome.isOk()) { + throw new OutOfMemoryException(String.format( + "Failure trying to allocate initial reservation for Allocator. " + + "Attempted to allocate %d bytes and received an outcome of %s.", reservation, outcome.name())); + } + } + } + + /** + * Attempt to allocate the requested amount of memory. Either completely succeeds or completely fails. Constructs a a + * log of delta + * + * If it fails, no changes are made to accounting. + * + * @param size + * The amount of memory to reserve in bytes. + * @return True if the allocation was successful, false if the allocation failed. + */ + AllocationOutcome allocateBytes(long size) { + final AllocationOutcome outcome = allocate(size, true, false); + if (!outcome.isOk()) { + releaseBytes(size); + } + return outcome; + } + + private void updatePeak() { + final long currentMemory = locallyHeldMemory.get(); + while (true) { + + final long previousPeak = peakAllocation.get(); + if (currentMemory > previousPeak) { + if (!peakAllocation.compareAndSet(previousPeak, currentMemory)) { + // peak allocation changed underneath us. try again. + continue; + } + } + + // we either succeeded to set peak allocation or we weren't above the previous peak, exit. + return; + } + } + + + /** + * Increase the accounting. Returns whether the allocation fit within limits. + * + * @param size + * to increase + * @return Whether the allocation fit within limits. 
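+ *
+ * <p>For orientation, a hedged sketch of how accounting flows through a parent/child pair (the sizes
+ * are illustrative only):
+ * <pre>{@code
+ * Accountant root = new Accountant(null, 0, 1 << 20);      // the root cannot hold a reservation
+ * Accountant child = new Accountant(root, 1024, 1 << 16);  // reserves 1024 bytes from the root
+ * AllocationOutcome outcome = child.allocateBytes(4096);   // bytes beyond the reservation come from the root
+ * if (outcome.isOk()) {
+ *   child.releaseBytes(4096);  // memory above the reservation flows back up the tree
+ * }
+ * child.close();               // returns the 1024-byte reservation to the root
+ * }</pre>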
+ */ + boolean forceAllocate(long size) { + final AllocationOutcome outcome = allocate(size, true, true); + return outcome.isOk(); + } + + /** + * Internal method for allocation. This takes a forced approach to allocation to ensure that we manage reservation + * boundary issues consistently. Allocation is always done through the entire tree. The two options that we influence + * are whether the allocation should be forced and whether or not the peak memory allocation should be updated. If at + * some point during allocation escalation we determine that the allocation is no longer possible, we will continue to + * do a complete and consistent allocation but we will stop updating the peak allocation. We do this because we know + * that we will be directly unwinding this allocation (and thus never actually making the allocation). If force + * allocation is passed, then we continue to update the peak limits since we now know that this allocation will occur + * despite our moving past one or more limits. + * + * @param size + * The size of the allocation. + * @param incomingUpdatePeak + * Whether we should update the local peak for this allocation. + * @param forceAllocation + * Whether we should force the allocation. + * @return The outcome of the allocation. + */ + private AllocationOutcome allocate(final long size, final boolean incomingUpdatePeak, final boolean forceAllocation) { + final long newLocal = locallyHeldMemory.addAndGet(size); + final long beyondReservation = newLocal - reservation; + final boolean beyondLimit = newLocal > allocationLimit.get(); + final boolean updatePeak = forceAllocation || (incomingUpdatePeak && !beyondLimit); + + AllocationOutcome parentOutcome = AllocationOutcome.SUCCESS; + if (beyondReservation > 0 && parent != null) { + // we need to get memory from our parent. + final long parentRequest = Math.min(beyondReservation, size); + parentOutcome = parent.allocate(parentRequest, updatePeak, forceAllocation); + } + + final AllocationOutcome finalOutcome = beyondLimit ? AllocationOutcome.FAILED_LOCAL : + parentOutcome.ok ? AllocationOutcome.SUCCESS : AllocationOutcome.FAILED_PARENT; + + if (updatePeak) { + updatePeak(); + } + + return finalOutcome; + } + + public void releaseBytes(long size) { + // reduce local memory. all memory released above reservation should be released up the tree. + final long newSize = locallyHeldMemory.addAndGet(-size); + + Preconditions.checkArgument(newSize >= 0, "Accounted size went negative."); + + final long originalSize = newSize + size; + if(originalSize > reservation && parent != null){ + // we deallocated memory that we should release to our parent. + final long possibleAmountToReleaseToParent = originalSize - reservation; + final long actualToReleaseToParent = Math.min(size, possibleAmountToReleaseToParent); + parent.releaseBytes(actualToReleaseToParent); + } + + } + + /** + * Set the maximum amount of memory that can be allocated in the this Accountant before failing an allocation. + * + * @param newLimit + * The limit in bytes. + */ + public void setLimit(long newLimit) { + allocationLimit.set(newLimit); + } + + public boolean isOverLimit() { + return getAllocatedMemory() > getLimit() || (parent != null && parent.isOverLimit()); + } + + /** + * Close this Accountant. This will release any reservation bytes back to a parent Accountant. + */ + public void close() { + // return memory reservation to parent allocator. 
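+ // note: close() is the one Accountant operation that is not threadsafe (see the class javadoc), so
+ // callers must ensure no allocation or release is in flight when closing.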
+ if (parent != null) { + parent.releaseBytes(reservation); + } + } + + /** + * Return the current limit of this Accountant. + * + * @return Limit in bytes. + */ + public long getLimit() { + return allocationLimit.get(); + } + + /** + * Return the current amount of allocated memory that this Accountant is managing accounting for. Note this does not + * include reservation memory that hasn't been allocated. + * + * @return Currently allocate memory in bytes. + */ + public long getAllocatedMemory() { + return locallyHeldMemory.get(); + } + + /** + * The peak memory allocated by this Accountant. + * + * @return The peak allocated memory in bytes. + */ + public long getPeakMemoryAllocation() { + return peakAllocation.get(); + } + + /** + * Describes the type of outcome that occurred when trying to account for allocation of memory. + */ + public static enum AllocationOutcome { + + /** + * Allocation succeeded. + */ + SUCCESS(true), + + /** + * Allocation succeeded but only because the allocator was forced to move beyond a limit. + */ + FORCED_SUCESS(true), + + /** + * Allocation failed because the local allocator's limits were exceeded. + */ + FAILED_LOCAL(false), + + /** + * Allocation failed because a parent allocator's limits were exceeded. + */ + FAILED_PARENT(false); + + private final boolean ok; + + AllocationOutcome(boolean ok) { + this.ok = ok; + } + + public boolean isOk() { + return ok; + } + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java new file mode 100644 index 00000000000..0db61443266 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -0,0 +1,433 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +import static org.apache.arrow.memory.BaseAllocator.indent; +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.PooledByteBufAllocatorL; +import io.netty.buffer.UnsafeDirectLittleEndian; + +import java.util.IdentityHashMap; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.locks.ReadWriteLock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + +import org.apache.arrow.memory.BaseAllocator.Verbosity; +import org.apache.arrow.memory.util.AutoCloseableLock; +import org.apache.arrow.memory.util.HistoricalLog; +import org.apache.arrow.memory.util.Metrics; + +import com.google.common.base.Preconditions; + +/** + * Manages the relationship between one or more allocators and a particular UDLE. Ensures that one allocator owns the + * memory that multiple allocators may be referencing. Manages a BufferLedger between each of its associated allocators. 
+ * This class is also responsible for managing when memory is allocated and returned to the Netty-based + * PooledByteBufAllocatorL. + * + * The only reason that this isn't package private is we're forced to put DrillBuf in Netty's package which need access + * to these objects or methods. + * + * Threading: AllocationManager manages thread-safety internally. Operations within the context of a single BufferLedger + * are lockless in nature and can be leveraged by multiple threads. Operations that cross the context of two ledgers + * will acquire a lock on the AllocationManager instance. Important note, there is one AllocationManager per + * UnsafeDirectLittleEndian buffer allocation. As such, there will be thousands of these in a typical query. The + * contention of acquiring a lock on AllocationManager should be very low. + * + */ +public class AllocationManager { + // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AllocationManager.class); + + private static final AtomicLong MANAGER_ID_GENERATOR = new AtomicLong(0); + private static final AtomicLong LEDGER_ID_GENERATOR = new AtomicLong(0); + static final PooledByteBufAllocatorL INNER_ALLOCATOR = new PooledByteBufAllocatorL(Metrics.getInstance()); + + private final RootAllocator root; + private final long allocatorManagerId = MANAGER_ID_GENERATOR.incrementAndGet(); + private final int size; + private final UnsafeDirectLittleEndian underlying; + private final IdentityHashMap map = new IdentityHashMap<>(); + private final ReadWriteLock lock = new ReentrantReadWriteLock(); + private final AutoCloseableLock readLock = new AutoCloseableLock(lock.readLock()); + private final AutoCloseableLock writeLock = new AutoCloseableLock(lock.writeLock()); + private final long amCreationTime = System.nanoTime(); + + private volatile BufferLedger owningLedger; + private volatile long amDestructionTime = 0; + + AllocationManager(BaseAllocator accountingAllocator, int size) { + Preconditions.checkNotNull(accountingAllocator); + accountingAllocator.assertOpen(); + + this.root = accountingAllocator.root; + this.underlying = INNER_ALLOCATOR.allocate(size); + + // we do a no retain association since our creator will want to retrieve the newly created ledger and will create a + // reference count at that point + this.owningLedger = associate(accountingAllocator, false); + this.size = underlying.capacity(); + } + + /** + * Associate the existing underlying buffer with a new allocator. This will increase the reference count to the + * provided ledger by 1. + * @param allocator + * The target allocator to associate this buffer with. + * @return The Ledger (new or existing) that associates the underlying buffer to this new ledger. + */ + BufferLedger associate(final BaseAllocator allocator) { + return associate(allocator, true); + } + + private BufferLedger associate(final BaseAllocator allocator, final boolean retain) { + allocator.assertOpen(); + + if (root != allocator.root) { + throw new IllegalStateException( + "A buffer can only be associated between two allocators that share the same root."); + } + + try (AutoCloseableLock read = readLock.open()) { + + final BufferLedger ledger = map.get(allocator); + if (ledger != null) { + if (retain) { + ledger.inc(); + } + return ledger; + } + + } + try (AutoCloseableLock write = writeLock.open()) { + // we have to recheck existing ledger since a second reader => writer could be competing with us. 
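+ // (classic double-checked pattern: the optimistic lookup under the read lock above may have raced with
+ // another associate() call, so re-validate under the exclusive write lock before mutating the map)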
+ + final BufferLedger existingLedger = map.get(allocator); + if (existingLedger != null) { + if (retain) { + existingLedger.inc(); + } + return existingLedger; + } + + final BufferLedger ledger = new BufferLedger(allocator, new ReleaseListener(allocator)); + if (retain) { + ledger.inc(); + } + BufferLedger oldLedger = map.put(allocator, ledger); + Preconditions.checkArgument(oldLedger == null); + allocator.associateLedger(ledger); + return ledger; + } + } + + + /** + * The way that a particular BufferLedger communicates back to the AllocationManager that it now longer needs to hold + * a reference to particular piece of memory. + */ + private class ReleaseListener { + + private final BufferAllocator allocator; + + public ReleaseListener(BufferAllocator allocator) { + this.allocator = allocator; + } + + /** + * Can only be called when you already hold the writeLock. + */ + public void release() { + allocator.assertOpen(); + + final BufferLedger oldLedger = map.remove(allocator); + oldLedger.allocator.dissociateLedger(oldLedger); + + if (oldLedger == owningLedger) { + if (map.isEmpty()) { + // no one else owns, lets release. + oldLedger.allocator.releaseBytes(size); + underlying.release(); + amDestructionTime = System.nanoTime(); + owningLedger = null; + } else { + // we need to change the owning allocator. we've been removed so we'll get whatever is top of list + BufferLedger newLedger = map.values().iterator().next(); + + // we'll forcefully transfer the ownership and not worry about whether we exceeded the limit + // since this consumer can't do anything with this. + oldLedger.transferBalance(newLedger); + } + } else { + if (map.isEmpty()) { + throw new IllegalStateException("The final removal of a ledger should be connected to the owning ledger."); + } + } + + + } + } + + /** + * The reference manager that binds an allocator manager to a particular BaseAllocator. Also responsible for creating + * a set of DrillBufs that share a common fate and set of reference counts. + * As with AllocationManager, the only reason this is public is due to DrillBuf being in io.netty.buffer package. + */ + public class BufferLedger { + + private final IdentityHashMap buffers = + BaseAllocator.DEBUG ? new IdentityHashMap() : null; + + private final long ledgerId = LEDGER_ID_GENERATOR.incrementAndGet(); // unique ID assigned to each ledger + private final AtomicInteger bufRefCnt = new AtomicInteger(0); // start at zero so we can manage request for retain + // correctly + private final long lCreationTime = System.nanoTime(); + private volatile long lDestructionTime = 0; + private final BaseAllocator allocator; + private final ReleaseListener listener; + private final HistoricalLog historicalLog = BaseAllocator.DEBUG ? new HistoricalLog(BaseAllocator.DEBUG_LOG_LENGTH, + "BufferLedger[%d]", 1) + : null; + + private BufferLedger(BaseAllocator allocator, ReleaseListener listener) { + this.allocator = allocator; + this.listener = listener; + } + + /** + * Transfer any balance the current ledger has to the target ledger. In the case that the current ledger holds no + * memory, no transfer is made to the new ledger. + * @param target + * The ledger to transfer ownership account to. + * @return Whether transfer fit within target ledgers limits. 
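+ *
+ * <p>A hedged sketch; {@code thisLedger} and {@code otherAllocator} stand in for an owning ledger and a
+ * second allocator under the same root:
+ * <pre>{@code
+ * BufferLedger target = thisLedger.getLedgerForAllocator(otherAllocator);
+ * boolean fit = thisLedger.transferBalance(target);  // target's allocator now accounts for the memory
+ * }</pre>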
+ */ + public boolean transferBalance(final BufferLedger target) { + Preconditions.checkNotNull(target); + Preconditions.checkArgument(allocator.root == target.allocator.root, + "You can only transfer between two allocators that share the same root."); + allocator.assertOpen(); + + target.allocator.assertOpen(); + // if we're transferring to ourself, just return. + if (target == this) { + return true; + } + + // since two balance transfers out from the allocator manager could cause incorrect accounting, we need to ensure + // that this won't happen by synchronizing on the allocator manager instance. + try (AutoCloseableLock write = writeLock.open()) { + if (owningLedger != this) { + return true; + } + + if (BaseAllocator.DEBUG) { + this.historicalLog.recordEvent("transferBalance(%s)", target.allocator.name); + target.historicalLog.recordEvent("incoming(from %s)", owningLedger.allocator.name); + } + + boolean overlimit = target.allocator.forceAllocate(size); + allocator.releaseBytes(size); + owningLedger = target; + return overlimit; + } + + } + + /** + * Print the current ledger state to a the provided StringBuilder. + * @param sb + * The StringBuilder to populate. + * @param indent + * The level of indentation to position the data. + * @param verbosity + * The level of verbosity to print. + */ + public void print(StringBuilder sb, int indent, Verbosity verbosity) { + indent(sb, indent) + .append("ledger[") + .append(ledgerId) + .append("] allocator: ") + .append(allocator.name) + .append("), isOwning: ") + .append(owningLedger == this) + .append(", size: ") + .append(size) + .append(", references: ") + .append(bufRefCnt.get()) + .append(", life: ") + .append(lCreationTime) + .append("..") + .append(lDestructionTime) + .append(", allocatorManager: [") + .append(AllocationManager.this.allocatorManagerId) + .append(", life: ") + .append(amCreationTime) + .append("..") + .append(amDestructionTime); + + if (!BaseAllocator.DEBUG) { + sb.append("]\n"); + } else { + synchronized (buffers) { + sb.append("] holds ") + .append(buffers.size()) + .append(" buffers. \n"); + for (ArrowBuf buf : buffers.keySet()) { + buf.print(sb, indent + 2, verbosity); + sb.append('\n'); + } + } + } + + } + + private void inc() { + bufRefCnt.incrementAndGet(); + } + + /** + * Decrement the ledger's reference count. If the ledger is decremented to zero, this ledger should release its + * ownership back to the AllocationManager + */ + public int decrement(int decrement) { + allocator.assertOpen(); + + final int outcome; + try (AutoCloseableLock write = writeLock.open()) { + outcome = bufRefCnt.addAndGet(-decrement); + if (outcome == 0) { + lDestructionTime = System.nanoTime(); + listener.release(); + } + } + + return outcome; + } + + /** + * Returns the ledger associated with a particular BufferAllocator. If the BufferAllocator doesn't currently have a + * ledger associated with this AllocationManager, a new one is created. This is placed on BufferLedger rather than + * AllocationManager directly because DrillBufs don't have access to AllocationManager and they are the ones + * responsible for exposing the ability to associate multiple allocators with a particular piece of underlying + * memory. Note that this will increment the reference count of this ledger by one to ensure the ledger isn't + * destroyed before use. 
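+ *
+ * <p>Illustrative sharing pattern; {@code otherAllocator} stands in for a second allocator under the
+ * same root:
+ * <pre>{@code
+ * BufferLedger shared = ledger.getLedgerForAllocator(otherAllocator); // reference count incremented
+ * ArrowBuf view = shared.newDrillBuf(0, shared.getSize());            // another view over the same memory
+ * }</pre>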
+ * + * @param allocator + * @return + */ + public BufferLedger getLedgerForAllocator(BufferAllocator allocator) { + return associate((BaseAllocator) allocator); + } + + /** + * Create a new DrillBuf associated with this AllocationManager and memory. Does not impact reference count. + * Typically used for slicing. + * @param offset + * The offset in bytes to start this new DrillBuf. + * @param length + * The length in bytes that this DrillBuf will provide access to. + * @return A new DrillBuf that shares references with all DrillBufs associated with this BufferLedger + */ + public ArrowBuf newDrillBuf(int offset, int length) { + allocator.assertOpen(); + return newDrillBuf(offset, length, null); + } + + /** + * Create a new DrillBuf associated with this AllocationManager and memory. + * @param offset + * The offset in bytes to start this new DrillBuf. + * @param length + * The length in bytes that this DrillBuf will provide access to. + * @param manager + * An optional BufferManager argument that can be used to manage expansion of this DrillBuf + * @param retain + * Whether or not the newly created buffer should get an additional reference count added to it. + * @return A new DrillBuf that shares references with all DrillBufs associated with this BufferLedger + */ + public ArrowBuf newDrillBuf(int offset, int length, BufferManager manager) { + allocator.assertOpen(); + + final ArrowBuf buf = new ArrowBuf( + bufRefCnt, + this, + underlying, + manager, + allocator.getAsByteBufAllocator(), + offset, + length, + false); + + if (BaseAllocator.DEBUG) { + historicalLog.recordEvent( + "DrillBuf(BufferLedger, BufferAllocator[%s], UnsafeDirectLittleEndian[identityHashCode == " + + "%d](%s)) => ledger hc == %d", + allocator.name, System.identityHashCode(buf), buf.toString(), + System.identityHashCode(this)); + + synchronized (buffers) { + buffers.put(buf, null); + } + } + + return buf; + + } + + /** + * What is the total size (in bytes) of memory underlying this ledger. + * + * @return Size in bytes + */ + public int getSize() { + return size; + } + + /** + * How much memory is accounted for by this ledger. This is either getSize() if this is the owning ledger for the + * memory or zero in the case that this is not the owning ledger associated with this memory. + * + * @return Amount of accounted(owned) memory associated with this ledger. + */ + public int getAccountedSize() { + try (AutoCloseableLock read = readLock.open()) { + if (owningLedger == this) { + return size; + } else { + return 0; + } + } + } + + /** + * Package visible for debugging/verification only. + */ + UnsafeDirectLittleEndian getUnderlying() { + return underlying; + } + + /** + * Package visible for debugging/verification only. + */ + boolean isOwningLedger() { + return this == owningLedger; + } + + } + +} \ No newline at end of file diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationReservation.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationReservation.java new file mode 100644 index 00000000000..68d1244d1e3 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationReservation.java @@ -0,0 +1,86 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +import io.netty.buffer.ArrowBuf; + +/** + * Supports cumulative allocation reservation. Clients may increase the size of the reservation repeatedly until they + * call for an allocation of the current total size. The reservation can only be used once, and will throw an exception + * if it is used more than once. + *
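+ * <p>A hedged end-to-end sketch; {@code allocator.newReservation()} is assumed to be the factory that
+ * implementations expose:
+ * <pre>{@code
+ * try (AllocationReservation res = allocator.newReservation()) {
+ *   if (res.add(1024) && res.add(512)) {
+ *     ArrowBuf buf = res.allocateBuffer();  // one contiguous buffer covering the whole reservation
+ *     buf.release();                        // release the buffer when done; the reservation itself is
+ *   }                                       // closed by try-with-resources whether used or not
+ * }
+ * }</pre>
+ *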
<p>
+ * For the purposes of airtight memory accounting, the reservation must be close()d whether it is used or not. + * This is not threadsafe. + */ +public interface AllocationReservation extends AutoCloseable { + + /** + * Add to the current reservation. + * + *
<p>
Adding may fail if the allocator is not allowed to consume any more space. + * + * @param nBytes the number of bytes to add + * @return true if the addition is possible, false otherwise + * @throws IllegalStateException if called after buffer() is used to allocate the reservation + */ + boolean add(final int nBytes); + + /** + * Requests a reservation of additional space. + * + *
<p>
The implementation of the allocator's inner class provides this. + * + * @param nBytes the amount to reserve + * @return true if the reservation can be satisfied, false otherwise + */ + boolean reserve(int nBytes); + + /** + * Allocate a buffer whose size is the total of all the add()s made. + * + *
<p>
The allocation request can still fail, even if the amount of space + * requested is available, if the allocation cannot be made contiguously. + * + * @return the buffer, or null, if the request cannot be satisfied + * @throws IllegalStateException if called called more than once + */ + ArrowBuf allocateBuffer(); + + /** + * Get the current size of the reservation (the sum of all the add()s). + * + * @return size of the current reservation + */ + int getSize(); + + /** + * Return whether or not the reservation has been used. + * + * @return whether or not the reservation has been used + */ + public boolean isUsed(); + + /** + * Return whether or not the reservation has been closed. + * + * @return whether or not the reservation has been closed + */ + public boolean isClosed(); + + public void close(); +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java new file mode 100644 index 00000000000..566457981c7 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +/** + * Exception thrown when a closed BufferAllocator is used. Note + * this is an unchecked exception. + * + * @param message string associated with the cause + */ +@SuppressWarnings("serial") +public class AllocatorClosedException extends RuntimeException { + public AllocatorClosedException(String message) { + super(message); + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java new file mode 100644 index 00000000000..72f77ab0c7b --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -0,0 +1,781 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.memory; + +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.ByteBufAllocator; +import io.netty.buffer.UnsafeDirectLittleEndian; + +import java.util.Arrays; +import java.util.IdentityHashMap; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.arrow.memory.AllocationManager.BufferLedger; +import org.apache.arrow.memory.util.AssertionUtil; +import org.apache.arrow.memory.util.HistoricalLog; + +import com.google.common.base.Preconditions; + +public abstract class BaseAllocator extends Accountant implements BufferAllocator { + private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BaseAllocator.class); + + public static final String DEBUG_ALLOCATOR = "arrow.memory.debug.allocator"; + + private static final AtomicLong ID_GENERATOR = new AtomicLong(0); + private static final int CHUNK_SIZE = AllocationManager.INNER_ALLOCATOR.getChunkSize(); + + public static final int DEBUG_LOG_LENGTH = 6; + public static final boolean DEBUG = AssertionUtil.isAssertionsEnabled() + || Boolean.parseBoolean(System.getProperty(DEBUG_ALLOCATOR, "false")); + private final Object DEBUG_LOCK = DEBUG ? new Object() : null; + + private final BaseAllocator parentAllocator; + private final ByteBufAllocator thisAsByteBufAllocator; + private final IdentityHashMap childAllocators; + private final ArrowBuf empty; + + private volatile boolean isClosed = false; // the allocator has been closed + + // Package exposed for sharing between AllocatorManger and BaseAllocator objects + final String name; + final RootAllocator root; + + // members used purely for debugging + private final IdentityHashMap childLedgers; + private final IdentityHashMap reservations; + private final HistoricalLog historicalLog; + + protected BaseAllocator( + final BaseAllocator parentAllocator, + final String name, + final long initReservation, + final long maxAllocation) throws OutOfMemoryException { + super(parentAllocator, initReservation, maxAllocation); + + if (parentAllocator != null) { + this.root = parentAllocator.root; + empty = parentAllocator.empty; + } else if (this instanceof RootAllocator) { + this.root = (RootAllocator) this; + empty = createEmpty(); + } else { + throw new IllegalStateException("An parent allocator must either carry a root or be the root."); + } + + this.parentAllocator = parentAllocator; + this.name = name; + + this.thisAsByteBufAllocator = new DrillByteBufAllocator(this); + + if (DEBUG) { + childAllocators = new IdentityHashMap<>(); + reservations = new IdentityHashMap<>(); + childLedgers = new IdentityHashMap<>(); + historicalLog = new HistoricalLog(DEBUG_LOG_LENGTH, "allocator[%s]", name); + hist("created by \"%s\", owned = %d", name, this.getAllocatedMemory()); + } else { + childAllocators = null; + reservations = null; + historicalLog = null; + childLedgers = null; + } + + } + + public void assertOpen() { + if (AssertionUtil.ASSERT_ENABLED) { + if (isClosed) { + throw new IllegalStateException("Attempting operation on allocator when allocator is closed.\n" + + toVerboseString()); + } + } + } + + @Override + public String getName() { + return name; + } + + @Override + public ArrowBuf getEmpty() { + assertOpen(); + return empty; + } + + /** + * For debug/verification purposes only. Allows an AllocationManager to tell the allocator that we have a new ledger + * associated with this allocator. 
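+ * (The childLedgers map is only maintained when DEBUG is enabled.)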
+ */ + void associateLedger(BufferLedger ledger) { + assertOpen(); + if (DEBUG) { + synchronized (DEBUG_LOCK) { + childLedgers.put(ledger, null); + } + } + } + + /** + * For debug/verification purposes only. Allows an AllocationManager to tell the allocator that we are removing a + * ledger associated with this allocator + */ + void dissociateLedger(BufferLedger ledger) { + assertOpen(); + if (DEBUG) { + synchronized (DEBUG_LOCK) { + if (!childLedgers.containsKey(ledger)) { + throw new IllegalStateException("Trying to remove a child ledger that doesn't exist."); + } + childLedgers.remove(ledger); + } + } + } + + /** + * Track when a ChildAllocator of this BaseAllocator is closed. Used for debugging purposes. + * + * @param childAllocator + * The child allocator that has been closed. + */ + private void childClosed(final BaseAllocator childAllocator) { + assertOpen(); + + if (DEBUG) { + Preconditions.checkArgument(childAllocator != null, "child allocator can't be null"); + + synchronized (DEBUG_LOCK) { + final Object object = childAllocators.remove(childAllocator); + if (object == null) { + childAllocator.historicalLog.logHistory(logger); + throw new IllegalStateException("Child allocator[" + childAllocator.name + + "] not found in parent allocator[" + name + "]'s childAllocators"); + } + } + } + } + + private static String createErrorMsg(final BufferAllocator allocator, final int rounded, final int requested) { + if (rounded != requested) { + return String.format( + "Unable to allocate buffer of size %d (rounded from %d) due to memory limit. Current allocation: %d", + rounded, requested, allocator.getAllocatedMemory()); + } else { + return String.format("Unable to allocate buffer of size %d due to memory limit. Current allocation: %d", + rounded, allocator.getAllocatedMemory()); + } + } + + @Override + public ArrowBuf buffer(final int initialRequestSize) { + assertOpen(); + + return buffer(initialRequestSize, null); + } + + private ArrowBuf createEmpty(){ + assertOpen(); + + return new ArrowBuf(new AtomicInteger(), null, AllocationManager.INNER_ALLOCATOR.empty, null, null, 0, 0, true); + } + + @Override + public ArrowBuf buffer(final int initialRequestSize, BufferManager manager) { + assertOpen(); + + Preconditions.checkArgument(initialRequestSize >= 0, "the requested size must be non-negative"); + + if (initialRequestSize == 0) { + return empty; + } + + // round to next largest power of two if we're within a chunk since that is how our allocator operates + final int actualRequestSize = initialRequestSize < CHUNK_SIZE ? + nextPowerOfTwo(initialRequestSize) + : initialRequestSize; + AllocationOutcome outcome = this.allocateBytes(actualRequestSize); + if (!outcome.isOk()) { + throw new OutOfMemoryException(createErrorMsg(this, actualRequestSize, initialRequestSize)); + } + + boolean success = false; + try { + ArrowBuf buffer = bufferWithoutReservation(actualRequestSize, manager); + success = true; + return buffer; + } finally { + if (!success) { + releaseBytes(actualRequestSize); + } + } + + } + + /** + * Used by usual allocation as well as for allocating a pre-reserved buffer. Skips the typical accounting associated + * with creating a new buffer. 
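+ *
+ * <p>For orientation: buffer(size) performs allocateBytes(size) first and then delegates here, while a
+ * Reservation has already accounted for its bytes via reserve() by the time this is reached. A hedged
+ * sketch of the usual external flow (the names are hypothetical):
+ * <pre>{@code
+ * BufferAllocator child = root.newChildAllocator("example", 0, 1 << 20);
+ * ArrowBuf buf = child.buffer(4096);  // accounted against child (and, beyond any reservation, its parents)
+ * buf.release();
+ * child.close();
+ * }</pre>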
+ */ + private ArrowBuf bufferWithoutReservation(final int size, BufferManager bufferManager) throws OutOfMemoryException { + assertOpen(); + + final AllocationManager manager = new AllocationManager(this, size); + final BufferLedger ledger = manager.associate(this); // +1 ref cnt (required) + final ArrowBuf buffer = ledger.newDrillBuf(0, size, bufferManager); + + // make sure that our allocation is equal to what we expected. + Preconditions.checkArgument(buffer.capacity() == size, + "Allocated capacity %d was not equal to requested capacity %d.", buffer.capacity(), size); + + return buffer; + } + + @Override + public ByteBufAllocator getAsByteBufAllocator() { + return thisAsByteBufAllocator; + } + + @Override + public BufferAllocator newChildAllocator( + final String name, + final long initReservation, + final long maxAllocation) { + assertOpen(); + + final ChildAllocator childAllocator = new ChildAllocator(this, name, initReservation, maxAllocation); + + if (DEBUG) { + synchronized (DEBUG_LOCK) { + childAllocators.put(childAllocator, childAllocator); + historicalLog.recordEvent("allocator[%s] created new child allocator[%s]", name, childAllocator.name); + } + } + + return childAllocator; + } + + public class Reservation implements AllocationReservation { + private int nBytes = 0; + private boolean used = false; + private boolean closed = false; + private final HistoricalLog historicalLog; + + public Reservation() { + if (DEBUG) { + historicalLog = new HistoricalLog("Reservation[allocator[%s], %d]", name, System.identityHashCode(this)); + historicalLog.recordEvent("created"); + synchronized (DEBUG_LOCK) { + reservations.put(this, this); + } + } else { + historicalLog = null; + } + } + + public boolean add(final int nBytes) { + assertOpen(); + + Preconditions.checkArgument(nBytes >= 0, "nBytes(%d) < 0", nBytes); + Preconditions.checkState(!closed, "Attempt to increase reservation after reservation has been closed"); + Preconditions.checkState(!used, "Attempt to increase reservation after reservation has been used"); + + // we round up to next power of two since all reservations are done in powers of two. This may overestimate the + // preallocation since someone may perceive additions to be power of two. If this becomes a problem, we can look + // at + // modifying this behavior so that we maintain what we reserve and what the user asked for and make sure to only + // round to power of two as necessary. 
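+    // Worked example (illustrative numbers only): add(10) rounds to nextPowerOfTwo(10) == 16 and reserves 16 bytes;
+    // a second add(10) reserves another 16, leaving nBytes at 32 even though only 20 bytes were requested.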
+ final int nBytesTwo = BaseAllocator.nextPowerOfTwo(nBytes); + if (!reserve(nBytesTwo)) { + return false; + } + + this.nBytes += nBytesTwo; + return true; + } + + public ArrowBuf allocateBuffer() { + assertOpen(); + + Preconditions.checkState(!closed, "Attempt to allocate after closed"); + Preconditions.checkState(!used, "Attempt to allocate more than once"); + + final ArrowBuf drillBuf = allocate(nBytes); + used = true; + return drillBuf; + } + + public int getSize() { + return nBytes; + } + + public boolean isUsed() { + return used; + } + + public boolean isClosed() { + return closed; + } + + @Override + public void close() { + assertOpen(); + + if (closed) { + return; + } + + if (DEBUG) { + if (!isClosed()) { + final Object object; + synchronized (DEBUG_LOCK) { + object = reservations.remove(this); + } + if (object == null) { + final StringBuilder sb = new StringBuilder(); + print(sb, 0, Verbosity.LOG_WITH_STACKTRACE); + logger.debug(sb.toString()); + throw new IllegalStateException( + String.format("Didn't find closing reservation[%d]", System.identityHashCode(this))); + } + + historicalLog.recordEvent("closed"); + } + } + + if (!used) { + releaseReservation(nBytes); + } + + closed = true; + } + + public boolean reserve(int nBytes) { + assertOpen(); + + final AllocationOutcome outcome = BaseAllocator.this.allocateBytes(nBytes); + + if (DEBUG) { + historicalLog.recordEvent("reserve(%d) => %s", nBytes, Boolean.toString(outcome.isOk())); + } + + return outcome.isOk(); + } + + /** + * Allocate the a buffer of the requested size. + * + *
+   * <p>
+ * The implementation of the allocator's inner class provides this. + * + * @param nBytes + * the size of the buffer requested + * @return the buffer, or null, if the request cannot be satisfied + */ + private ArrowBuf allocate(int nBytes) { + assertOpen(); + + boolean success = false; + + /* + * The reservation already added the requested bytes to the allocators owned and allocated bytes via reserve(). + * This ensures that they can't go away. But when we ask for the buffer here, that will add to the allocated bytes + * as well, so we need to return the same number back to avoid double-counting them. + */ + try { + final ArrowBuf drillBuf = BaseAllocator.this.bufferWithoutReservation(nBytes, null); + + if (DEBUG) { + historicalLog.recordEvent("allocate() => %s", String.format("DrillBuf[%d]", drillBuf.getId())); + } + success = true; + return drillBuf; + } finally { + if (!success) { + releaseBytes(nBytes); + } + } + } + + /** + * Return the reservation back to the allocator without having used it. + * + * @param nBytes + * the size of the reservation + */ + private void releaseReservation(int nBytes) { + assertOpen(); + + releaseBytes(nBytes); + + if (DEBUG) { + historicalLog.recordEvent("releaseReservation(%d)", nBytes); + } + } + + } + + @Override + public AllocationReservation newReservation() { + assertOpen(); + + return new Reservation(); + } + + + @Override + public synchronized void close() { + /* + * Some owners may close more than once because of complex cleanup and shutdown + * procedures. + */ + if (isClosed) { + return; + } + + isClosed = true; + + if (DEBUG) { + synchronized(DEBUG_LOCK) { + verifyAllocator(); + + // are there outstanding child allocators? + if (!childAllocators.isEmpty()) { + for (final BaseAllocator childAllocator : childAllocators.keySet()) { + if (childAllocator.isClosed) { + logger.warn(String.format( + "Closed child allocator[%s] on parent allocator[%s]'s child list.\n%s", + childAllocator.name, name, toString())); + } + } + + throw new IllegalStateException( + String.format("Allocator[%s] closed with outstanding child allocators.\n%s", name, toString())); + } + + // are there outstanding buffers? + final int allocatedCount = childLedgers.size(); + if (allocatedCount > 0) { + throw new IllegalStateException( + String.format("Allocator[%s] closed with outstanding buffers allocated (%d).\n%s", + name, allocatedCount, toString())); + } + + if (reservations.size() != 0) { + throw new IllegalStateException( + String.format("Allocator[%s] closed with outstanding reservations (%d).\n%s", name, reservations.size(), + toString())); + } + + } + } + + // Is there unaccounted-for outstanding allocation? + final long allocated = getAllocatedMemory(); + if (allocated > 0) { + throw new IllegalStateException( + String.format("Memory was leaked by query. Memory leaked: (%d)\n%s", allocated, toString())); + } + + // we need to release our memory to our parent before we tell it we've closed. + super.close(); + + // Inform our parent allocator that we've closed + if (parentAllocator != null) { + parentAllocator.childClosed(this); + } + + if (DEBUG) { + historicalLog.recordEvent("closed"); + logger.debug(String.format( + "closed allocator[%s].", + name)); + } + + + } + + public String toString() { + final Verbosity verbosity = logger.isTraceEnabled() ? Verbosity.LOG_WITH_STACKTRACE + : Verbosity.BASIC; + final StringBuilder sb = new StringBuilder(); + print(sb, 0, verbosity); + return sb.toString(); + } + + /** + * Provide a verbose string of the current allocator state. 
Includes the state of all child allocators, along with + * historical logs of each object and including stacktraces. + * + * @return A Verbose string of current allocator state. + */ + public String toVerboseString() { + final StringBuilder sb = new StringBuilder(); + print(sb, 0, Verbosity.LOG_WITH_STACKTRACE); + return sb.toString(); + } + + private void hist(String noteFormat, Object... args) { + historicalLog.recordEvent(noteFormat, args); + } + + /** + * Rounds up the provided value to the nearest power of two. + * + * @param val + * An integer value. + * @return The closest power of two of that value. + */ + static int nextPowerOfTwo(int val) { + int highestBit = Integer.highestOneBit(val); + if (highestBit == val) { + return val; + } else { + return highestBit << 1; + } + } + + + /** + * Verifies the accounting state of the allocator. Only works for DEBUG. + * + * @throws IllegalStateException + * when any problems are found + */ + void verifyAllocator() { + final IdentityHashMap buffersSeen = new IdentityHashMap<>(); + verifyAllocator(buffersSeen); + } + + /** + * Verifies the accounting state of the allocator. Only works for DEBUG. + * + *
+   * <p>This overload is used for recursive calls, allowing for checking that ArrowBufs are unique across all allocators
+   * that are checked.</p>
+ * + * @param buffersSeen + * a map of buffers that have already been seen when walking a tree of allocators + * @throws IllegalStateException + * when any problems are found + */ + private void verifyAllocator(final IdentityHashMap buffersSeen) { + synchronized (DEBUG_LOCK) { + + // The remaining tests can only be performed if we're in debug mode. + if (!DEBUG) { + return; + } + + final long allocated = getAllocatedMemory(); + + // verify my direct descendants + final Set childSet = childAllocators.keySet(); + for (final BaseAllocator childAllocator : childSet) { + childAllocator.verifyAllocator(buffersSeen); + } + + /* + * Verify my relationships with my descendants. + * + * The sum of direct child allocators' owned memory must be <= my allocated memory; my allocated memory also + * includes DrillBuf's directly allocated by me. + */ + long childTotal = 0; + for (final BaseAllocator childAllocator : childSet) { + childTotal += Math.max(childAllocator.getAllocatedMemory(), childAllocator.reservation); + } + if (childTotal > getAllocatedMemory()) { + historicalLog.logHistory(logger); + logger.debug("allocator[" + name + "] child event logs BEGIN"); + for (final BaseAllocator childAllocator : childSet) { + childAllocator.historicalLog.logHistory(logger); + } + logger.debug("allocator[" + name + "] child event logs END"); + throw new IllegalStateException( + "Child allocators own more memory (" + childTotal + ") than their parent (name = " + + name + " ) has allocated (" + getAllocatedMemory() + ')'); + } + + // Furthermore, the amount I've allocated should be that plus buffers I've allocated. + long bufferTotal = 0; + + final Set ledgerSet = childLedgers.keySet(); + for (final BufferLedger ledger : ledgerSet) { + if (!ledger.isOwningLedger()) { + continue; + } + + final UnsafeDirectLittleEndian udle = ledger.getUnderlying(); + /* + * Even when shared, DrillBufs are rewrapped, so we should never see the same instance twice. 
+ */ + final BaseAllocator otherOwner = buffersSeen.get(udle); + if (otherOwner != null) { + throw new IllegalStateException("This allocator's drillBuf already owned by another allocator"); + } + buffersSeen.put(udle, this); + + bufferTotal += udle.capacity(); + } + + // Preallocated space has to be accounted for + final Set reservationSet = reservations.keySet(); + long reservedTotal = 0; + for (final Reservation reservation : reservationSet) { + if (!reservation.isUsed()) { + reservedTotal += reservation.getSize(); + } + } + + if (bufferTotal + reservedTotal + childTotal != getAllocatedMemory()) { + final StringBuilder sb = new StringBuilder(); + sb.append("allocator["); + sb.append(name); + sb.append("]\nallocated: "); + sb.append(Long.toString(allocated)); + sb.append(" allocated - (bufferTotal + reservedTotal + childTotal): "); + sb.append(Long.toString(allocated - (bufferTotal + reservedTotal + childTotal))); + sb.append('\n'); + + if (bufferTotal != 0) { + sb.append("buffer total: "); + sb.append(Long.toString(bufferTotal)); + sb.append('\n'); + dumpBuffers(sb, ledgerSet); + } + + if (childTotal != 0) { + sb.append("child total: "); + sb.append(Long.toString(childTotal)); + sb.append('\n'); + + for (final BaseAllocator childAllocator : childSet) { + sb.append("child allocator["); + sb.append(childAllocator.name); + sb.append("] owned "); + sb.append(Long.toString(childAllocator.getAllocatedMemory())); + sb.append('\n'); + } + } + + if (reservedTotal != 0) { + sb.append(String.format("reserved total : %d bytes.", reservedTotal)); + for (final Reservation reservation : reservationSet) { + reservation.historicalLog.buildHistory(sb, 0, true); + sb.append('\n'); + } + } + + logger.debug(sb.toString()); + + final long allocated2 = getAllocatedMemory(); + + if (allocated2 != allocated) { + throw new IllegalStateException(String.format( + "allocator[%s]: allocated t1 (%d) + allocated t2 (%d). 
Someone released memory while in verification.", + name, allocated, allocated2)); + + } + throw new IllegalStateException(String.format( + "allocator[%s]: buffer space (%d) + prealloc space (%d) + child space (%d) != allocated (%d)", + name, bufferTotal, reservedTotal, childTotal, allocated)); + } + } + } + + void print(StringBuilder sb, int level, Verbosity verbosity) { + + indent(sb, level) + .append("Allocator(") + .append(name) + .append(") ") + .append(reservation) + .append('/') + .append(getAllocatedMemory()) + .append('/') + .append(getPeakMemoryAllocation()) + .append('/') + .append(getLimit()) + .append(" (res/actual/peak/limit)") + .append('\n'); + + if (DEBUG) { + indent(sb, level + 1).append(String.format("child allocators: %d\n", childAllocators.size())); + for (BaseAllocator child : childAllocators.keySet()) { + child.print(sb, level + 2, verbosity); + } + + indent(sb, level + 1).append(String.format("ledgers: %d\n", childLedgers.size())); + for (BufferLedger ledger : childLedgers.keySet()) { + ledger.print(sb, level + 2, verbosity); + } + + final Set reservations = this.reservations.keySet(); + indent(sb, level + 1).append(String.format("reservations: %d\n", reservations.size())); + for (final Reservation reservation : reservations) { + if (verbosity.includeHistoricalLog) { + reservation.historicalLog.buildHistory(sb, level + 3, true); + } + } + + } + + } + + private void dumpBuffers(final StringBuilder sb, final Set ledgerSet) { + for (final BufferLedger ledger : ledgerSet) { + if (!ledger.isOwningLedger()) { + continue; + } + final UnsafeDirectLittleEndian udle = ledger.getUnderlying(); + sb.append("UnsafeDirectLittleEndian[dentityHashCode == "); + sb.append(Integer.toString(System.identityHashCode(udle))); + sb.append("] size "); + sb.append(Integer.toString(udle.capacity())); + sb.append('\n'); + } + } + + + public static StringBuilder indent(StringBuilder sb, int indent) { + final char[] indentation = new char[indent * 2]; + Arrays.fill(indentation, ' '); + sb.append(indentation); + return sb; + } + + public static enum Verbosity { + BASIC(false, false), // only include basic information + LOG(true, false), // include basic + LOG_WITH_STACKTRACE(true, true) // + ; + + public final boolean includeHistoricalLog; + public final boolean includeStackTraces; + + Verbosity(boolean includeHistoricalLog, boolean includeStackTraces) { + this.includeHistoricalLog = includeHistoricalLog; + this.includeStackTraces = includeStackTraces; + } + } + + public static boolean isDebug() { + return DEBUG; + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BoundsChecking.java b/java/memory/src/main/java/org/apache/arrow/memory/BoundsChecking.java new file mode 100644 index 00000000000..4e88c734ab4 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/BoundsChecking.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +public class BoundsChecking { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BoundsChecking.class); + + public static final boolean BOUNDS_CHECKING_ENABLED; + + static { + boolean isAssertEnabled = false; + assert isAssertEnabled = true; + BOUNDS_CHECKING_ENABLED = isAssertEnabled + || !"true".equals(System.getProperty("drill.enable_unsafe_memory_access")); + } + + private BoundsChecking() { + } + +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java new file mode 100644 index 00000000000..16a68128b70 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/BufferAllocator.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +import io.netty.buffer.ByteBufAllocator; +import io.netty.buffer.ArrowBuf; + +/** + * Wrapper class to deal with byte buffer allocation. Ensures users only use designated methods. + */ +public interface BufferAllocator extends AutoCloseable { + /** + * Allocate a new or reused buffer of the provided size. Note that the buffer may technically be larger than the + * requested size for rounding purposes. However, the buffer's capacity will be set to the configured size. + * + * @param size + * The size in bytes. + * @return a new DrillBuf, or null if the request can't be satisfied + * @throws OutOfMemoryException + * if buffer cannot be allocated + */ + public ArrowBuf buffer(int size); + + /** + * Allocate a new or reused buffer of the provided size. Note that the buffer may technically be larger than the + * requested size for rounding purposes. However, the buffer's capacity will be set to the configured size. + * + * @param size + * The size in bytes. + * @param manager + * A buffer manager to manage reallocation. + * @return a new DrillBuf, or null if the request can't be satisfied + * @throws OutOfMemoryException + * if buffer cannot be allocated + */ + public ArrowBuf buffer(int size, BufferManager manager); + + /** + * Returns the allocator this allocator falls back to when it needs more memory. + * + * @return the underlying allocator used by this allocator + */ + public ByteBufAllocator getAsByteBufAllocator(); + + /** + * Create a new child allocator. 
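+   *
+   * <p>Illustrative usage (the variable names here are hypothetical):
+   * <pre>
+   * BufferAllocator child = root.newChildAllocator("op-1", 0, 1024 * 1024);
+   * </pre>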
+ * + * @param name + * the name of the allocator. + * @param initReservation + * the initial space reservation (obtained from this allocator) + * @param maxAllocation + * maximum amount of space the new allocator can allocate + * @return the new allocator, or null if it can't be created + */ + public BufferAllocator newChildAllocator(String name, long initReservation, long maxAllocation); + + /** + * Close and release all buffers generated from this buffer pool. + * + *
+   * <p>
When assertions are on, complains if there are any outstanding buffers; to avoid + * that, release all buffers before the allocator is closed. + */ + @Override + public void close(); + + /** + * Returns the amount of memory currently allocated from this allocator. + * + * @return the amount of memory currently allocated + */ + public long getAllocatedMemory(); + + /** + * Set the maximum amount of memory this allocator is allowed to allocate. + * + * @param newLimit + * The new Limit to apply to allocations + */ + public void setLimit(long newLimit); + + /** + * Return the current maximum limit this allocator imposes. + * + * @return Limit in number of bytes. + */ + public long getLimit(); + + /** + * Returns the peak amount of memory allocated from this allocator. + * + * @return the peak amount of memory allocated + */ + public long getPeakMemoryAllocation(); + + /** + * Create an allocation reservation. A reservation is a way of building up + * a request for a buffer whose size is not known in advance. See + * {@see AllocationReservation}. + * + * @return the newly created reservation + */ + public AllocationReservation newReservation(); + + /** + * Get a reference to the empty buffer associated with this allocator. Empty buffers are special because we don't + * worry about them leaking or managing reference counts on them since they don't actually point to any memory. + */ + public ArrowBuf getEmpty(); + + /** + * Return the name of this allocator. This is a human readable name that can help debugging. Typically provides + * coordinates about where this allocator was created + */ + public String getName(); + + /** + * Return whether or not this allocator (or one if its parents) is over its limits. In the case that an allocator is + * over its limit, all consumers of that allocator should aggressively try to addrss the overlimit situation. + */ + public boolean isOverLimit(); + + /** + * Return a verbose string describing this allocator. If in DEBUG mode, this will also include relevant stacktraces + * and historical logs for underlying objects + * + * @return A very verbose description of the allocator hierarchy. + */ + public String toVerboseString(); + + /** + * Asserts (using java assertions) that the provided allocator is currently open. If assertions are disabled, this is + * a no-op. + */ + public void assertOpen(); +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java b/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java new file mode 100644 index 00000000000..0610ff09276 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java @@ -0,0 +1,66 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.arrow.memory; + +import io.netty.buffer.ArrowBuf; + +/** + * Manages a list of {@link ArrowBuf}s that can be reallocated as needed. Upon + * re-allocation the old buffer will be freed. Managing a list of these buffers + * prevents some parts of the system from needing to define a correct location + * to place the final call to free them. + * + * The current uses of these types of buffers are within the pluggable components of Drill. + * In UDFs, memory management should not be a concern. We provide access to re-allocatable + * DrillBufs to give UDF writers general purpose buffers we can account for. To prevent the need + * for UDFs to contain boilerplate to close all of the buffers they request, this list + * is tracked at a higher level and all of the buffers are freed once we are sure that + * the code depending on them is done executing (currently {@link FragmentContext} + * and {@link QueryContext}. + */ +public interface BufferManager extends AutoCloseable { + + /** + * Replace an old buffer with a new version at least of the provided size. Does not copy data. + * + * @param old + * Old Buffer that the user is no longer going to use. + * @param newSize + * Size of new replacement buffer. + * @return + */ + public ArrowBuf replace(ArrowBuf old, int newSize); + + /** + * Get a managed buffer of indeterminate size. + * + * @return A buffer. + */ + public ArrowBuf getManagedBuffer(); + + /** + * Get a managed buffer of at least a certain size. + * + * @param size + * The desired size + * @return A buffer + */ + public ArrowBuf getManagedBuffer(int size); + + public void close(); +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java new file mode 100644 index 00000000000..6f120e5328b --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + + +/** + * Child allocator class. Only slightly different from the {@see RootAllocator}, + * in that these can't be created directly, but must be obtained from + * {@see BufferAllocator#newChildAllocator(AllocatorOwner, long, long, int)}. + + *
+ * <p>Child allocators can only be created by the root, or other children, so
+ * this class is package private.</p>
+ */
+class ChildAllocator extends BaseAllocator {
+  /**
+   * Constructor.
+   *
+   * @param parentAllocator parent allocator -- the one creating this child
+   * @param name the name of this child allocator
+   * @param initReservation initial amount of space to reserve (obtained from the parent)
+   * @param maxAllocation maximum amount of space that can be obtained from this allocator;
+   *   note this includes direct allocations (via {@see BufferAllocator#buffer(int, int)}
+   *   et al) and requests from descendant allocators
+   */
+  ChildAllocator(
+      BaseAllocator parentAllocator,
+      String name,
+      long initReservation,
+      long maxAllocation) {
+    super(parentAllocator, name, initReservation, maxAllocation);
+  }
+
+}
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java
new file mode 100644
index 00000000000..23d644841e1
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.memory;
+
+import io.netty.buffer.ByteBuf;
+import io.netty.buffer.ByteBufAllocator;
+import io.netty.buffer.CompositeByteBuf;
+import io.netty.buffer.ExpandableByteBuf;
+
+/**
+ * An implementation of ByteBufAllocator that wraps an Arrow BufferAllocator. This allows the RPC layer to be accounted
+ * and managed using Arrow's BufferAllocator infrastructure. The only thing different from a typical BufferAllocator is
+ * the signature and the fact that this Allocator returns ExpandableByteBufs, which enable otherwise non-expandable
+ * ArrowBufs to be expandable.
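+ *
+ * <p>Illustrative usage (names are hypothetical, not part of this patch):
+ * <pre>
+ * ByteBufAllocator nettyAllocator = new DrillByteBufAllocator(allocator);
+ * ByteBuf buf = nettyAllocator.buffer(4096); // accounted against the wrapped allocator
+ * </pre>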
+ */ +public class DrillByteBufAllocator implements ByteBufAllocator { + + private static final int DEFAULT_BUFFER_SIZE = 4096; + private static final int DEFAULT_MAX_COMPOSITE_COMPONENTS = 16; + + private final BufferAllocator allocator; + + public DrillByteBufAllocator(BufferAllocator allocator) { + this.allocator = allocator; + } + + @Override + public ByteBuf buffer() { + return buffer(DEFAULT_BUFFER_SIZE); + } + + @Override + public ByteBuf buffer(int initialCapacity) { + return new ExpandableByteBuf(allocator.buffer(initialCapacity), allocator); + } + + @Override + public ByteBuf buffer(int initialCapacity, int maxCapacity) { + return buffer(initialCapacity); + } + + @Override + public ByteBuf ioBuffer() { + return buffer(); + } + + @Override + public ByteBuf ioBuffer(int initialCapacity) { + return buffer(initialCapacity); + } + + @Override + public ByteBuf ioBuffer(int initialCapacity, int maxCapacity) { + return buffer(initialCapacity); + } + + @Override + public ByteBuf directBuffer() { + return buffer(); + } + + @Override + public ByteBuf directBuffer(int initialCapacity) { + return allocator.buffer(initialCapacity); + } + + @Override + public ByteBuf directBuffer(int initialCapacity, int maxCapacity) { + return buffer(initialCapacity, maxCapacity); + } + + @Override + public CompositeByteBuf compositeBuffer() { + return compositeBuffer(DEFAULT_MAX_COMPOSITE_COMPONENTS); + } + + @Override + public CompositeByteBuf compositeBuffer(int maxNumComponents) { + return new CompositeByteBuf(this, true, maxNumComponents); + } + + @Override + public CompositeByteBuf compositeDirectBuffer() { + return compositeBuffer(); + } + + @Override + public CompositeByteBuf compositeDirectBuffer(int maxNumComponents) { + return compositeBuffer(maxNumComponents); + } + + @Override + public boolean isDirectBufferPooled() { + return false; + } + + @Override + public ByteBuf heapBuffer() { + throw fail(); + } + + @Override + public ByteBuf heapBuffer(int initialCapacity) { + throw fail(); + } + + @Override + public ByteBuf heapBuffer(int initialCapacity, int maxCapacity) { + throw fail(); + } + + @Override + public CompositeByteBuf compositeHeapBuffer() { + throw fail(); + } + + @Override + public CompositeByteBuf compositeHeapBuffer(int maxNumComponents) { + throw fail(); + } + + private RuntimeException fail() { + throw new UnsupportedOperationException("Allocator doesn't support heap-based memory."); + } + +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/OutOfMemoryException.java b/java/memory/src/main/java/org/apache/arrow/memory/OutOfMemoryException.java new file mode 100644 index 00000000000..6ba0284d8d4 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/OutOfMemoryException.java @@ -0,0 +1,50 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.memory;
+
+
+public class OutOfMemoryException extends RuntimeException {
+  private static final long serialVersionUID = -6858052345185793382L;
+
+  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(OutOfMemoryException.class);
+
+  public OutOfMemoryException() {
+    super();
+  }
+
+  public OutOfMemoryException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) {
+    super(message, cause, enableSuppression, writableStackTrace);
+  }
+
+  public OutOfMemoryException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  public OutOfMemoryException(String message) {
+    super(message);
+  }
+
+  public OutOfMemoryException(Throwable cause) {
+    super(cause);
+  }
+
+}
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/README.md b/java/memory/src/main/java/org/apache/arrow/memory/README.md
new file mode 100644
index 00000000000..09e4257ed0f
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/README.md
@@ -0,0 +1,121 @@
+
+# Memory: Allocation, Accounting and Management
+
+The memory management package contains all the memory allocation related items that Arrow uses to manage memory.
+
+
+## Key Components
+Memory management can be broken into the following main components:
+
+- Memory chunk allocation and fragmentation management
+  - `PooledByteBufAllocatorL` - A LittleEndian clone of Netty's jemalloc implementation
+  - `UnsafeDirectLittleEndian` - A base level memory access interface
+  - `LargeBuffer` - A buffer backing implementation used when working with data larger than one Netty chunk (defaults to 16MB)
+- Memory limits & Accounting
+  - `Accountant` - A nestable class of lock-free memory accountants.
+- Application-level memory allocation
+  - `BufferAllocator` - The public interface application users should be leveraging
+  - `BaseAllocator` - The base implementation of memory allocation, contains the meat of the Arrow allocator implementation
+  - `RootAllocator` - The root allocator. Typically only one is created per JVM
+  - `ChildAllocator` - A child allocator that derives from the root allocator
+- Buffer ownership and transfer capabilities
+  - `AllocationManager` - Responsible for managing the relationship between multiple allocators and a single chunk of memory
+  - `BufferLedger` - Responsible for maintaining the relationship between an `AllocationManager`, a `BufferAllocator` and one or more individual `ArrowBuf`s
+- Memory access
+  - `ArrowBuf` - The facade for interacting directly with a chunk of memory.
+
+
+## Memory Management Overview
+Arrow's memory model is based on the following basic concepts:
+
+ - Memory can be allocated up to some limit. That limit could be a real limit (OS/JVM) or a locally imposed limit.
+ - Allocation operates in two phases: accounting then actual allocation. Allocation could fail at either point.
+ - Allocation failure should be recoverable. In all cases, the Allocator infrastructure should expose memory allocation failures (OS or internal limit-based) as `OutOfMemoryException`s.
+ - Any allocator can reserve memory when created. This memory shall be held such that this allocator will always be able to allocate that amount of memory.
+ - A particular application component should use a local allocator so that it can track local memory usage and better debug memory leaks.
+ - The same physical memory can be shared by multiple allocators and the allocator must provide an accounting paradigm for this purpose.
+
+## Allocator Trees
+
+Arrow provides a tree-based model for memory allocation. The RootAllocator is created first, then all other allocators are created as children of that tree. The RootAllocator is responsible for being the master bookkeeper for memory allocations. Each allocator can first determine whether it has enough local memory to satisfy a particular request. If not, the allocator can ask its parent for an additional memory allocation.
+
+## Reserving Memory
+
+Arrow provides two different ways to reserve memory:
+
+ - BufferAllocator accounting reservations: When a new allocator (other than the `RootAllocator`) is initialized, it can set aside memory that it will keep locally for its lifetime. This is memory that will never be released back to its parent allocator until the allocator is closed.
+ - `AllocationReservation` via BufferAllocator.newReservation(): Allows a short-term preallocation strategy so that a particular subsystem can ensure future memory is available to support a particular request.
+
+## Memory Ownership, Reference Counts and Sharing
+Many BufferAllocators can reference the same piece of memory at the same time. The most common situation for this is in the case of a Broadcast Join: in this situation many downstream operators in the same process will receive the same physical memory. Each of these operators will be operating within its own Allocator context. We therefore have multiple allocators all pointing at the same physical memory. It is the AllocationManager's responsibility to ensure that, in this situation, all memory is accurately accounted for from the Root's perspective and also to ensure that the memory is correctly released once all BufferAllocators have stopped using that memory.
+
+For simplicity of accounting, we treat that memory as being used by one of the BufferAllocators associated with the memory. When that allocator releases its claim on that memory, the memory ownership is then moved to another BufferLedger belonging to the same AllocationManager. Note that because an ArrowBuf.release() is what actually causes memory ownership transfer to occur, we always proceed with ownership transfer (even if that violates an allocator limit). It is the responsibility of the application owning a particular allocator to frequently confirm whether the allocator is over its memory limit (BufferAllocator.isOverLimit()) and, if so, attempt to aggressively release memory to ameliorate the situation.
+
+All ArrowBufs (direct or sliced) related to a single BufferLedger/BufferAllocator combination share the same reference count and either all will be valid or all will be invalid.
+
+## Object Hierarchy
+
+There are two main ways that someone can look at the object hierarchy for Arrow's memory management scheme. The first is a memory-based perspective, as below:
+
+### Memory Perspective
+<pre>
++ AllocationManager
+|
+|-- UnsafeDirectLittleEndian (One per AllocationManager)
+|
+|-+ BufferLedger 1 ==> Allocator A (owning)
+| ` - ArrowBuf 1
+|-+ BufferLedger 2 ==> Allocator B (non-owning)
+| ` - ArrowBuf 2
+|-+ BufferLedger 3 ==> Allocator C (non-owning)
+  | - ArrowBuf 3
+  | - ArrowBuf 4
+  ` - ArrowBuf 5
+</pre>
+
+In this picture, a piece of memory is owned by an AllocationManager. The AllocationManager is responsible for that piece of memory no matter which allocator(s) it is working with, and has relationships both with the piece of raw memory (via its reference to UnsafeDirectLittleEndian) and with each BufferAllocator it is associated with.
+
+### Allocator Perspective
+<pre>
++ RootAllocator
+|-+ ChildAllocator 1
+| | - ChildAllocator 1.1
+| ` ...
+|
+|-+ ChildAllocator 2
+|-+ ChildAllocator 3
+| |
+| |-+ BufferLedger 1 ==> AllocationManager 1 (owning) ==> UDLE
+| | `- ArrowBuf 1
+| `-+ BufferLedger 2 ==> AllocationManager 2 (non-owning)==> UDLE
+| 	`- ArrowBuf 2
+|
+|-+ BufferLedger 3 ==> AllocationManager 1 (non-owning)==> UDLE
+| ` - ArrowBuf 3
+|-+ BufferLedger 4 ==> AllocationManager 2 (owning) ==> UDLE
+  | - ArrowBuf 4
+  | - ArrowBuf 5
+  ` - ArrowBuf 6
+</pre>
+
+In this picture, a RootAllocator owns three ChildAllocators. The first ChildAllocator (ChildAllocator 1) owns a subsequent ChildAllocator. ChildAllocator 3 holds references to two BufferLedgers and their AllocationManagers. Coincidentally, each of these AllocationManagers is also associated with the RootAllocator. In this case, one of these AllocationManagers is owned by ChildAllocator 3 (AllocationManager 1) while the other (AllocationManager 2) is owned/accounted for by the RootAllocator. Note that in this scenario, ArrowBuf 1 shares its underlying memory with ArrowBuf 3, although the subsets of that memory they reference (e.g. through slicing) might differ. Similarly, ArrowBuf 2 shares its underlying memory with ArrowBufs 4, 5 and 6, and ArrowBufs 4, 5 and 6 all share the same reference count and fate.
+
+## Debugging Issues
+The Allocator object provides a useful set of tools to better understand the status of the allocator. If in `DEBUG` mode, the allocator and supporting classes will record additional debug tracking information to better track down memory leaks and issues. To enable DEBUG mode, either enable Java assertions with `-ea` or pass the following system property to the VM when starting: `-Darrow.memory.debug.allocator=true`. The BufferAllocator also provides a `BufferAllocator.toVerboseString()` method which can be used in DEBUG mode to get extensive stacktrace information and events associated with various Allocator behaviors.
\ No newline at end of file
diff --git a/java/memory/src/main/java/org/apache/arrow/memory/RootAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/RootAllocator.java
new file mode 100644
index 00000000000..571fc375772
--- /dev/null
+++ b/java/memory/src/main/java/org/apache/arrow/memory/RootAllocator.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.memory;
+
+import com.google.common.annotations.VisibleForTesting;
+
+/**
+ * The root allocator for using direct memory inside an application. Supports creating a
+ * tree of descendant child allocators.
+ */
+public class RootAllocator extends BaseAllocator {
+
+  public RootAllocator(final long limit) {
+    super(null, "ROOT", 0, limit);
+  }
+
+  /**
+   * Verify the accounting state of the allocation system.
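+   * Note that deep verification only happens when the allocator is in DEBUG mode.
+   *
+   * <p>Illustrative usage (hypothetical):
+   * <pre>
+   * try (RootAllocator root = new RootAllocator(1024 * 1024)) {
+   *   // ... allocate and release buffers ...
+   *   root.verify(); // in DEBUG mode, throws IllegalStateException if accounting is inconsistent
+   * }
+   * </pre>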
+ */ + @VisibleForTesting + public void verify() { + verifyAllocator(); + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/package-info.java b/java/memory/src/main/java/org/apache/arrow/memory/package-info.java new file mode 100644 index 00000000000..712af3026e2 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/package-info.java @@ -0,0 +1,24 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + * Memory Allocation, Account and Management + * + * See the README.md file in this directory for detailed information about Drill's memory allocation subsystem. + * + */ +package org.apache.arrow.memory; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/AssertionUtil.java b/java/memory/src/main/java/org/apache/arrow/memory/util/AssertionUtil.java new file mode 100644 index 00000000000..28d07852897 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/AssertionUtil.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory.util; + +public class AssertionUtil { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AssertionUtil.class); + + public static final boolean ASSERT_ENABLED; + + static{ + boolean isAssertEnabled = false; + assert isAssertEnabled = true; + ASSERT_ENABLED = isAssertEnabled; + } + + public static boolean isAssertionsEnabled(){ + return ASSERT_ENABLED; + } + + private AssertionUtil() { + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/AutoCloseableLock.java b/java/memory/src/main/java/org/apache/arrow/memory/util/AutoCloseableLock.java new file mode 100644 index 00000000000..94e5cc5fded --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/AutoCloseableLock.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory.util; + +import java.util.concurrent.locks.Lock; + +/** + * Simple wrapper class that allows Locks to be released via an try-with-resources block. + */ +public class AutoCloseableLock implements AutoCloseable { + + private final Lock lock; + + public AutoCloseableLock(Lock lock) { + this.lock = lock; + } + + public AutoCloseableLock open() { + lock.lock(); + return this; + } + + @Override + public void close() { + lock.unlock(); + } + +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java new file mode 100644 index 00000000000..38cb779343a --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java @@ -0,0 +1,185 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory.util; + +import java.util.Arrays; +import java.util.LinkedList; + +import org.slf4j.Logger; + +/** + * Utility class that can be used to log activity within a class + * for later logging and debugging. Supports recording events and + * recording the stack at the time they occur. + */ +public class HistoricalLog { + private static class Event { + private final String note; // the event text + private final StackTrace stackTrace; // where the event occurred + private final long time; + + public Event(final String note) { + this.note = note; + this.time = System.nanoTime(); + stackTrace = new StackTrace(); + } + } + + private final LinkedList history = new LinkedList<>(); + private final String idString; // the formatted id string + private Event firstEvent; // the first stack trace recorded + private final int limit; // the limit on the number of events kept + + /** + * Constructor. The format string will be formatted and have its arguments + * substituted at the time this is called. + * + * @param idStringFormat {@link String#format} format string that can be used + * to identify this object in a log. 
+   * Including some kind of unique identifier
+   * that can be associated with the object instance is best.
+   * @param args for the format string, or nothing if none are required
+   */
+  public HistoricalLog(final String idStringFormat, Object... args) {
+    this(Integer.MAX_VALUE, idStringFormat, args);
+  }
+
+  /**
+   * Constructor. The format string will be formatted and have its arguments
+   * substituted at the time this is called.
+   *
+   * <p>This form supports the specification of a limit that will limit the
+   * number of historical entries kept (which keeps down the amount of memory
+   * used). With the limit, the first entry made is always kept (under the
+   * assumption that this is the creation site of the object, which is usually
+   * interesting), and then up to the limit number of entries are kept after that.
+   * Each time a new entry is made, the oldest that is not the first is dropped.</p>
+ * + * @param limit the maximum number of historical entries that will be kept, + * not including the first entry made + * @param idStringFormat {@link String#format} format string that can be used + * to identify this object in a log. Including some kind of unique identifier + * that can be associated with the object instance is best. + * @param args for the format string, or nothing if none are required + */ + public HistoricalLog(final int limit, final String idStringFormat, Object... args) { + this.limit = limit; + this.idString = String.format(idStringFormat, args); + } + + /** + * Record an event. Automatically captures the stack trace at the time this is + * called. The format string will be formatted and have its arguments substituted + * at the time this is called. + * + * @param noteFormat {@link String#format} format string that describes the event + * @param args for the format string, or nothing if none are required + */ + public synchronized void recordEvent(final String noteFormat, Object... args) { + final String note = String.format(noteFormat, args); + final Event event = new Event(note); + if (firstEvent == null) { + firstEvent = event; + } + if (history.size() == limit) { + history.removeFirst(); + } + history.add(event); + } + + /** + * Write the history of this object to the given {@link StringBuilder}. The history + * includes the identifying string provided at construction time, and all the recorded + * events with their stack traces. + * + * @param sb {@link StringBuilder} to write to + */ + public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { + buildHistory(sb, 0, includeStackTrace); + } + + /** + * Write the history of this object to the given {@link StringBuilder}. The history + * includes the identifying string provided at construction time, and all the recorded + * events with their stack traces. + * + * @param sb {@link StringBuilder} to write to + * @param additional an extra string that will be written between the identifying + * information and the history; often used for a current piece of state + */ + + /** + * + * @param sb + * @param indexLevel + * @param includeStackTrace + */ + public synchronized void buildHistory(final StringBuilder sb, int indent, boolean includeStackTrace) { + final char[] indentation = new char[indent]; + final char[] innerIndentation = new char[indent + 2]; + Arrays.fill(indentation, ' '); + Arrays.fill(innerIndentation, ' '); + + sb.append(indentation) + .append("event log for: ") + .append(idString) + .append('\n'); + + + if (firstEvent != null) { + sb.append(innerIndentation) + .append(firstEvent.time) + .append(' ') + .append(firstEvent.note) + .append('\n'); + if (includeStackTrace) { + firstEvent.stackTrace.writeToBuilder(sb, indent + 2); + } + + for(final Event event : history) { + if (event == firstEvent) { + continue; + } + sb.append(innerIndentation) + .append(" ") + .append(event.time) + .append(' ') + .append(event.note) + .append('\n'); + + if (includeStackTrace) { + event.stackTrace.writeToBuilder(sb, indent + 2); + sb.append('\n'); + } + } + } + } + + /** + * Write the history of this object to the given {@link Logger}. The history + * includes the identifying string provided at construction time, and all the recorded + * events with their stack traces. 
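+   *
+   * <p>Illustrative usage (the names here are hypothetical, not part of this patch):
+   * <pre>
+   * HistoricalLog log = new HistoricalLog(6, "buffer[%d]", 42);
+   * log.recordEvent("created");
+   * log.logHistory(logger); // writes all recorded events at debug level
+   * </pre>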
+ * + * @param logger {@link Logger} to write to + */ + public void logHistory(final Logger logger) { + final StringBuilder sb = new StringBuilder(); + buildHistory(sb, 0, true); + logger.debug(sb.toString()); + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/Metrics.java b/java/memory/src/main/java/org/apache/arrow/memory/util/Metrics.java new file mode 100644 index 00000000000..5177a2478b5 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/Metrics.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory.util; + +import com.codahale.metrics.MetricRegistry; + +public class Metrics { + + private Metrics() { + + } + + private static class RegistryHolder { + public static final MetricRegistry REGISTRY; + + static { + REGISTRY = new MetricRegistry(); + } + + } + + public static MetricRegistry getInstance() { + return RegistryHolder.REGISTRY; + } +} \ No newline at end of file diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/Pointer.java b/java/memory/src/main/java/org/apache/arrow/memory/util/Pointer.java new file mode 100644 index 00000000000..58ab13b0a16 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/Pointer.java @@ -0,0 +1,28 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory.util; + +public class Pointer { + public T value; + + public Pointer(){} + + public Pointer(T value){ + this.value = value; + } +} diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/StackTrace.java b/java/memory/src/main/java/org/apache/arrow/memory/util/StackTrace.java new file mode 100644 index 00000000000..638c2fb9a95 --- /dev/null +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/StackTrace.java @@ -0,0 +1,70 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.memory.util;
+
+import java.util.Arrays;
+
+/**
+ * Convenient way of obtaining and manipulating stack traces for debugging.
+ */
+public class StackTrace {
+ private final StackTraceElement[] stackTraceElements;
+
+ /**
+ * Constructor. Captures the current stack trace.
+ */
+ public StackTrace() {
+ // skip the getStackTrace() frame and this constructor's frame so that
+ // the captured trace starts at the caller
+ final StackTraceElement[] stack = Thread.currentThread().getStackTrace();
+ stackTraceElements = Arrays.copyOfRange(stack, 2, stack.length);
+ }
+
+ /**
+ * Write the stack trace to a StringBuilder.
+ * @param sb
+ * where to write it
+ * @param indent
+ * how many double spaces to indent each line
+ */
+ public void writeToBuilder(final StringBuilder sb, final int indent) {
+ // create the indentation string
+ final char[] indentation = new char[indent * 2];
+ Arrays.fill(indentation, ' ');
+
+ // write the stack trace in standard Java format
+ for(StackTraceElement ste : stackTraceElements) {
+ sb.append(indentation)
+ .append("at ")
+ .append(ste.getClassName())
+ .append('.')
+ .append(ste.getMethodName())
+ .append('(')
+ .append(ste.getFileName())
+ .append(':')
+ .append(Integer.toString(ste.getLineNumber()))
+ .append(")\n");
+ }
+ }
+
+ @Override
+ public String toString() {
+ final StringBuilder sb = new StringBuilder();
+ writeToBuilder(sb, 0);
+ return sb.toString();
+ }
+}
diff --git a/java/memory/src/main/resources/drill-module.conf b/java/memory/src/main/resources/drill-module.conf
new file mode 100644
index 00000000000..593ef8e41e7
--- /dev/null
+++ b/java/memory/src/main/resources/drill-module.conf
@@ -0,0 +1,25 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file tells Drill to consider this module when class path scanning.
+// This file can also include any supplementary configuration information.
+// This file is in HOCON format, see https://github.com/typesafehub/config/blob/master/HOCON.md for more information.
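+//
+// The two settings below configure the memory subsystem: "debug.error_on_leak"
+// presumably causes an exception to be thrown when a leaked allocation is
+// detected on close, and "top.max" presumably caps, in bytes, the total
+// memory the root allocator may hand out.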
+drill: { + memory: { + debug.error_on_leak: true, + top.max: 1000000000000 + } + +} diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java new file mode 100644 index 00000000000..86bccf5064a --- /dev/null +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestAccountant.java @@ -0,0 +1,164 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.memory; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.Accountant; +import org.apache.arrow.memory.Accountant.AllocationOutcome; +import org.junit.Assert; +import org.junit.Test; + +public class TestAccountant { + + @Test + public void basic() { + ensureAccurateReservations(null); + } + + @Test + public void nested() { + final Accountant parent = new Accountant(null, 0, Long.MAX_VALUE); + ensureAccurateReservations(parent); + assertEquals(0, parent.getAllocatedMemory()); + } + + @Test + public void multiThread() throws InterruptedException { + final Accountant parent = new Accountant(null, 0, Long.MAX_VALUE); + + final int numberOfThreads = 32; + final int loops = 100; + Thread[] threads = new Thread[numberOfThreads]; + + for (int i = 0; i < numberOfThreads; i++) { + Thread t = new Thread() { + + @Override + public void run() { + try { + for (int i = 0; i < loops; i++) { + ensureAccurateReservations(parent); + } + } catch (Exception ex) { + ex.printStackTrace(); + Assert.fail(ex.getMessage()); + } + } + + }; + threads[i] = t; + t.start(); + } + + for (Thread thread : threads) { + thread.join(); + } + + assertEquals(0, parent.getAllocatedMemory()); + } + + private void ensureAccurateReservations(Accountant outsideParent) { + final Accountant parent = new Accountant(outsideParent, 0, 10); + assertEquals(0, parent.getAllocatedMemory()); + + final Accountant child = new Accountant(parent, 2, Long.MAX_VALUE); + assertEquals(2, parent.getAllocatedMemory()); + + { + AllocationOutcome first = child.allocateBytes(1); + assertEquals(AllocationOutcome.SUCCESS, first); + } + + // child will have new allocation + assertEquals(1, child.getAllocatedMemory()); + + // root has no change since within reservation + assertEquals(2, parent.getAllocatedMemory()); + + { + AllocationOutcome first = child.allocateBytes(1); + assertEquals(AllocationOutcome.SUCCESS, first); + } + + // child will have new allocation + assertEquals(2, child.getAllocatedMemory()); + + // root has no change since within reservation + assertEquals(2, parent.getAllocatedMemory()); + + child.releaseBytes(1); + + // child will have new allocation + assertEquals(1, child.getAllocatedMemory()); + + // root has no change since within reservation + assertEquals(2, 
parent.getAllocatedMemory());
+
+ {
+ AllocationOutcome first = child.allocateBytes(2);
+ assertEquals(AllocationOutcome.SUCCESS, first);
+ }
+
+ // child will have new allocation
+ assertEquals(3, child.getAllocatedMemory());
+
+ // went beyond reservation, now in parent accountant
+ assertEquals(3, parent.getAllocatedMemory());
+
+ {
+ AllocationOutcome first = child.allocateBytes(7);
+ assertEquals(AllocationOutcome.SUCCESS, first);
+ }
+
+ // child will have new allocation
+ assertEquals(10, child.getAllocatedMemory());
+
+ // went beyond reservation, now in parent accountant
+ assertEquals(10, parent.getAllocatedMemory());
+
+ child.releaseBytes(9);
+
+ assertEquals(1, child.getAllocatedMemory());
+
+ // back to reservation size
+ assertEquals(2, parent.getAllocatedMemory());
+
+ AllocationOutcome first = child.allocateBytes(10);
+ assertEquals(AllocationOutcome.FAILED_PARENT, first);
+
+ // unchanged
+ assertEquals(1, child.getAllocatedMemory());
+ assertEquals(2, parent.getAllocatedMemory());
+
+ boolean withinLimit = child.forceAllocate(10);
+ assertEquals(false, withinLimit);
+
+ // at new limit
+ assertEquals(11, child.getAllocatedMemory());
+ assertEquals(11, parent.getAllocatedMemory());
+
+
+ child.releaseBytes(11);
+ assertEquals(0, child.getAllocatedMemory());
+ assertEquals(2, parent.getAllocatedMemory());
+
+ child.close();
+ parent.close();
+ }
+}
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java
new file mode 100644
index 00000000000..e13dabb9533
--- /dev/null
+++ b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java
@@ -0,0 +1,648 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.memory; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.ArrowBuf.TransferResult; + +import org.apache.arrow.memory.AllocationReservation; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.memory.RootAllocator; +import org.junit.Ignore; +import org.junit.Test; + +public class TestBaseAllocator { + // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestBaseAllocator.class); + + private final static int MAX_ALLOCATION = 8 * 1024; + +/* + // ---------------------------------------- DEBUG ----------------------------------- + + @After + public void checkBuffers() { + final int bufferCount = UnsafeDirectLittleEndian.getBufferCount(); + if (bufferCount != 0) { + UnsafeDirectLittleEndian.logBuffers(logger); + UnsafeDirectLittleEndian.releaseBuffers(); + } + + assertEquals(0, bufferCount); + } + +// @AfterClass +// public static void dumpBuffers() { +// UnsafeDirectLittleEndian.logBuffers(logger); +// } + + // ---------------------------------------- DEBUG ------------------------------------ +*/ + + + @Test + public void test_privateMax() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + final ArrowBuf drillBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); + assertNotNull("allocation failed", drillBuf1); + + try(final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("noLimits", 0, MAX_ALLOCATION)) { + final ArrowBuf drillBuf2 = childAllocator.buffer(MAX_ALLOCATION / 2); + assertNotNull("allocation failed", drillBuf2); + drillBuf2.release(); + } + + drillBuf1.release(); + } + } + + @Test(expected=IllegalStateException.class) + public void testRootAllocator_closeWithOutstanding() throws Exception { + try { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + final ArrowBuf drillBuf = rootAllocator.buffer(512); + assertNotNull("allocation failed", drillBuf); + } + } finally { + /* + * We expect there to be one unreleased underlying buffer because we're closing + * without releasing it. 
+ */ +/* + // ------------------------------- DEBUG --------------------------------- + final int bufferCount = UnsafeDirectLittleEndian.getBufferCount(); + UnsafeDirectLittleEndian.releaseBuffers(); + assertEquals(1, bufferCount); + // ------------------------------- DEBUG --------------------------------- +*/ + } + } + + @Test + public void testRootAllocator_getEmpty() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + final ArrowBuf drillBuf = rootAllocator.buffer(0); + assertNotNull("allocation failed", drillBuf); + assertEquals("capacity was non-zero", 0, drillBuf.capacity()); + drillBuf.release(); + } + } + + @Ignore // TODO(DRILL-2740) + @Test(expected = IllegalStateException.class) + public void testAllocator_unreleasedEmpty() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + @SuppressWarnings("unused") + final ArrowBuf drillBuf = rootAllocator.buffer(0); + } + } + + @Test + public void testAllocator_transferOwnership() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator1 = + rootAllocator.newChildAllocator("changeOwnership1", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator2 = + rootAllocator.newChildAllocator("changeOwnership2", 0, MAX_ALLOCATION); + + final ArrowBuf drillBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 4); + rootAllocator.verify(); + TransferResult transferOwnership = drillBuf1.transferOwnership(childAllocator2); + final boolean allocationFit = transferOwnership.allocationFit; + rootAllocator.verify(); + assertTrue(allocationFit); + + drillBuf1.release(); + childAllocator1.close(); + rootAllocator.verify(); + + transferOwnership.buffer.release(); + childAllocator2.close(); + } + } + + @Test + public void testAllocator_shareOwnership() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("shareOwnership1", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator2 = rootAllocator.newChildAllocator("shareOwnership2", 0, MAX_ALLOCATION); + final ArrowBuf drillBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 4); + rootAllocator.verify(); + + // share ownership of buffer. + final ArrowBuf drillBuf2 = drillBuf1.retain(childAllocator2); + rootAllocator.verify(); + assertNotNull(drillBuf2); + assertNotEquals(drillBuf2, drillBuf1); + + // release original buffer (thus transferring ownership to allocator 2. 
(should leave allocator 1 in empty state) + drillBuf1.release(); + rootAllocator.verify(); + childAllocator1.close(); + rootAllocator.verify(); + + final BufferAllocator childAllocator3 = rootAllocator.newChildAllocator("shareOwnership3", 0, MAX_ALLOCATION); + final ArrowBuf drillBuf3 = drillBuf1.retain(childAllocator3); + assertNotNull(drillBuf3); + assertNotEquals(drillBuf3, drillBuf1); + assertNotEquals(drillBuf3, drillBuf2); + rootAllocator.verify(); + + drillBuf2.release(); + rootAllocator.verify(); + childAllocator2.close(); + rootAllocator.verify(); + + drillBuf3.release(); + rootAllocator.verify(); + childAllocator3.close(); + } + } + + @Test + public void testRootAllocator_createChildAndUse() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createChildAndUse", 0, + MAX_ALLOCATION)) { + final ArrowBuf drillBuf = childAllocator.buffer(512); + assertNotNull("allocation failed", drillBuf); + drillBuf.release(); + } + } + } + + @Test(expected=IllegalStateException.class) + public void testRootAllocator_createChildDontClose() throws Exception { + try { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createChildDontClose", 0, + MAX_ALLOCATION); + final ArrowBuf drillBuf = childAllocator.buffer(512); + assertNotNull("allocation failed", drillBuf); + } + } finally { + /* + * We expect one underlying buffer because we closed a child allocator without + * releasing the buffer allocated from it. + */ +/* + // ------------------------------- DEBUG --------------------------------- + final int bufferCount = UnsafeDirectLittleEndian.getBufferCount(); + UnsafeDirectLittleEndian.releaseBuffers(); + assertEquals(1, bufferCount); + // ------------------------------- DEBUG --------------------------------- +*/ + } + } + + private static void allocateAndFree(final BufferAllocator allocator) { + final ArrowBuf drillBuf = allocator.buffer(512); + assertNotNull("allocation failed", drillBuf); + drillBuf.release(); + + final ArrowBuf drillBuf2 = allocator.buffer(MAX_ALLOCATION); + assertNotNull("allocation failed", drillBuf2); + drillBuf2.release(); + + final int nBufs = 8; + final ArrowBuf[] drillBufs = new ArrowBuf[nBufs]; + for(int i = 0; i < drillBufs.length; ++i) { + ArrowBuf drillBufi = allocator.buffer(MAX_ALLOCATION / nBufs); + assertNotNull("allocation failed", drillBufi); + drillBufs[i] = drillBufi; + } + for(ArrowBuf drillBufi : drillBufs) { + drillBufi.release(); + } + } + + @Test + public void testAllocator_manyAllocations() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try(final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("manyAllocations", 0, MAX_ALLOCATION)) { + allocateAndFree(childAllocator); + } + } + } + + @Test + public void testAllocator_overAllocate() throws Exception { + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try(final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocate", 0, MAX_ALLOCATION)) { + allocateAndFree(childAllocator); + + try { + childAllocator.buffer(MAX_ALLOCATION + 1); + fail("allocated memory beyond max allowed"); + } catch (OutOfMemoryException e) { + // expected + } + } + } + } + + @Test + public void testAllocator_overAllocateParent() throws Exception { + try(final RootAllocator 
rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + try(final BufferAllocator childAllocator = + rootAllocator.newChildAllocator("overAllocateParent", 0, MAX_ALLOCATION)) { + final ArrowBuf drillBuf1 = rootAllocator.buffer(MAX_ALLOCATION / 2); + assertNotNull("allocation failed", drillBuf1); + final ArrowBuf drillBuf2 = childAllocator.buffer(MAX_ALLOCATION / 2); + assertNotNull("allocation failed", drillBuf2); + + try { + childAllocator.buffer(MAX_ALLOCATION / 4); + fail("allocated memory beyond max allowed"); + } catch (OutOfMemoryException e) { + // expected + } + + drillBuf1.release(); + drillBuf2.release(); + } + } + } + + private static void testAllocator_sliceUpBufferAndRelease( + final RootAllocator rootAllocator, final BufferAllocator bufferAllocator) { + final ArrowBuf drillBuf1 = bufferAllocator.buffer(MAX_ALLOCATION / 2); + rootAllocator.verify(); + + final ArrowBuf drillBuf2 = drillBuf1.slice(16, drillBuf1.capacity() - 32); + rootAllocator.verify(); + final ArrowBuf drillBuf3 = drillBuf2.slice(16, drillBuf2.capacity() - 32); + rootAllocator.verify(); + @SuppressWarnings("unused") + final ArrowBuf drillBuf4 = drillBuf3.slice(16, drillBuf3.capacity() - 32); + rootAllocator.verify(); + + drillBuf3.release(); // since they share refcounts, one is enough to release them all + rootAllocator.verify(); + } + + @Test + public void testAllocator_createSlices() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + testAllocator_sliceUpBufferAndRelease(rootAllocator, rootAllocator); + + try (final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { + testAllocator_sliceUpBufferAndRelease(rootAllocator, childAllocator); + } + rootAllocator.verify(); + + testAllocator_sliceUpBufferAndRelease(rootAllocator, rootAllocator); + + try (final BufferAllocator childAllocator = rootAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator2 = + childAllocator.newChildAllocator("createSlices", 0, MAX_ALLOCATION)) { + final ArrowBuf drillBuf1 = childAllocator2.buffer(MAX_ALLOCATION / 8); + @SuppressWarnings("unused") + final ArrowBuf drillBuf2 = drillBuf1.slice(MAX_ALLOCATION / 16, MAX_ALLOCATION / 16); + testAllocator_sliceUpBufferAndRelease(rootAllocator, childAllocator); + drillBuf1.release(); + rootAllocator.verify(); + } + rootAllocator.verify(); + + testAllocator_sliceUpBufferAndRelease(rootAllocator, childAllocator); + } + rootAllocator.verify(); + } + } + + @Test + public void testAllocator_sliceRanges() throws Exception { +// final AllocatorOwner allocatorOwner = new NamedOwner("sliceRanges"); + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + // Populate a buffer with byte values corresponding to their indices. 
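+ // A freshly allocated buffer starts with readerIndex == writerIndex == 0, so
+ // nothing is readable yet and the entire capacity is writable; the assertions
+ // below verify that starting state before any bytes are written.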
+ final ArrowBuf drillBuf = rootAllocator.buffer(256); + assertEquals(256, drillBuf.capacity()); + assertEquals(0, drillBuf.readerIndex()); + assertEquals(0, drillBuf.readableBytes()); + assertEquals(0, drillBuf.writerIndex()); + assertEquals(256, drillBuf.writableBytes()); + + final ArrowBuf slice3 = (ArrowBuf) drillBuf.slice(); + assertEquals(0, slice3.readerIndex()); + assertEquals(0, slice3.readableBytes()); + assertEquals(0, slice3.writerIndex()); +// assertEquals(256, slice3.capacity()); +// assertEquals(256, slice3.writableBytes()); + + for(int i = 0; i < 256; ++i) { + drillBuf.writeByte(i); + } + assertEquals(0, drillBuf.readerIndex()); + assertEquals(256, drillBuf.readableBytes()); + assertEquals(256, drillBuf.writerIndex()); + assertEquals(0, drillBuf.writableBytes()); + + final ArrowBuf slice1 = (ArrowBuf) drillBuf.slice(); + assertEquals(0, slice1.readerIndex()); + assertEquals(256, slice1.readableBytes()); + for(int i = 0; i < 10; ++i) { + assertEquals(i, slice1.readByte()); + } + assertEquals(256 - 10, slice1.readableBytes()); + for(int i = 0; i < 256; ++i) { + assertEquals((byte) i, slice1.getByte(i)); + } + + final ArrowBuf slice2 = (ArrowBuf) drillBuf.slice(25, 25); + assertEquals(0, slice2.readerIndex()); + assertEquals(25, slice2.readableBytes()); + for(int i = 25; i < 50; ++i) { + assertEquals(i, slice2.readByte()); + } + +/* + for(int i = 256; i > 0; --i) { + slice3.writeByte(i - 1); + } + for(int i = 0; i < 256; ++i) { + assertEquals(255 - i, slice1.getByte(i)); + } +*/ + + drillBuf.release(); // all the derived buffers share this fate + } + } + + @Test + public void testAllocator_slicesOfSlices() throws Exception { +// final AllocatorOwner allocatorOwner = new NamedOwner("slicesOfSlices"); + try(final RootAllocator rootAllocator = + new RootAllocator(MAX_ALLOCATION)) { + // Populate a buffer with byte values corresponding to their indices. + final ArrowBuf drillBuf = rootAllocator.buffer(256); + for(int i = 0; i < 256; ++i) { + drillBuf.writeByte(i); + } + + // Slice it up. 
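+ // Each slice below is a zero-copy view of the parent buffer that shares the
+ // parent's reference count; the single release() of the parent at the end of
+ // this test is what frees all of the slices at once.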
+ final ArrowBuf slice0 = drillBuf.slice(0, drillBuf.capacity()); + for(int i = 0; i < 256; ++i) { + assertEquals((byte) i, drillBuf.getByte(i)); + } + + final ArrowBuf slice10 = slice0.slice(10, drillBuf.capacity() - 10); + for(int i = 10; i < 256; ++i) { + assertEquals((byte) i, slice10.getByte(i - 10)); + } + + final ArrowBuf slice20 = slice10.slice(10, drillBuf.capacity() - 20); + for(int i = 20; i < 256; ++i) { + assertEquals((byte) i, slice20.getByte(i - 20)); + } + + final ArrowBuf slice30 = slice20.slice(10, drillBuf.capacity() - 30); + for(int i = 30; i < 256; ++i) { + assertEquals((byte) i, slice30.getByte(i - 30)); + } + + drillBuf.release(); + } + } + + @Test + public void testAllocator_transferSliced() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("transferSliced1", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator2 = rootAllocator.newChildAllocator("transferSliced2", 0, MAX_ALLOCATION); + + final ArrowBuf drillBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 8); + final ArrowBuf drillBuf2 = childAllocator2.buffer(MAX_ALLOCATION / 8); + + final ArrowBuf drillBuf1s = drillBuf1.slice(0, drillBuf1.capacity() / 2); + final ArrowBuf drillBuf2s = drillBuf2.slice(0, drillBuf2.capacity() / 2); + + rootAllocator.verify(); + + TransferResult result1 = drillBuf2s.transferOwnership(childAllocator1); + rootAllocator.verify(); + TransferResult result2 = drillBuf1s.transferOwnership(childAllocator2); + rootAllocator.verify(); + + result1.buffer.release(); + result2.buffer.release(); + + drillBuf1s.release(); // releases drillBuf1 + drillBuf2s.release(); // releases drillBuf2 + + childAllocator1.close(); + childAllocator2.close(); + } + } + + @Test + public void testAllocator_shareSliced() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("transferSliced", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator2 = rootAllocator.newChildAllocator("transferSliced", 0, MAX_ALLOCATION); + + final ArrowBuf drillBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 8); + final ArrowBuf drillBuf2 = childAllocator2.buffer(MAX_ALLOCATION / 8); + + final ArrowBuf drillBuf1s = drillBuf1.slice(0, drillBuf1.capacity() / 2); + final ArrowBuf drillBuf2s = drillBuf2.slice(0, drillBuf2.capacity() / 2); + + rootAllocator.verify(); + + final ArrowBuf drillBuf2s1 = drillBuf2s.retain(childAllocator1); + final ArrowBuf drillBuf1s2 = drillBuf1s.retain(childAllocator2); + rootAllocator.verify(); + + drillBuf1s.release(); // releases drillBuf1 + drillBuf2s.release(); // releases drillBuf2 + rootAllocator.verify(); + + drillBuf2s1.release(); // releases the shared drillBuf2 slice + drillBuf1s2.release(); // releases the shared drillBuf1 slice + + childAllocator1.close(); + childAllocator2.close(); + } + } + + @Test + public void testAllocator_transferShared() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("transferShared1", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator2 = rootAllocator.newChildAllocator("transferShared2", 0, MAX_ALLOCATION); + final BufferAllocator childAllocator3 = rootAllocator.newChildAllocator("transferShared3", 0, MAX_ALLOCATION); + + final ArrowBuf drillBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 8); + + 
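+ // Two ways of moving a buffer between allocators are exercised below:
+ // retain(allocator) shares the buffer, leaving both allocators accounting
+ // for it until each reference is released, while transferOwnership(allocator)
+ // moves the accounting to the target allocator.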
boolean allocationFit; + + ArrowBuf drillBuf2 = drillBuf1.retain(childAllocator2); + rootAllocator.verify(); + assertNotNull(drillBuf2); + assertNotEquals(drillBuf2, drillBuf1); + + TransferResult result = drillBuf1.transferOwnership(childAllocator3); + allocationFit = result.allocationFit; + final ArrowBuf drillBuf3 = result.buffer; + assertTrue(allocationFit); + rootAllocator.verify(); + + // Since childAllocator3 now has childAllocator1's buffer, 1, can close + drillBuf1.release(); + childAllocator1.close(); + rootAllocator.verify(); + + drillBuf2.release(); + childAllocator2.close(); + rootAllocator.verify(); + + final BufferAllocator childAllocator4 = rootAllocator.newChildAllocator("transferShared4", 0, MAX_ALLOCATION); + TransferResult result2 = drillBuf3.transferOwnership(childAllocator4); + allocationFit = result.allocationFit; + final ArrowBuf drillBuf4 = result2.buffer; + assertTrue(allocationFit); + rootAllocator.verify(); + + drillBuf3.release(); + childAllocator3.close(); + rootAllocator.verify(); + + drillBuf4.release(); + childAllocator4.close(); + rootAllocator.verify(); + } + } + + @Test + public void testAllocator_unclaimedReservation() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + try (final BufferAllocator childAllocator1 = + rootAllocator.newChildAllocator("unclaimedReservation", 0, MAX_ALLOCATION)) { + try(final AllocationReservation reservation = childAllocator1.newReservation()) { + assertTrue(reservation.add(64)); + } + rootAllocator.verify(); + } + } + } + + @Test + public void testAllocator_claimedReservation() throws Exception { + try (final RootAllocator rootAllocator = new RootAllocator(MAX_ALLOCATION)) { + + try (final BufferAllocator childAllocator1 = rootAllocator.newChildAllocator("claimedReservation", 0, + MAX_ALLOCATION)) { + + try (final AllocationReservation reservation = childAllocator1.newReservation()) { + assertTrue(reservation.add(32)); + assertTrue(reservation.add(32)); + + final ArrowBuf drillBuf = reservation.allocateBuffer(); + assertEquals(64, drillBuf.capacity()); + rootAllocator.verify(); + + drillBuf.release(); + rootAllocator.verify(); + } + rootAllocator.verify(); + } + } + } + + @Test + public void multiple() throws Exception { + final String owner = "test"; + try (RootAllocator allocator = new RootAllocator(Long.MAX_VALUE)) { + + final int op = 100000; + + BufferAllocator frag1 = allocator.newChildAllocator(owner, 1500000, Long.MAX_VALUE); + BufferAllocator frag2 = allocator.newChildAllocator(owner, 500000, Long.MAX_VALUE); + + allocator.verify(); + + BufferAllocator allocator11 = frag1.newChildAllocator(owner, op, Long.MAX_VALUE); + ArrowBuf b11 = allocator11.buffer(1000000); + + allocator.verify(); + + BufferAllocator allocator12 = frag1.newChildAllocator(owner, op, Long.MAX_VALUE); + ArrowBuf b12 = allocator12.buffer(500000); + + allocator.verify(); + + BufferAllocator allocator21 = frag1.newChildAllocator(owner, op, Long.MAX_VALUE); + + allocator.verify(); + + BufferAllocator allocator22 = frag2.newChildAllocator(owner, op, Long.MAX_VALUE); + ArrowBuf b22 = allocator22.buffer(2000000); + + allocator.verify(); + + BufferAllocator frag3 = allocator.newChildAllocator(owner, 1000000, Long.MAX_VALUE); + + allocator.verify(); + + BufferAllocator allocator31 = frag3.newChildAllocator(owner, op, Long.MAX_VALUE); + ArrowBuf b31a = allocator31.buffer(200000); + + allocator.verify(); + + // Previously running operator completes + b22.release(); + + allocator.verify(); + + 
allocator22.close();
+
+ b31a.release();
+ allocator31.close();
+
+ b12.release();
+ allocator12.close();
+
+ allocator21.close();
+
+ b11.release();
+ allocator11.close();
+
+ frag1.close();
+ frag2.close();
+ frag3.close();
+
+ }
+ }
+}
diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java
new file mode 100644
index 00000000000..25357dc7b07
--- /dev/null
+++ b/java/memory/src/test/java/org/apache/arrow/memory/TestEndianess.java
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.memory;
+
+import static org.junit.Assert.assertEquals;
+import io.netty.buffer.ByteBuf;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.junit.Test;
+
+
+public class TestEndianess {
+
+ @Test
+ public void testLittleEndian() {
+ final BufferAllocator a = new RootAllocator(10000);
+ final ByteBuf b = a.buffer(4);
+ b.setInt(0, 35);
+ assertEquals(35, b.getByte(0));
+ assertEquals(0, b.getByte(1));
+ assertEquals(0, b.getByte(2));
+ assertEquals(0, b.getByte(3));
+ b.release();
+ a.close();
+ }
+
+}
diff --git a/java/pom.xml b/java/pom.xml
new file mode 100644
index 00000000000..8a3b192e13e
--- /dev/null
+++ b/java/pom.xml
@@ -0,0 +1,470 @@
+
+
+ 4.0.0
+
+
+ org.apache
+ apache
+ 14
+
+
+ org.apache.arrow
+ arrow-java-root
+ 0.1-SNAPSHOT
+ pom
+
+ Apache Arrow Java Root POM
+ Apache Arrow is an open source, cross-language platform for in-memory columnar data.
+ http://arrow.apache.org/ + + + ${project.basedir}/target/generated-sources + 4.11 + 1.7.6 + 18.0 + 2 + 2.7.1 + 2.7.1 + 0.9.15 + 2.3.21 + + + + scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git + scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git + https://github.com/apache/arrow + HEAD + + + + + Developer List + dev-subscribe@arrow.apache.org + dev-unsubscribe@arrow.apache.org + dev@arrow.apache.org + http://mail-archives.apache.org/mod_mbox/arrow-dev/ + + + Commits List + commits-subscribe@arrow.apache.org + commits-unsubscribe@arrow.apache.org + commits@arrow.apache.org + http://mail-archives.apache.org/mod_mbox/arrow-commits/ + + + Issues List + issues-subscribe@arrow.apache.org + issues-unsubscribe@arrow.apache.org + http://mail-archives.apache.org/mod_mbox/arrow-issues/ + + + + + + + + + Jira + https://issues.apache.org/jira/browse/arrow + + + + + + + org.apache.rat + apache-rat-plugin + + + rat-checks + validate + + check + + + + + false + + **/*.log + **/*.css + **/*.js + **/*.md + **/*.eps + **/*.json + **/*.seq + **/*.parquet + **/*.sql + **/git.properties + **/*.csv + **/*.csvh + **/*.csvh-test + **/*.tsv + **/*.txt + **/*.ssv + **/arrow-*.conf + **/.buildpath + **/*.proto + **/*.fmpp + **/target/** + **/*.iml + **/*.tdd + **/*.project + **/TAGS + **/*.checkstyle + **/.classpath + **/.settings/** + .*/** + **/*.patch + **/*.pb.cc + **/*.pb.h + **/*.linux + **/client/build/** + **/*.tbl + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + **/logging.properties + **/logback-test.xml + **/logback.out.xml + **/logback.xml + + + true + + true + true + + + org.apache.arrow + ${username} + http://arrow.apache.org/ + + + + + + + test-jar + + + true + + + + + + + + org.apache.maven.plugins + maven-resources-plugin + + UTF-8 + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.7 + 1.7 + 2048m + false + true + + + + maven-enforcer-plugin + + + validate_java_and_maven_version + verify + + enforce + + false + + + + [3.0.4,4) + + + + + + avoid_bad_dependencies + verify + + enforce + + + + + + commons-logging + javax.servlet:servlet-api + org.mortbay.jetty:servlet-api + org.mortbay.jetty:servlet-api-2.5 + log4j:log4j + + + + + + + + + pl.project13.maven + git-commit-id-plugin + 2.1.9 + + + for-jars + true + + revision + + + target/classes/git.properties + + + + for-source-tarball + + revision + + false + + ./git.properties + + + + + + dd.MM.yyyy '@' HH:mm:ss z + true + false + true + false + + false + false + 7 + -dirty + true + + + + + + + + + org.apache.rat + apache-rat-plugin + 0.11 + + + org.apache.maven.plugins + maven-resources-plugin + 2.6 + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + + maven-enforcer-plugin + 1.3.1 + + + maven-surefire-plugin + 2.17 + + -ea + ${forkCount} + true + + ${project.build.directory} + + + + + org.apache.maven.plugins + maven-release-plugin + 2.5.2 + + false + false + deploy + -Papache-release ${arguments} + + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-antrun-plugin + [1.6,) + + run + + + + + + + + + org.apache.maven.plugins + maven-enforcer-plugin + [1.2,) + + enforce + + + + + + + + + org.apache.maven.plugins + + maven-remote-resources-plugin + + [1.1,) + + process + + + + + + + + + org.apache.rat + apache-rat-plugin + [0.10,) + + check + + + + + + + + + + + + + + + + + io.netty + netty-handler + 4.0.27.Final + + + + com.google.guava + guava + ${dep.guava.version} + + + + org.slf4j + slf4j-api + ${dep.slf4j.version} + + + + org.slf4j + jul-to-slf4j + 
${dep.slf4j.version} + + + + org.slf4j + jcl-over-slf4j + ${dep.slf4j.version} + + + + org.slf4j + log4j-over-slf4j + ${dep.slf4j.version} + + + + + + com.googlecode.jmockit + jmockit + 1.3 + test + + + junit + junit + ${dep.junit.version} + test + + + + org.mockito + mockito-core + 1.9.5 + + + ch.qos.logback + logback-classic + 1.0.13 + test + + + de.huxhorn.lilith + de.huxhorn.lilith.logback.appender.multiplex-classic + 0.9.44 + test + + + + + + memory + vector + + diff --git a/java/vector/pom.xml b/java/vector/pom.xml new file mode 100644 index 00000000000..e693344221b --- /dev/null +++ b/java/vector/pom.xml @@ -0,0 +1,165 @@ + + + + 4.0.0 + + org.apache.arrow + arrow-java-root + 0.1-SNAPSHOT + + vector + vectors + + + + + org.apache.arrow + arrow-memory + ${project.version} + + + joda-time + joda-time + 2.9 + + + com.fasterxml.jackson.core + jackson-annotations + 2.7.1 + + + com.fasterxml.jackson.core + jackson-databind + 2.7.1 + + + com.carrotsearch + hppc + 0.7.1 + + + org.apache.commons + commons-lang3 + 3.4 + + + + + + + + apache + apache + https://repo.maven.apache.org/ + + true + + + false + + + + + + + + + + ${basedir}/src/main/codegen + codegen + + + + + + maven-resources-plugin + + + copy-fmpp-resources + initialize + + copy-resources + + + ${project.build.directory}/codegen + + + src/main/codegen + false + + + + + + + + org.apache.drill.tools + drill-fmpp-maven-plugin + 1.4.0 + + + generate-fmpp + generate-sources + + generate + + + src/main/codegen/config.fmpp + ${project.build.directory}/generated-sources + ${project.build.directory}/codegen/templates + + + + + + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.drill.tools + drill-fmpp-maven-plugin + [1.0,) + + generate + + + + + false + true + + + + + + + + + + + + + + + + diff --git a/java/vector/src/main/codegen/config.fmpp b/java/vector/src/main/codegen/config.fmpp new file mode 100644 index 00000000000..663677cbb5a --- /dev/null +++ b/java/vector/src/main/codegen/config.fmpp @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http:# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +data: { + # TODO: Rename to ~valueVectorModesAndTypes for clarity. + vv: tdd(../data/ValueVectorTypes.tdd), + +} +freemarkerLinks: { + includes: includes/ +} diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd new file mode 100644 index 00000000000..e747c30c5d1 --- /dev/null +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -0,0 +1,168 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http:# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +{ + modes: [ + {name: "Optional", prefix: "Nullable"}, + {name: "Required", prefix: ""}, + {name: "Repeated", prefix: "Repeated"} + ], + types: [ + { + major: "Fixed", + width: 1, + javaType: "byte", + boxedType: "Byte", + fields: [{name: "value", type: "byte"}], + minor: [ + { class: "TinyInt", valueHolder: "IntHolder" }, + { class: "UInt1", valueHolder: "UInt1Holder" } + ] + }, + { + major: "Fixed", + width: 2, + javaType: "char", + boxedType: "Character", + fields: [{name: "value", type: "char"}], + minor: [ + { class: "UInt2", valueHolder: "UInt2Holder"} + ] + }, { + major: "Fixed", + width: 2, + javaType: "short", + boxedType: "Short", + fields: [{name: "value", type: "short"}], + minor: [ + { class: "SmallInt", valueHolder: "Int2Holder"}, + ] + }, + { + major: "Fixed", + width: 4, + javaType: "int", + boxedType: "Integer", + fields: [{name: "value", type: "int"}], + minor: [ + { class: "Int", valueHolder: "IntHolder"}, + { class: "UInt4", valueHolder: "UInt4Holder" }, + { class: "Float4", javaType: "float" , boxedType: "Float", fields: [{name: "value", type: "float"}]}, + { class: "Time", javaType: "int", friendlyType: "DateTime" }, + { class: "IntervalYear", javaType: "int", friendlyType: "Period" } + { class: "Decimal9", maxPrecisionDigits: 9, friendlyType: "BigDecimal", fields: [{name:"value", type:"int"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] }, + ] + }, + { + major: "Fixed", + width: 8, + javaType: "long", + boxedType: "Long", + fields: [{name: "value", type: "long"}], + minor: [ + { class: "BigInt"}, + { class: "UInt8" }, + { class: "Float8", javaType: "double" , boxedType: "Double", fields: [{name: "value", type: "double"}], }, + { class: "Date", javaType: "long", friendlyType: "DateTime" }, + { class: "TimeStamp", javaType: "long", friendlyType: "DateTime" } + { class: "Decimal18", maxPrecisionDigits: 18, friendlyType: "BigDecimal", fields: [{name:"value", type:"long"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] }, + <#-- + { class: "Money", maxPrecisionDigits: 2, scale: 1, }, + --> + ] + }, + { + major: "Fixed", + width: 12, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + { class: "IntervalDay", millisecondsOffset: 4, friendlyType: "Period", fields: [ {name: "days", type:"int"}, {name: "milliseconds", type:"int"}] } + ] + }, + { + major: "Fixed", + width: 16, + javaType: "ArrowBuf" + boxedType: "ArrowBuf", + minor: [ + { class: "Interval", daysOffset: 4, millisecondsOffset: 8, friendlyType: "Period", fields: [ {name: "months", type: "int"}, {name: "days", type:"int"}, {name: "milliseconds", type:"int"}] } + ] + }, + { + major: "Fixed", + width: 12, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + <#-- + { class: "TimeTZ" }, + { class: "Interval" } + --> + { class: "Decimal28Dense", maxPrecisionDigits: 28, nDecimalDigits: 3, friendlyType: "BigDecimal", fields: [{name: 
"start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } + ] + }, + { + major: "Fixed", + width: 16, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + + minor: [ + { class: "Decimal38Dense", maxPrecisionDigits: 38, nDecimalDigits: 4, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } + ] + }, + { + major: "Fixed", + width: 24, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + { class: "Decimal38Sparse", maxPrecisionDigits: 38, nDecimalDigits: 6, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } + ] + }, + { + major: "Fixed", + width: 20, + javaType: "ArrowBuf", + boxedType: "ArrowBuf", + minor: [ + { class: "Decimal28Sparse", maxPrecisionDigits: 28, nDecimalDigits: 5, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } + ] + }, + { + major: "VarLen", + width: 4, + javaType: "int", + boxedType: "ArrowBuf", + fields: [{name: "start", type: "int"}, {name: "end", type: "int"}, {name: "buffer", type: "ArrowBuf"}], + minor: [ + { class: "VarBinary" , friendlyType: "byte[]" }, + { class: "VarChar" , friendlyType: "Text" }, + { class: "Var16Char" , friendlyType: "String" } + ] + }, + { + major: "Bit", + width: 1, + javaType: "int", + boxedType: "Integer", + minor: [ + { class: "Bit" , friendlyType: "Boolean", fields: [{name: "value", type: "int"}] } + ] + } + ] +} diff --git a/java/vector/src/main/codegen/includes/license.ftl b/java/vector/src/main/codegen/includes/license.ftl new file mode 100644 index 00000000000..0455fd87ddc --- /dev/null +++ b/java/vector/src/main/codegen/includes/license.ftl @@ -0,0 +1,18 @@ +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ \ No newline at end of file diff --git a/java/vector/src/main/codegen/includes/vv_imports.ftl b/java/vector/src/main/codegen/includes/vv_imports.ftl new file mode 100644 index 00000000000..2d808b1b3cb --- /dev/null +++ b/java/vector/src/main/codegen/includes/vv_imports.ftl @@ -0,0 +1,62 @@ +<#-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. 
See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + You under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +import com.google.common.collect.Lists; +import com.google.common.collect.ObjectArrays; +import com.google.common.base.Charsets; +import com.google.common.collect.ObjectArrays; + +import com.google.common.base.Preconditions; +import io.netty.buffer.*; + +import org.apache.commons.lang3.ArrayUtils; + +import org.apache.arrow.memory.*; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.*; +import org.apache.arrow.vector.*; +import org.apache.arrow.vector.holders.*; +import org.apache.arrow.vector.util.*; +import org.apache.arrow.vector.complex.*; +import org.apache.arrow.vector.complex.reader.*; +import org.apache.arrow.vector.complex.impl.*; +import org.apache.arrow.vector.complex.writer.*; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.util.JsonStringArrayList; + +import java.util.Arrays; +import java.util.Random; +import java.util.List; + +import java.io.Closeable; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.ByteBuffer; + +import java.sql.Date; +import java.sql.Time; +import java.sql.Timestamp; +import java.math.BigDecimal; +import java.math.BigInteger; + +import org.joda.time.DateTime; +import org.joda.time.Period; + + + + + + diff --git a/java/vector/src/main/codegen/templates/AbstractFieldReader.java b/java/vector/src/main/codegen/templates/AbstractFieldReader.java new file mode 100644 index 00000000000..b83dba28791 --- /dev/null +++ b/java/vector/src/main/codegen/templates/AbstractFieldReader.java @@ -0,0 +1,124 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractFieldReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +@SuppressWarnings("unused") +abstract class AbstractFieldReader extends AbstractBaseReader implements FieldReader{ + + AbstractFieldReader(){ + super(); + } + + /** + * Returns true if the current value of the reader is not null + * @return + */ + public boolean isSet() { + return true; + } + + <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", + "Character", "DateTime", "Period", "Double", "Float", + "Text", "String", "Byte", "Short", "byte[]"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + + public ${friendlyType} read${safeType}(int arrayIndex){ + fail("read${safeType}(int arrayIndex)"); + return null; + } + + public ${friendlyType} read${safeType}(){ + fail("read${safeType}()"); + return null; + } + + + + public void copyAsValue(MapWriter writer){ + fail("CopyAsValue MapWriter"); + } + public void copyAsField(String name, MapWriter writer){ + fail("CopyAsField MapWriter"); + } + + public void copyAsField(String name, ListWriter writer){ + fail("CopyAsFieldList"); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign boxedType = (minor.boxedType!type.boxedType) /> + + public void read(${name}Holder holder){ + fail("${name}"); + } + + public void read(Nullable${name}Holder holder){ + fail("${name}"); + } + + public void read(int arrayIndex, ${name}Holder holder){ + fail("Repeated${name}"); + } + + public void read(int arrayIndex, Nullable${name}Holder holder){ + fail("Repeated${name}"); + } + + public void copyAsValue(${name}Writer writer){ + fail("CopyAsValue${name}"); + } + public void copyAsField(String name, ${name}Writer writer){ + fail("CopyAsField${name}"); + } + + + public FieldReader reader(String name){ + fail("reader(String name)"); + return null; + } + + public FieldReader reader(){ + fail("reader()"); + return null; + + } + + public int size(){ + fail("size()"); + return -1; + } + + private void fail(String name){ + throw new IllegalArgumentException(String.format("You tried to read a [%s] type when you are using a field reader of type [%s].", name, this.getClass().getSimpleName())); + } + + +} + + + diff --git a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java new file mode 100644 index 00000000000..6ee9dad44e9 --- /dev/null +++ b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractFieldWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") +abstract class AbstractFieldWriter extends AbstractBaseWriter implements FieldWriter { + AbstractFieldWriter(FieldWriter parent) { + super(parent); + } + + @Override + public void start() { + throw new IllegalStateException(String.format("You tried to start when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void end() { + throw new IllegalStateException(String.format("You tried to end when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void startList() { + throw new IllegalStateException(String.format("You tried to start when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + @Override + public void endList() { + throw new IllegalStateException(String.format("You tried to end when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + @Override + public void write(${name}Holder holder) { + fail("${name}"); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + fail("${name}"); + } + + + + public void writeNull() { + fail("${name}"); + } + + /** + * This implementation returns {@code false}. + *
+ * <p>
+ * Must be overridden by map writers. + *
+ * </p>
+ */ + @Override + public boolean isEmptyMap() { + return false; + } + + @Override + public MapWriter map() { + fail("Map"); + return null; + } + + @Override + public ListWriter list() { + fail("List"); + return null; + } + + @Override + public MapWriter map(String name) { + fail("Map"); + return null; + } + + @Override + public ListWriter list(String name) { + fail("List"); + return null; + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if minor.class?starts_with("Decimal") > + public ${capName}Writer ${lowerName}(String name, int scale, int precision) { + fail("${capName}"); + return null; + } + + + @Override + public ${capName}Writer ${lowerName}(String name) { + fail("${capName}"); + return null; + } + + @Override + public ${capName}Writer ${lowerName}() { + fail("${capName}"); + return null; + } + + + + public void copyReader(FieldReader reader) { + fail("Copy FieldReader"); + } + + public void copyReaderToField(String name, FieldReader reader) { + fail("Copy FieldReader to STring"); + } + + private void fail(String name) { + throw new IllegalArgumentException(String.format("You tried to write a %s type when you are using a ValueWriter of type %s.", name, this.getClass().getSimpleName())); + } +} diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java new file mode 100644 index 00000000000..549dbf107ea --- /dev/null +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.drill.common.types.TypeProtos.MinorType; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/AbstractPromotableFieldWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * A FieldWriter which delegates calls to another FieldWriter. The delegate FieldWriter can be promoted to a new type + * when necessary. Classes that extend this class are responsible for handling promotion. + * + * This class is generated using freemarker and the ${.template_name} template. 
+ * + */ +@SuppressWarnings("unused") +abstract class AbstractPromotableFieldWriter extends AbstractFieldWriter { + AbstractPromotableFieldWriter(FieldWriter parent) { + super(parent); + } + + /** + * Retrieve the FieldWriter, promoting if it is not a FieldWriter of the specified type + * @param type + * @return + */ + abstract protected FieldWriter getWriter(MinorType type); + + /** + * Return the current FieldWriter + * @return + */ + abstract protected FieldWriter getWriter(); + + @Override + public void start() { + getWriter(MinorType.MAP).start(); + } + + @Override + public void end() { + getWriter(MinorType.MAP).end(); + } + + @Override + public void startList() { + getWriter(MinorType.LIST).startList(); + } + + @Override + public void endList() { + getWriter(MinorType.LIST).endList(); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#if !minor.class?starts_with("Decimal") > + @Override + public void write(${name}Holder holder) { + getWriter(MinorType.${name?upper_case}).write(holder); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + getWriter(MinorType.${name?upper_case}).write${minor.class}(<#list fields as field>${field.name}<#if field_has_next>, ); + } + + + + + public void writeNull() { + } + + @Override + public MapWriter map() { + return getWriter(MinorType.LIST).map(); + } + + @Override + public ListWriter list() { + return getWriter(MinorType.LIST).list(); + } + + @Override + public MapWriter map(String name) { + return getWriter(MinorType.MAP).map(name); + } + + @Override + public ListWriter list(String name) { + return getWriter(MinorType.MAP).list(name); + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if !minor.class?starts_with("Decimal") > + + @Override + public ${capName}Writer ${lowerName}(String name) { + return getWriter(MinorType.MAP).${lowerName}(name); + } + + @Override + public ${capName}Writer ${lowerName}() { + return getWriter(MinorType.LIST).${lowerName}(); + } + + + + + public void copyReader(FieldReader reader) { + getWriter().copyReader(reader); + } + + public void copyReaderToField(String name, FieldReader reader) { + getWriter().copyReaderToField(name, reader); + } +} diff --git a/java/vector/src/main/codegen/templates/BaseReader.java b/java/vector/src/main/codegen/templates/BaseReader.java new file mode 100644 index 00000000000..8f12b1da804 --- /dev/null +++ b/java/vector/src/main/codegen/templates/BaseReader.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/reader/BaseReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.reader; + +<#include "/@includes/vv_imports.ftl" /> + + + +@SuppressWarnings("unused") +public interface BaseReader extends Positionable{ + MajorType getType(); + MaterializedField getField(); + void reset(); + void read(UnionHolder holder); + void read(int index, UnionHolder holder); + void copyAsValue(UnionWriter writer); + boolean isSet(); + + public interface MapReader extends BaseReader, Iterable{ + FieldReader reader(String name); + } + + public interface RepeatedMapReader extends MapReader{ + boolean next(); + int size(); + void copyAsValue(MapWriter writer); + } + + public interface ListReader extends BaseReader{ + FieldReader reader(); + } + + public interface RepeatedListReader extends ListReader{ + boolean next(); + int size(); + void copyAsValue(ListWriter writer); + } + + public interface ScalarReader extends + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> ${name}Reader, + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> Repeated${name}Reader, + BaseReader {} + + interface ComplexReader{ + MapReader rootAsMap(); + ListReader rootAsList(); + boolean rootIsMap(); + boolean ok(); + } +} + diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java new file mode 100644 index 00000000000..299b2389bb3 --- /dev/null +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -0,0 +1,117 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/BaseWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.writer; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * File generated from ${.template_name} using FreeMarker. + */ +@SuppressWarnings("unused") + public interface BaseWriter extends AutoCloseable, Positionable { + FieldWriter getParent(); + int getValueCapacity(); + + public interface MapWriter extends BaseWriter { + + MaterializedField getField(); + + /** + * Whether this writer is a map writer and is empty (has no children). + * + *
+     * Intended only for use in determining whether to add a dummy vector to
+     * avoid an empty (zero-column) schema, as in JsonReader.
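+     * For example (illustrative): a JSON input of {} yields a map writer with
+     * no child fields, so isEmptyMap() returns true, signalling that a dummy
+     * vector is needed.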
+ * + */ + boolean isEmptyMap(); + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if minor.class?starts_with("Decimal") > + ${capName}Writer ${lowerName}(String name, int scale, int precision); + + ${capName}Writer ${lowerName}(String name); + + + void copyReaderToField(String name, FieldReader reader); + MapWriter map(String name); + ListWriter list(String name); + void start(); + void end(); + } + + public interface ListWriter extends BaseWriter { + void startList(); + void endList(); + MapWriter map(); + ListWriter list(); + void copyReader(FieldReader reader); + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + ${capName}Writer ${lowerName}(); + + } + + public interface ScalarWriter extends + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> ${name}Writer, BaseWriter {} + + public interface ComplexWriter { + void allocate(); + void clear(); + void copyReader(FieldReader reader); + MapWriter rootAsMap(); + ListWriter rootAsList(); + + void setPosition(int index); + void setValueCount(int count); + void reset(); + } + + public interface MapOrListWriter { + void start(); + void end(); + MapOrListWriter map(String name); + MapOrListWriter listoftmap(String name); + MapOrListWriter list(String name); + boolean isMapWriter(); + boolean isListWriter(); + VarCharWriter varChar(String name); + IntWriter integer(String name); + BigIntWriter bigInt(String name); + Float4Writer float4(String name); + Float8Writer float8(String name); + BitWriter bit(String name); + VarBinaryWriter binary(String name); + } +} diff --git a/java/vector/src/main/codegen/templates/BasicTypeHelper.java b/java/vector/src/main/codegen/templates/BasicTypeHelper.java new file mode 100644 index 00000000000..bb6446e8d6b --- /dev/null +++ b/java/vector/src/main/codegen/templates/BasicTypeHelper.java @@ -0,0 +1,538 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/util/BasicTypeHelper.java" /> + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.util; + +<#include "/@includes/vv_imports.ftl" /> +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.RepeatedMapVector; +import org.apache.arrow.vector.util.CallBack; + +public class BasicTypeHelper { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BasicTypeHelper.class); + + private static final int WIDTH_ESTIMATE = 50; + + // Default length when casting to varchar : 65536 = 2^16 + // This only defines an absolute maximum for values, setting + // a high value like this will not inflate the size for small values + public static final int VARCHAR_DEFAULT_CAST_LEN = 65536; + + protected static String buildErrorMessage(final String operation, final MinorType type, final DataMode mode) { + return String.format("Unable to %s for minor type [%s] and mode [%s]", operation, type, mode); + } + + protected static String buildErrorMessage(final String operation, final MajorType type) { + return buildErrorMessage(operation, type.getMinorType(), type.getMode()); + } + + public static int getSize(MajorType major) { + switch (major.getMinorType()) { +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + return ${type.width}<#if minor.class?substring(0, 3) == "Var" || + minor.class?substring(0, 3) == "PRO" || + minor.class?substring(0, 3) == "MSG"> + WIDTH_ESTIMATE; + + +// case FIXEDCHAR: return major.getWidth(); +// case FIXED16CHAR: return major.getWidth(); +// case FIXEDBINARY: return major.getWidth(); + } + throw new UnsupportedOperationException(buildErrorMessage("get size", major)); + } + + public static ValueVector getNewVector(String name, BufferAllocator allocator, MajorType type, CallBack callback){ + MaterializedField field = MaterializedField.create(name, type); + return getNewVector(field, allocator, callback); + } + + + public static Class getValueVectorClass(MinorType type, DataMode mode){ + switch (type) { + case UNION: + return UnionVector.class; + case MAP: + switch (mode) { + case OPTIONAL: + case REQUIRED: + return MapVector.class; + case REPEATED: + return RepeatedMapVector.class; + } + + case LIST: + switch (mode) { + case REPEATED: + return RepeatedListVector.class; + case REQUIRED: + case OPTIONAL: + return ListVector.class; + } + +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + switch (mode) { + case REQUIRED: + return ${minor.class}Vector.class; + case OPTIONAL: + return Nullable${minor.class}Vector.class; + case REPEATED: + return Repeated${minor.class}Vector.class; + } + + + case GENERIC_OBJECT : + return ObjectVector.class ; + default: + break; + } + throw new UnsupportedOperationException(buildErrorMessage("get value vector class", type, mode)); + } + public static Class getReaderClassName( MinorType type, DataMode mode, boolean isSingularRepeated){ + switch (type) { + case MAP: + switch (mode) { + case REQUIRED: + if (!isSingularRepeated) + return SingleMapReaderImpl.class; + else + return SingleLikeRepeatedMapReaderImpl.class; + case REPEATED: + return RepeatedMapReaderImpl.class; + } + case LIST: + switch (mode) { + case REQUIRED: + return SingleListReaderImpl.class; + case REPEATED: + return RepeatedListReaderImpl.class; + } + +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + switch (mode) { + 
case REQUIRED: + return ${minor.class}ReaderImpl.class; + case OPTIONAL: + return Nullable${minor.class}ReaderImpl.class; + case REPEATED: + return Repeated${minor.class}ReaderImpl.class; + } + + + default: + break; + } + throw new UnsupportedOperationException(buildErrorMessage("get reader class name", type, mode)); + } + + public static Class getWriterInterface( MinorType type, DataMode mode){ + switch (type) { + case UNION: return UnionWriter.class; + case MAP: return MapWriter.class; + case LIST: return ListWriter.class; +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: return ${minor.class}Writer.class; + + + default: + break; + } + throw new UnsupportedOperationException(buildErrorMessage("get writer interface", type, mode)); + } + + public static Class getWriterImpl( MinorType type, DataMode mode){ + switch (type) { + case UNION: + return UnionWriter.class; + case MAP: + switch (mode) { + case REQUIRED: + case OPTIONAL: + return SingleMapWriter.class; + case REPEATED: + return RepeatedMapWriter.class; + } + case LIST: + switch (mode) { + case REQUIRED: + case OPTIONAL: + return UnionListWriter.class; + case REPEATED: + return RepeatedListWriter.class; + } + +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + switch (mode) { + case REQUIRED: + return ${minor.class}WriterImpl.class; + case OPTIONAL: + return Nullable${minor.class}WriterImpl.class; + case REPEATED: + return Repeated${minor.class}WriterImpl.class; + } + + + default: + break; + } + throw new UnsupportedOperationException(buildErrorMessage("get writer implementation", type, mode)); + } + + public static Class getHolderReaderImpl( MinorType type, DataMode mode){ + switch (type) { +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + switch (mode) { + case REQUIRED: + return ${minor.class}HolderReaderImpl.class; + case OPTIONAL: + return Nullable${minor.class}HolderReaderImpl.class; + case REPEATED: + return Repeated${minor.class}HolderReaderImpl.class; + } + + + default: + break; + } + throw new UnsupportedOperationException(buildErrorMessage("get holder reader implementation", type, mode)); + } + + public static ValueVector getNewVector(MaterializedField field, BufferAllocator allocator){ + return getNewVector(field, allocator, null); + } + public static ValueVector getNewVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ + MajorType type = field.getType(); + + switch (type.getMinorType()) { + + case UNION: + return new UnionVector(field, allocator, callBack); + + case MAP: + switch (type.getMode()) { + case REQUIRED: + case OPTIONAL: + return new MapVector(field, allocator, callBack); + case REPEATED: + return new RepeatedMapVector(field, allocator, callBack); + } + case LIST: + switch (type.getMode()) { + case REPEATED: + return new RepeatedListVector(field, allocator, callBack); + case OPTIONAL: + case REQUIRED: + return new ListVector(field, allocator, callBack); + } +<#list vv. types as type> + <#list type.minor as minor> + case ${minor.class?upper_case}: + switch (type.getMode()) { + case REQUIRED: + return new ${minor.class}Vector(field, allocator); + case OPTIONAL: + return new Nullable${minor.class}Vector(field, allocator); + case REPEATED: + return new Repeated${minor.class}Vector(field, allocator); + } + + + case GENERIC_OBJECT: + return new ObjectVector(field, allocator) ; + default: + break; + } + // All ValueVector types have been handled. 
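+    // Illustrative (field name assumed): a REQUIRED INT field resolves to the
+    // generated INT case above and yields a new IntVector; only minor types
+    // with no generated case fall through to this exception.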
+ throw new UnsupportedOperationException(buildErrorMessage("get new vector", type)); + } + + public static ValueHolder getValue(ValueVector vector, int index) { + MajorType type = vector.getField().getType(); + ValueHolder holder; + switch(type.getMinorType()) { +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + <#if minor.class?starts_with("Var") || minor.class == "IntervalDay" || minor.class == "Interval" || + minor.class?starts_with("Decimal28") || minor.class?starts_with("Decimal38")> + switch (type.getMode()) { + case REQUIRED: + holder = new ${minor.class}Holder(); + ((${minor.class}Vector) vector).getAccessor().get(index, (${minor.class}Holder)holder); + return holder; + case OPTIONAL: + holder = new Nullable${minor.class}Holder(); + ((Nullable${minor.class}Holder)holder).isSet = ((Nullable${minor.class}Vector) vector).getAccessor().isSet(index); + if (((Nullable${minor.class}Holder)holder).isSet == 1) { + ((Nullable${minor.class}Vector) vector).getAccessor().get(index, (Nullable${minor.class}Holder)holder); + } + return holder; + } + <#else> + switch (type.getMode()) { + case REQUIRED: + holder = new ${minor.class}Holder(); + ((${minor.class}Holder)holder).value = ((${minor.class}Vector) vector).getAccessor().get(index); + return holder; + case OPTIONAL: + holder = new Nullable${minor.class}Holder(); + ((Nullable${minor.class}Holder)holder).isSet = ((Nullable${minor.class}Vector) vector).getAccessor().isSet(index); + if (((Nullable${minor.class}Holder)holder).isSet == 1) { + ((Nullable${minor.class}Holder)holder).value = ((Nullable${minor.class}Vector) vector).getAccessor().get(index); + } + return holder; + } + + + + case GENERIC_OBJECT: + holder = new ObjectHolder(); + ((ObjectHolder)holder).obj = ((ObjectVector) vector).getAccessor().getObject(index) ; + break; + } + + throw new UnsupportedOperationException(buildErrorMessage("get value", type)); + } + + public static void setValue(ValueVector vector, int index, ValueHolder holder) { + MajorType type = vector.getField().getType(); + + switch(type.getMinorType()) { +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + switch (type.getMode()) { + case REQUIRED: + ((${minor.class}Vector) vector).getMutator().setSafe(index, (${minor.class}Holder) holder); + return; + case OPTIONAL: + if (((Nullable${minor.class}Holder) holder).isSet == 1) { + ((Nullable${minor.class}Vector) vector).getMutator().setSafe(index, (Nullable${minor.class}Holder) holder); + } + return; + } + + + case GENERIC_OBJECT: + ((ObjectVector) vector).getMutator().setSafe(index, (ObjectHolder) holder); + return; + default: + throw new UnsupportedOperationException(buildErrorMessage("set value", type)); + } + } + + public static void setValueSafe(ValueVector vector, int index, ValueHolder holder) { + MajorType type = vector.getField().getType(); + + switch(type.getMinorType()) { + <#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + switch (type.getMode()) { + case REQUIRED: + ((${minor.class}Vector) vector).getMutator().setSafe(index, (${minor.class}Holder) holder); + return; + case OPTIONAL: + if (((Nullable${minor.class}Holder) holder).isSet == 1) { + ((Nullable${minor.class}Vector) vector).getMutator().setSafe(index, (Nullable${minor.class}Holder) holder); + } else { + ((Nullable${minor.class}Vector) vector).getMutator().isSafe(index); + } + return; + } + + + case GENERIC_OBJECT: + ((ObjectVector) vector).getMutator().setSafe(index, 
(ObjectHolder) holder); + default: + throw new UnsupportedOperationException(buildErrorMessage("set value safe", type)); + } + } + + public static boolean compareValues(ValueVector v1, int v1index, ValueVector v2, int v2index) { + MajorType type1 = v1.getField().getType(); + MajorType type2 = v2.getField().getType(); + + if (type1.getMinorType() != type2.getMinorType()) { + return false; + } + + switch(type1.getMinorType()) { +<#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + if ( ((${minor.class}Vector) v1).getAccessor().get(v1index) == + ((${minor.class}Vector) v2).getAccessor().get(v2index) ) + return true; + break; + + + default: + break; + } + return false; + } + + /** + * Create a ValueHolder of MajorType. + * @param type + * @return + */ + public static ValueHolder createValueHolder(MajorType type) { + switch(type.getMinorType()) { + <#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + + switch (type.getMode()) { + case REQUIRED: + return new ${minor.class}Holder(); + case OPTIONAL: + return new Nullable${minor.class}Holder(); + case REPEATED: + return new Repeated${minor.class}Holder(); + } + + + case GENERIC_OBJECT: + return new ObjectHolder(); + default: + throw new UnsupportedOperationException(buildErrorMessage("create value holder", type)); + } + } + + public static boolean isNull(ValueHolder holder) { + MajorType type = getValueHolderType(holder); + + switch(type.getMinorType()) { + <#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + + switch (type.getMode()) { + case REQUIRED: + return true; + case OPTIONAL: + return ((Nullable${minor.class}Holder) holder).isSet == 0; + case REPEATED: + return true; + } + + + default: + throw new UnsupportedOperationException(buildErrorMessage("check is null", type)); + } + } + + public static ValueHolder deNullify(ValueHolder holder) { + MajorType type = getValueHolderType(holder); + + switch(type.getMinorType()) { + <#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + + switch (type.getMode()) { + case REQUIRED: + return holder; + case OPTIONAL: + if( ((Nullable${minor.class}Holder) holder).isSet == 1) { + ${minor.class}Holder newHolder = new ${minor.class}Holder(); + + <#assign fields = minor.fields!type.fields /> + <#list fields as field> + newHolder.${field.name} = ((Nullable${minor.class}Holder) holder).${field.name}; + + + return newHolder; + } else { + throw new UnsupportedOperationException("You can not convert a null value into a non-null value!"); + } + case REPEATED: + return holder; + } + + + default: + throw new UnsupportedOperationException(buildErrorMessage("deNullify", type)); + } + } + + public static ValueHolder nullify(ValueHolder holder) { + MajorType type = getValueHolderType(holder); + + switch(type.getMinorType()) { + <#list vv.types as type> + <#list type.minor as minor> + case ${minor.class?upper_case} : + switch (type.getMode()) { + case REQUIRED: + Nullable${minor.class}Holder newHolder = new Nullable${minor.class}Holder(); + newHolder.isSet = 1; + <#assign fields = minor.fields!type.fields /> + <#list fields as field> + newHolder.${field.name} = ((${minor.class}Holder) holder).${field.name}; + + return newHolder; + case OPTIONAL: + return holder; + case REPEATED: + throw new UnsupportedOperationException("You can not convert repeated type " + type + " to nullable type!"); + } + + + default: + throw new 
UnsupportedOperationException(buildErrorMessage("nullify", type)); + } + } + + public static MajorType getValueHolderType(ValueHolder holder) { + + if (0 == 1) { + return null; + } + <#list vv.types as type> + <#list type.minor as minor> + else if (holder instanceof ${minor.class}Holder) { + return ((${minor.class}Holder) holder).TYPE; + } else if (holder instanceof Nullable${minor.class}Holder) { + return ((Nullable${minor.class}Holder) holder).TYPE; + } + + + + throw new UnsupportedOperationException("ValueHolder is not supported for 'getValueHolderType' method."); + + } + +} diff --git a/java/vector/src/main/codegen/templates/ComplexCopier.java b/java/vector/src/main/codegen/templates/ComplexCopier.java new file mode 100644 index 00000000000..3614231c834 --- /dev/null +++ b/java/vector/src/main/codegen/templates/ComplexCopier.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/ComplexCopier.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. 
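+ *
+ * Usage sketch (assumes both sides are positioned at the desired index):
+ *
+ *   FieldReader in = sourceVector.getReader();
+ *   in.setPosition(i);
+ *   ComplexCopier.copy(in, targetWriter);  // deep-copies one value tree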
+ */ +@SuppressWarnings("unused") +public class ComplexCopier { + + /** + * Do a deep copy of the value in input into output + * @param in + * @param out + */ + public static void copy(FieldReader input, FieldWriter output) { + writeValue(input, output); + } + + private static void writeValue(FieldReader reader, FieldWriter writer) { + final DataMode m = reader.getType().getMode(); + final MinorType mt = reader.getType().getMinorType(); + + switch(m){ + case OPTIONAL: + case REQUIRED: + + + switch (mt) { + + case LIST: + writer.startList(); + while (reader.next()) { + writeValue(reader.reader(), getListWriterForReader(reader.reader(), writer)); + } + writer.endList(); + break; + case MAP: + writer.start(); + if (reader.isSet()) { + for(String name : reader){ + FieldReader childReader = reader.reader(name); + if(childReader.isSet()){ + writeValue(childReader, getMapWriterForReader(childReader, writer, name)); + } + } + } + writer.end(); + break; + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + + case ${name?upper_case}: + if (reader.isSet()) { + Nullable${name}Holder ${uncappedName}Holder = new Nullable${name}Holder(); + reader.read(${uncappedName}Holder); + if (${uncappedName}Holder.isSet == 1) { + writer.write${name}(<#list fields as field>${uncappedName}Holder.${field.name}<#if field_has_next>, ); + } + } + break; + + + + } + break; + } + } + + private static FieldWriter getMapWriterForReader(FieldReader reader, MapWriter writer, String name) { + switch (reader.getType().getMinorType()) { + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return (FieldWriter) writer.<#if name == "Int">integer<#else>${uncappedName}(name); + + + case MAP: + return (FieldWriter) writer.map(name); + case LIST: + return (FieldWriter) writer.list(name); + default: + throw new UnsupportedOperationException(reader.getType().toString()); + } + } + + private static FieldWriter getListWriterForReader(FieldReader reader, ListWriter writer) { + switch (reader.getType().getMinorType()) { + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return (FieldWriter) writer.<#if name == "Int">integer<#else>${uncappedName}(); + + + case MAP: + return (FieldWriter) writer.map(); + case LIST: + return (FieldWriter) writer.list(); + default: + throw new UnsupportedOperationException(reader.getType().toString()); + } + } +} diff --git a/java/vector/src/main/codegen/templates/ComplexReaders.java b/java/vector/src/main/codegen/templates/ComplexReaders.java new file mode 100644 index 00000000000..34c65712601 --- /dev/null +++ b/java/vector/src/main/codegen/templates/ComplexReaders.java @@ -0,0 +1,183 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.Override; +import java.util.List; + +import org.apache.arrow.record.TransferPair; +import org.apache.arrow.vector.complex.IndexHolder; +import org.apache.arrow.vector.complex.writer.IntervalWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#list ["", "Repeated"] as mode> +<#assign lowerName = minor.class?uncap_first /> +<#if lowerName == "int" ><#assign lowerName = "integer" /> +<#assign name = mode + minor.class?cap_first /> +<#assign javaType = (minor.javaType!type.javaType) /> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> +<#assign safeType=friendlyType /> +<#if safeType=="byte[]"><#assign safeType="ByteArray" /> + +<#assign hasFriendly = minor.friendlyType!"no" == "no" /> + +<#list ["", "Nullable"] as nullMode> +<#if (mode == "Repeated" && nullMode == "") || mode == "" > +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${nullMode}${name}ReaderImpl.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +@SuppressWarnings("unused") +public class ${nullMode}${name}ReaderImpl extends AbstractFieldReader { + + private final ${nullMode}${name}Vector vector; + + public ${nullMode}${name}ReaderImpl(${nullMode}${name}Vector vector){ + super(); + this.vector = vector; + } + + public MajorType getType(){ + return vector.getField().getType(); + } + + public MaterializedField getField(){ + return vector.getField(); + } + + public boolean isSet(){ + <#if nullMode == "Nullable"> + return !vector.getAccessor().isNull(idx()); + <#else> + return true; + + } + + + + + <#if mode == "Repeated"> + + public void copyAsValue(${minor.class?cap_first}Writer writer){ + Repeated${minor.class?cap_first}WriterImpl impl = (Repeated${minor.class?cap_first}WriterImpl) writer; + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + public void copyAsField(String name, MapWriter writer){ + Repeated${minor.class?cap_first}WriterImpl impl = (Repeated${minor.class?cap_first}WriterImpl) writer.list(name).${lowerName}(); + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + public int size(){ + return vector.getAccessor().getInnerValueCountAt(idx()); + } + + public void read(int arrayIndex, ${minor.class?cap_first}Holder h){ + vector.getAccessor().get(idx(), arrayIndex, h); + } + public void read(int arrayIndex, Nullable${minor.class?cap_first}Holder h){ + vector.getAccessor().get(idx(), arrayIndex, h); + } + + public ${friendlyType} read${safeType}(int arrayIndex){ + return vector.getAccessor().getSingleObject(idx(), arrayIndex); + } + + + public List readObject(){ + return (List) (Object) vector.getAccessor().getObject(idx()); + } + + <#else> + + public void copyAsValue(${minor.class?cap_first}Writer writer){ + ${nullMode}${minor.class?cap_first}WriterImpl impl 
= (${nullMode}${minor.class?cap_first}WriterImpl) writer; + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + public void copyAsField(String name, MapWriter writer){ + ${nullMode}${minor.class?cap_first}WriterImpl impl = (${nullMode}${minor.class?cap_first}WriterImpl) writer.${lowerName}(name); + impl.vector.copyFromSafe(idx(), impl.idx(), vector); + } + + <#if nullMode != "Nullable"> + public void read(${minor.class?cap_first}Holder h){ + vector.getAccessor().get(idx(), h); + } + + + public void read(Nullable${minor.class?cap_first}Holder h){ + vector.getAccessor().get(idx(), h); + } + + public ${friendlyType} read${safeType}(){ + return vector.getAccessor().getObject(idx()); + } + + public void copyValue(FieldWriter w){ + + } + + public Object readObject(){ + return vector.getAccessor().getObject(idx()); + } + + + +} + + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/reader/${name}Reader.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.reader; + +<#include "/@includes/vv_imports.ftl" /> +@SuppressWarnings("unused") +public interface ${name}Reader extends BaseReader{ + + <#if mode == "Repeated"> + public int size(); + public void read(int arrayIndex, ${minor.class?cap_first}Holder h); + public void read(int arrayIndex, Nullable${minor.class?cap_first}Holder h); + public Object readObject(int arrayIndex); + public ${friendlyType} read${safeType}(int arrayIndex); + <#else> + public void read(${minor.class?cap_first}Holder h); + public void read(Nullable${minor.class?cap_first}Holder h); + public Object readObject(); + public ${friendlyType} read${safeType}(); + + public boolean isSet(); + public void copyAsValue(${minor.class}Writer writer); + public void copyAsField(String name, ${minor.class}Writer writer); + +} + + + + + + + + diff --git a/java/vector/src/main/codegen/templates/ComplexWriters.java b/java/vector/src/main/codegen/templates/ComplexWriters.java new file mode 100644 index 00000000000..8f9a6e7b971 --- /dev/null +++ b/java/vector/src/main/codegen/templates/ComplexWriters.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#list ["", "Nullable", "Repeated"] as mode> +<#assign name = mode + minor.class?cap_first /> +<#assign eName = name /> +<#assign javaType = (minor.javaType!type.javaType) /> +<#assign fields = minor.fields!type.fields /> + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${eName}WriterImpl.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using FreeMarker on the ${.template_name} template. 
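+ *
+ * Illustrative write loop (a sketch; the holder type must match the vector):
+ *
+ *   for (int i = 0; i < n; i++) {
+ *     writer.setPosition(i);
+ *     writer.write(holder);  // delegates to the mutator's setSafe/addSafe
+ *   }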
+ */ +@SuppressWarnings("unused") +public class ${eName}WriterImpl extends AbstractFieldWriter { + + private final ${name}Vector.Mutator mutator; + final ${name}Vector vector; + + public ${eName}WriterImpl(${name}Vector vector, AbstractFieldWriter parent) { + super(parent); + this.mutator = vector.getMutator(); + this.vector = vector; + } + + @Override + public MaterializedField getField() { + return vector.getField(); + } + + @Override + public int getValueCapacity() { + return vector.getValueCapacity(); + } + + @Override + public void allocate() { + vector.allocateNew(); + } + + @Override + public void close() { + vector.close(); + } + + @Override + public void clear() { + vector.clear(); + } + + @Override + protected int idx() { + return super.idx(); + } + + <#if mode == "Repeated"> + + public void write(${minor.class?cap_first}Holder h) { + mutator.addSafe(idx(), h); + vector.getMutator().setValueCount(idx()+1); + } + + public void write(Nullable${minor.class?cap_first}Holder h) { + mutator.addSafe(idx(), h); + vector.getMutator().setValueCount(idx()+1); + } + + <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + mutator.addSafe(idx(), <#list fields as field>${field.name}<#if field_has_next>, ); + vector.getMutator().setValueCount(idx()+1); + } + + + public void setPosition(int idx) { + super.setPosition(idx); + mutator.startNewValue(idx); + } + + + <#else> + + public void write(${minor.class}Holder h) { + mutator.setSafe(idx(), h); + vector.getMutator().setValueCount(idx()+1); + } + + public void write(Nullable${minor.class}Holder h) { + mutator.setSafe(idx(), h); + vector.getMutator().setValueCount(idx()+1); + } + + <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + mutator.setSafe(idx(), <#if mode == "Nullable">1, <#list fields as field>${field.name}<#if field_has_next>, ); + vector.getMutator().setValueCount(idx()+1); + } + + <#if mode == "Nullable"> + + public void writeNull() { + mutator.setNull(idx()); + vector.getMutator().setValueCount(idx()+1); + } + + + +} + +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/${eName}Writer.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.writer; + +<#include "/@includes/vv_imports.ftl" /> +@SuppressWarnings("unused") +public interface ${eName}Writer extends BaseWriter { + public void write(${minor.class}Holder h); + + <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ); + +} + + + + diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java new file mode 100644 index 00000000000..18fcac93bb6 --- /dev/null +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -0,0 +1,813 @@ +/** + * Licensed to the Apache 
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.Override; + +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + +<#if type.major == "Fixed"> +<@pp.changeOutputFile name="/org/apache/arrow/vector/${minor.class}Vector.java" /> +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * ${minor.class} implements a vector of fixed width values. Elements in the vector are accessed + * by position, starting from the logical start of the vector. Values should be pushed onto the + * vector sequentially, but may be randomly accessed. + * The width of each element is ${type.width} byte(s) + * The equivalent Java primitive is '${minor.javaType!type.javaType}' + * + * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. + */ +public final class ${minor.class}Vector extends BaseDataValueVector implements FixedWidthVector{ + private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${minor.class}Vector.class); + + private final FieldReader reader = new ${minor.class}ReaderImpl(${minor.class}Vector.this); + private final Accessor accessor = new Accessor(); + private final Mutator mutator = new Mutator(); + + private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; + private int allocationMonitor = 0; + + public ${minor.class}Vector(MaterializedField field, BufferAllocator allocator) { + super(field, allocator); + } + + @Override + public FieldReader getReader(){ + return reader; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + return valueCount * ${type.width}; + } + + @Override + public int getValueCapacity(){ + return (int) (data.capacity() *1.0 / ${type.width}); + } + + @Override + public Accessor getAccessor(){ + return accessor; + } + + @Override + public Mutator getMutator(){ + return mutator; + } + + @Override + public void setInitialCapacity(final int valueCount) { + final long size = 1L * valueCount * ${type.width}; + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + } + allocationSizeInBytes = (int)size; + } + + @Override + public void allocateNew() { + if(!allocateNewSafe()){ + throw new OutOfMemoryException("Failure while allocating buffer."); + } + } + + @Override + public boolean allocateNewSafe() { + long curAllocationSize = allocationSizeInBytes; + if (allocationMonitor > 10) { + curAllocationSize = Math.max(8, curAllocationSize / 2); + allocationMonitor = 0; + } else if (allocationMonitor < -2) { + 
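+      // note: a negative monitor means recent allocations came up short, so
+      // the next allocation is doubled (see decrementAllocationMonitor below)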
curAllocationSize = allocationSizeInBytes * 2L; + allocationMonitor = 0; + } + + try{ + allocateBytes(curAllocationSize); + } catch (RuntimeException ex) { + return false; + } + return true; + } + + /** + * Allocate a new buffer that supports setting at least the provided number of values. May actually be sized bigger + * depending on underlying buffer rounding size. Must be called prior to using the ValueVector. + * + * Note that the maximum number of values a vector can allocate is Integer.MAX_VALUE / value width. + * + * @param valueCount + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the new buffer + */ + @Override + public void allocateNew(final int valueCount) { + allocateBytes(valueCount * ${type.width}); + } + + @Override + public void reset() { + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; + allocationMonitor = 0; + zeroVector(); + super.reset(); + } + + private void allocateBytes(final long size) { + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + } + + final int curSize = (int)size; + clear(); + data = allocator.buffer(curSize); + data.readerIndex(0); + allocationSizeInBytes = curSize; + } + +/** + * Allocate new buffer with double capacity, and copy data into the new buffer. Replace vector's buffer with new buffer, and release old one + * + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the new buffer + */ + public void reAlloc() { + final long newAllocationSize = allocationSizeInBytes * 2L; + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached."); + } + + logger.debug("Reallocating vector [{}]. 
# of bytes: [{}] -> [{}]", field, allocationSizeInBytes, newAllocationSize); + final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); + newBuf.setBytes(0, data, 0, data.capacity()); + final int halfNewCapacity = newBuf.capacity() / 2; + newBuf.setZero(halfNewCapacity, halfNewCapacity); + newBuf.writerIndex(data.writerIndex()); + data.release(1); + data = newBuf; + allocationSizeInBytes = (int)newAllocationSize; + } + + /** + * {@inheritDoc} + */ + @Override + public void zeroVector() { + data.setZero(0, data.capacity()); + } + +// @Override +// public void load(SerializedField metadata, ArrowBuf buffer) { +// Preconditions.checkArgument(this.field.getPath().equals(metadata.getNamePart().getName()), "The field %s doesn't match the provided metadata %s.", this.field, metadata); +// final int actualLength = metadata.getBufferLength(); +// final int valueCount = metadata.getValueCount(); +// final int expectedLength = valueCount * ${type.width}; +// assert actualLength == expectedLength : String.format("Expected to load %d bytes but actually loaded %d bytes", expectedLength, actualLength); +// +// clear(); +// if (data != null) { +// data.release(1); +// } +// data = buffer.slice(0, actualLength); +// data.retain(1); +// data.writerIndex(actualLength); +// } + + public TransferPair getTransferPair(BufferAllocator allocator){ + return new TransferImpl(getField(), allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator){ + return new TransferImpl(getField().withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((${minor.class}Vector) to); + } + + public void transferTo(${minor.class}Vector target){ + target.clear(); + target.data = data.transferOwnership(target.allocator).buffer; + target.data.writerIndex(data.writerIndex()); + clear(); + } + + public void splitAndTransferTo(int startIndex, int length, ${minor.class}Vector target) { + final int startPoint = startIndex * ${type.width}; + final int sliceLength = length * ${type.width}; + target.clear(); + target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; + target.data.writerIndex(sliceLength); + } + + private class TransferImpl implements TransferPair{ + private ${minor.class}Vector to; + + public TransferImpl(MaterializedField field, BufferAllocator allocator){ + to = new ${minor.class}Vector(field, allocator); + } + + public TransferImpl(${minor.class}Vector to) { + this.to = to; + } + + @Override + public ${minor.class}Vector getTo(){ + return to; + } + + @Override + public void transfer(){ + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, ${minor.class}Vector.this); + } + } + + public void copyFrom(int fromIndex, int thisIndex, ${minor.class}Vector from){ + <#if (type.width > 8)> + from.data.getBytes(fromIndex * ${type.width}, data, thisIndex * ${type.width}, ${type.width}); + <#else> <#-- type.width <= 8 --> + data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}, + from.data.get${(minor.javaType!type.javaType)?cap_first}(fromIndex * ${type.width}) + ); + <#-- type.width --> + } + + public void copyFromSafe(int fromIndex, int thisIndex, ${minor.class}Vector from){ + while(thisIndex >= getValueCapacity()) { + reAlloc(); + } + 
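+    // capacity is guaranteed by the reAlloc loop above, so the unchecked
+    // copyFrom cannot write past the end of the buffer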
copyFrom(fromIndex, thisIndex, from); + } + + public void decrementAllocationMonitor() { + if (allocationMonitor > 0) { + allocationMonitor = 0; + } + --allocationMonitor; + } + + private void incrementAllocationMonitor() { + ++allocationMonitor; + } + + public final class Accessor extends BaseDataValueVector.BaseAccessor { + @Override + public int getValueCount() { + return data.writerIndex() / ${type.width}; + } + + @Override + public boolean isNull(int index){ + return false; + } + + <#if (type.width > 8)> + + public ${minor.javaType!type.javaType} get(int index) { + return data.slice(index * ${type.width}, ${type.width}); + } + + <#if (minor.class == "Interval")> + public void get(int index, ${minor.class}Holder holder){ + + final int offsetIndex = index * ${type.width}; + holder.months = data.getInt(offsetIndex); + holder.days = data.getInt(offsetIndex + ${minor.daysOffset}); + holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + } + + public void get(int index, Nullable${minor.class}Holder holder){ + final int offsetIndex = index * ${type.width}; + holder.isSet = 1; + holder.months = data.getInt(offsetIndex); + holder.days = data.getInt(offsetIndex + ${minor.daysOffset}); + holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + } + + @Override + public ${friendlyType} getObject(int index) { + final int offsetIndex = index * ${type.width}; + final int months = data.getInt(offsetIndex); + final int days = data.getInt(offsetIndex + ${minor.daysOffset}); + final int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + final Period p = new Period(); + return p.plusMonths(months).plusDays(days).plusMillis(millis); + } + + public StringBuilder getAsStringBuilder(int index) { + + final int offsetIndex = index * ${type.width}; + + int months = data.getInt(offsetIndex); + final int days = data.getInt(offsetIndex + ${minor.daysOffset}); + int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + + final int years = (months / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + months = (months % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + + final int hours = millis / (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + + final int minutes = millis / (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + + final long seconds = millis / (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + + final String yearString = (Math.abs(years) == 1) ? " year " : " years "; + final String monthString = (Math.abs(months) == 1) ? " month " : " months "; + final String dayString = (Math.abs(days) == 1) ? " day " : " days "; + + + return(new StringBuilder(). + append(years).append(yearString). + append(months).append(monthString). + append(days).append(dayString). + append(hours).append(":"). + append(minutes).append(":"). + append(seconds).append("."). 
+ append(millis)); + } + + <#elseif (minor.class == "IntervalDay")> + public void get(int index, ${minor.class}Holder holder){ + + final int offsetIndex = index * ${type.width}; + holder.days = data.getInt(offsetIndex); + holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + } + + public void get(int index, Nullable${minor.class}Holder holder){ + final int offsetIndex = index * ${type.width}; + holder.isSet = 1; + holder.days = data.getInt(offsetIndex); + holder.milliseconds = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + } + + @Override + public ${friendlyType} getObject(int index) { + final int offsetIndex = index * ${type.width}; + final int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + final int days = data.getInt(offsetIndex); + final Period p = new Period(); + return p.plusDays(days).plusMillis(millis); + } + + + public StringBuilder getAsStringBuilder(int index) { + final int offsetIndex = index * ${type.width}; + + int millis = data.getInt(offsetIndex + ${minor.millisecondsOffset}); + final int days = data.getInt(offsetIndex); + + final int hours = millis / (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.hoursToMillis); + + final int minutes = millis / (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.minutesToMillis); + + final int seconds = millis / (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + millis = millis % (org.apache.arrow.vector.util.DateUtility.secondsToMillis); + + final String dayString = (Math.abs(days) == 1) ? " day " : " days "; + + return(new StringBuilder(). + append(days).append(dayString). + append(hours).append(":"). + append(minutes).append(":"). + append(seconds).append("."). 
+ append(millis)); + } + + <#elseif (minor.class == "Decimal28Sparse") || (minor.class == "Decimal38Sparse") || (minor.class == "Decimal28Dense") || (minor.class == "Decimal38Dense")> + + public void get(int index, ${minor.class}Holder holder) { + holder.start = index * ${type.width}; + holder.buffer = data; + holder.scale = getField().getScale(); + holder.precision = getField().getPrecision(); + } + + public void get(int index, Nullable${minor.class}Holder holder) { + holder.isSet = 1; + holder.start = index * ${type.width}; + holder.buffer = data; + holder.scale = getField().getScale(); + holder.precision = getField().getPrecision(); + } + + @Override + public ${friendlyType} getObject(int index) { + <#if (minor.class == "Decimal28Sparse") || (minor.class == "Decimal38Sparse")> + // Get the BigDecimal object + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromSparse(data, index * ${type.width}, ${minor.nDecimalDigits}, getField().getScale()); + <#else> + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(data, index * ${type.width}, ${minor.nDecimalDigits}, getField().getScale(), ${minor.maxPrecisionDigits}, ${type.width}); + + } + + <#else> + public void get(int index, ${minor.class}Holder holder){ + holder.buffer = data; + holder.start = index * ${type.width}; + } + + public void get(int index, Nullable${minor.class}Holder holder){ + holder.isSet = 1; + holder.buffer = data; + holder.start = index * ${type.width}; + } + + @Override + public ${friendlyType} getObject(int index) { + return data.slice(index * ${type.width}, ${type.width}) + } + + + <#else> <#-- type.width <= 8 --> + + public ${minor.javaType!type.javaType} get(int index) { + return data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + } + + <#if type.width == 4> + public long getTwoAsLong(int index) { + return data.getLong(index * ${type.width}); + } + + + + <#if minor.class == "Date"> + @Override + public ${friendlyType} getObject(int index) { + org.joda.time.DateTime date = new org.joda.time.DateTime(get(index), org.joda.time.DateTimeZone.UTC); + date = date.withZoneRetainFields(org.joda.time.DateTimeZone.getDefault()); + return date; + } + + <#elseif minor.class == "TimeStamp"> + @Override + public ${friendlyType} getObject(int index) { + org.joda.time.DateTime date = new org.joda.time.DateTime(get(index), org.joda.time.DateTimeZone.UTC); + date = date.withZoneRetainFields(org.joda.time.DateTimeZone.getDefault()); + return date; + } + + <#elseif minor.class == "IntervalYear"> + @Override + public ${friendlyType} getObject(int index) { + + final int value = get(index); + + final int years = (value / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final int months = (value % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + final Period p = new Period(); + return p.plusYears(years).plusMonths(months); + } + + public StringBuilder getAsStringBuilder(int index) { + + int months = data.getInt(index); + + final int years = (months / org.apache.arrow.vector.util.DateUtility.yearsToMonths); + months = (months % org.apache.arrow.vector.util.DateUtility.yearsToMonths); + + final String yearString = (Math.abs(years) == 1) ? " year " : " years "; + final String monthString = (Math.abs(months) == 1) ? " month " : " months "; + + return(new StringBuilder(). + append(years).append(yearString). 
+ append(months).append(monthString)); + } + + <#elseif minor.class == "Time"> + @Override + public DateTime getObject(int index) { + + org.joda.time.DateTime time = new org.joda.time.DateTime(get(index), org.joda.time.DateTimeZone.UTC); + time = time.withZoneRetainFields(org.joda.time.DateTimeZone.getDefault()); + return time; + } + + <#elseif minor.class == "Decimal9" || minor.class == "Decimal18"> + @Override + public ${friendlyType} getObject(int index) { + + final BigInteger value = BigInteger.valueOf(((${type.boxedType})get(index)).${type.javaType}Value()); + return new BigDecimal(value, getField().getScale()); + } + + <#else> + @Override + public ${friendlyType} getObject(int index) { + return get(index); + } + public ${minor.javaType!type.javaType} getPrimitiveObject(int index) { + return get(index); + } + + + public void get(int index, ${minor.class}Holder holder){ + <#if minor.class.startsWith("Decimal")> + holder.scale = getField().getScale(); + holder.precision = getField().getPrecision(); + + + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + } + + public void get(int index, Nullable${minor.class}Holder holder){ + holder.isSet = 1; + holder.value = data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}); + } + + + <#-- type.width --> + } + + /** + * ${minor.class}.Mutator implements a mutable vector of fixed width values. Elements in the + * vector are accessed by position from the logical start of the vector. Values should be pushed + * onto the vector sequentially, but may be randomly accessed. + * The width of each element is ${type.width} byte(s) + * The equivalent Java primitive is '${minor.javaType!type.javaType}' + * + * NB: this class is automatically generated from ValueVectorTypes.tdd using FreeMarker. + */ + public final class Mutator extends BaseDataValueVector.BaseMutator { + + private Mutator(){}; + /** + * Set the element at the given index to the given value. Note that widths smaller than + * 32 bits are handled by the ArrowBuf interface. 
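+   * For a type of width ${type.width}, the element written at a given index occupies bytes
+   * [index * ${type.width}, (index + 1) * ${type.width}) of the underlying data buffer.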
+ * + * @param index position of the bit to set + * @param value value to set + */ + <#if (type.width > 8)> + public void set(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { + data.setBytes(index * ${type.width}, value, 0, ${type.width}); + } + + public void setSafe(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { + while(index >= getValueCapacity()) { + reAlloc(); + } + data.setBytes(index * ${type.width}, value, 0, ${type.width}); + } + + <#if (minor.class == "Interval")> + public void set(int index, int months, int days, int milliseconds){ + final int offsetIndex = index * ${type.width}; + data.setInt(offsetIndex, months); + data.setInt((offsetIndex + ${minor.daysOffset}), days); + data.setInt((offsetIndex + ${minor.millisecondsOffset}), milliseconds); + } + + protected void set(int index, ${minor.class}Holder holder){ + set(index, holder.months, holder.days, holder.milliseconds); + } + + protected void set(int index, Nullable${minor.class}Holder holder){ + set(index, holder.months, holder.days, holder.milliseconds); + } + + public void setSafe(int index, int months, int days, int milliseconds){ + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, months, days, milliseconds); + } + + public void setSafe(int index, Nullable${minor.class}Holder holder){ + setSafe(index, holder.months, holder.days, holder.milliseconds); + } + + public void setSafe(int index, ${minor.class}Holder holder){ + setSafe(index, holder.months, holder.days, holder.milliseconds); + } + + <#elseif (minor.class == "IntervalDay")> + public void set(int index, int days, int milliseconds){ + final int offsetIndex = index * ${type.width}; + data.setInt(offsetIndex, days); + data.setInt((offsetIndex + ${minor.millisecondsOffset}), milliseconds); + } + + protected void set(int index, ${minor.class}Holder holder){ + set(index, holder.days, holder.milliseconds); + } + protected void set(int index, Nullable${minor.class}Holder holder){ + set(index, holder.days, holder.milliseconds); + } + + public void setSafe(int index, int days, int milliseconds){ + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, days, milliseconds); + } + + public void setSafe(int index, ${minor.class}Holder holder){ + setSafe(index, holder.days, holder.milliseconds); + } + + public void setSafe(int index, Nullable${minor.class}Holder holder){ + setSafe(index, holder.days, holder.milliseconds); + } + + <#elseif (minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse") || (minor.class == "Decimal28Dense") || (minor.class == "Decimal38Dense")> + + public void set(int index, ${minor.class}Holder holder){ + set(index, holder.start, holder.buffer); + } + + void set(int index, Nullable${minor.class}Holder holder){ + set(index, holder.start, holder.buffer); + } + + public void setSafe(int index, Nullable${minor.class}Holder holder){ + setSafe(index, holder.start, holder.buffer); + } + public void setSafe(int index, ${minor.class}Holder holder){ + setSafe(index, holder.start, holder.buffer); + } + + public void setSafe(int index, int start, ArrowBuf buffer){ + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, start, buffer); + } + + public void set(int index, int start, ArrowBuf buffer){ + data.setBytes(index * ${type.width}, buffer, start, ${type.width}); + } + + <#else> + + protected void set(int index, ${minor.class}Holder holder){ + set(index, holder.start, holder.buffer); + } + + public void set(int index, 
Nullable${minor.class}Holder holder){
+    set(index, holder.start, holder.buffer);
+  }
+
+  public void set(int index, int start, ArrowBuf buffer){
+    data.setBytes(index * ${type.width}, buffer, start, ${type.width});
+  }
+
+  public void setSafe(int index, ${minor.class}Holder holder){
+    setSafe(index, holder.start, holder.buffer);
+  }
+  public void setSafe(int index, Nullable${minor.class}Holder holder){
+    setSafe(index, holder.start, holder.buffer);
+  }
+
+  public void setSafe(int index, int start, ArrowBuf buffer){
+    while(index >= getValueCapacity()) {
+      reAlloc();
+    }
+    set(index, start, buffer);
+  }
+
+  @Override
+  public void generateTestData(int count) {
+    setValueCount(count);
+    boolean even = true;
+    final int valueCount = getAccessor().getValueCount();
+    for(int i = 0; i < valueCount; i++, even = !even) {
+      final byte b = even ? Byte.MIN_VALUE : Byte.MAX_VALUE;
+      for(int w = 0; w < ${type.width}; w++){
+        data.setByte(i * ${type.width} + w, b);
+      }
+    }
+  }
+
+  </#if>
+  <#else> <#-- type.width <= 8 -->
+  public void set(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int</#if> value) {
+    data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, value);
+  }
+
+  public void setSafe(int index, <#if (type.width >= 4)>${minor.javaType!type.javaType}<#else>int</#if> value) {
+    while(index >= getValueCapacity()) {
+      reAlloc();
+    }
+    set(index, value);
+  }
+
+  protected void set(int index, ${minor.class}Holder holder){
+    data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value);
+  }
+
+  public void setSafe(int index, ${minor.class}Holder holder){
+    while(index >= getValueCapacity()) {
+      reAlloc();
+    }
+    set(index, holder);
+  }
+
+  protected void set(int index, Nullable${minor.class}Holder holder){
+    data.set${(minor.javaType!type.javaType)?cap_first}(index * ${type.width}, holder.value);
+  }
+
+  public void setSafe(int index, Nullable${minor.class}Holder holder){
+    while(index >= getValueCapacity()) {
+      reAlloc();
+    }
+    set(index, holder);
+  }
+
+  @Override
+  public void generateTestData(int size) {
+    setValueCount(size);
+    boolean even = true;
+    final int valueCount = getAccessor().getValueCount();
+    for(int i = 0; i < valueCount; i++, even = !even) {
+      if(even){
+        set(i, ${minor.boxedType!type.boxedType}.MIN_VALUE);
+      }else{
+        set(i, ${minor.boxedType!type.boxedType}.MAX_VALUE);
+      }
+    }
+  }
+
+  public void generateTestDataAlt(int size) {
+    setValueCount(size);
+    boolean even = true;
+    final int valueCount = getAccessor().getValueCount();
+    for(int i = 0; i < valueCount; i++, even = !even) {
+      if(even){
+        set(i, (${(minor.javaType!type.javaType)}) 1);
+      }else{
+        set(i, (${(minor.javaType!type.javaType)}) 0);
+      }
+    }
+  }
+
+  </#if> <#-- type.width -->
+
+  @Override
+  public void setValueCount(int valueCount) {
+    final int currentValueCapacity = getValueCapacity();
+    final int idx = (${type.width} * valueCount);
+    while(valueCount > getValueCapacity()) {
+      reAlloc();
+    }
+    if (valueCount > 0 && currentValueCapacity > valueCount * 2) {
+      incrementAllocationMonitor();
+    } else if (allocationMonitor > 0) {
+      allocationMonitor = 0;
+    }
+    VectorTrimmer.trim(data, idx);
+    data.writerIndex(valueCount * ${type.width});
+  }
+ }
+}
+
+</#if> <#-- type.major -->
+</#list>
+</#list>
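For orientation, here is a minimal sketch of how code generated from the template above is typically driven. It is illustrative only: it assumes the template instantiated for the 4-byte Int type (yielding an IntVector with its nested Accessor and Mutator), and a MaterializedField plus BufferAllocator created elsewhere.

    // Hypothetical usage of a generated fixed-width vector; `field` and
    // `allocator` are assumed to exist already.
    IntVector vector = new IntVector(field, allocator);
    vector.allocateNew(1024);                    // room for 1024 four-byte values

    IntVector.Mutator mutator = vector.getMutator();
    mutator.setSafe(0, 42);                      // reAlloc()s if index >= capacity
    mutator.setValueCount(1);                    // trims buffer, sets writer index

    int v = vector.getAccessor().get(0);         // reads bytes [0, 4) of the buffer
    vector.close();                              // releases the backing ArrowBuf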
diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java
new file mode 100644
index 00000000000..3005fca0385
--- /dev/null
+++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java
@@ -0,0 +1,290 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+<@pp.dropOutputFile />
+<#list vv.types as type>
+<#list type.minor as minor>
+<#list ["", "Nullable", "Repeated"] as holderMode>
+<#assign nullMode = holderMode />
+<#if holderMode == "Repeated"><#assign nullMode = "Nullable" /></#if>
+
+<#assign lowerName = minor.class?uncap_first />
+<#if lowerName == "int" ><#assign lowerName = "integer" /></#if>
+<#assign name = minor.class?cap_first />
+<#assign javaType = (minor.javaType!type.javaType) />
+<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) />
+<#assign safeType=friendlyType />
+<#if safeType=="byte[]"><#assign safeType="ByteArray" /></#if>
+<#assign fields = minor.fields!type.fields />
+
+<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${holderMode}${name}HolderReaderImpl.java" />
+<#include "/@includes/license.ftl" />
+
+package org.apache.arrow.vector.complex.impl;
+
+<#include "/@includes/vv_imports.ftl" />
+
+import java.math.BigDecimal;
+import java.math.BigInteger;
+
+import org.joda.time.Period;
+
+// Source code generated using FreeMarker template ${.template_name}
+
+@SuppressWarnings("unused")
+public class ${holderMode}${name}HolderReaderImpl extends AbstractFieldReader {
+
+  private ${nullMode}${name}Holder holder;
+<#if holderMode == "Repeated" >
+  private int index = -1;
+  private ${holderMode}${name}Holder repeatedHolder;
+</#if>
+
+  public ${holderMode}${name}HolderReaderImpl(${holderMode}${name}Holder holder) {
+<#if holderMode == "Repeated" >
+    this.holder = new ${nullMode}${name}Holder();
+    this.repeatedHolder = holder;
+<#else>
+    this.holder = holder;
+</#if>
+  }
+
+  @Override
+  public int size() {
+<#if holderMode == "Repeated">
+    return repeatedHolder.end - repeatedHolder.start;
+<#else>
+    throw new UnsupportedOperationException("You can't call size on a Holder value reader.");
+</#if>
+  }
+
+  @Override
+  public boolean next() {
+<#if holderMode == "Repeated">
+    if(index + 1 < size()) {
+      index++;
+      repeatedHolder.vector.getAccessor().get(repeatedHolder.start + index, holder);
+      return true;
+    } else {
+      return false;
+    }
+<#else>
+    throw new UnsupportedOperationException("You can't call next on a single value reader.");
+</#if>
+  }
+
+  @Override
+  public void setPosition(int index) {
+    throw new UnsupportedOperationException("You can't call setPosition on a Holder value reader.");
+  }
+
+  @Override
+  public MajorType getType() {
+<#if holderMode == "Repeated">
+    return this.repeatedHolder.TYPE;
+<#else>
+    return this.holder.TYPE;
+</#if>
+  }
+
+  @Override
+  public boolean isSet() {
+    <#if holderMode == "Repeated">
+    return this.repeatedHolder.end!=this.repeatedHolder.start;
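+    // Note: an empty repeated span (start == end) therefore reports itself as not set.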
<#elseif nullMode == "Nullable"> + return this.holder.isSet == 1; + <#else> + return true; + + + } + +<#if holderMode != "Repeated"> +@Override + public void read(${name}Holder h) { + <#list fields as field> + h.${field.name} = holder.${field.name}; + + } + + @Override + public void read(Nullable${name}Holder h) { + <#list fields as field> + h.${field.name} = holder.${field.name}; + + h.isSet = isSet() ? 1 : 0; + } + + +<#if holderMode == "Repeated"> + @Override + public ${friendlyType} read${safeType}(int index){ + repeatedHolder.vector.getAccessor().get(repeatedHolder.start + index, holder); + ${friendlyType} value = read${safeType}(); + if (this.index > -1) { + repeatedHolder.vector.getAccessor().get(repeatedHolder.start + this.index, holder); + } + return value; + } + + + @Override + public ${friendlyType} read${safeType}(){ +<#if nullMode == "Nullable"> + if (!isSet()) { + return null; + } + + +<#if type.major == "VarLen"> + + int length = holder.end - holder.start; + byte[] value = new byte [length]; + holder.buffer.getBytes(holder.start, value, 0, length); + +<#if minor.class == "VarBinary"> + return value; +<#elseif minor.class == "Var16Char"> + return new String(value); +<#elseif minor.class == "VarChar"> + Text text = new Text(); + text.set(value); + return text; + + +<#elseif minor.class == "Interval"> + Period p = new Period(); + return p.plusMonths(holder.months).plusDays(holder.days).plusMillis(holder.milliseconds); + +<#elseif minor.class == "IntervalDay"> + Period p = new Period(); + return p.plusDays(holder.days).plusMillis(holder.milliseconds); + +<#elseif minor.class == "Decimal9" || + minor.class == "Decimal18" > + BigInteger value = BigInteger.valueOf(holder.value); + return new BigDecimal(value, holder.scale); + +<#elseif minor.class == "Decimal28Dense" || + minor.class == "Decimal38Dense"> + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(holder.buffer, + holder.start, + holder.nDecimalDigits, + holder.scale, + holder.maxPrecision, + holder.WIDTH); + +<#elseif minor.class == "Decimal28Sparse" || + minor.class == "Decimal38Sparse"> + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromSparse(holder.buffer, + holder.start, + holder.nDecimalDigits, + holder.scale); + +<#elseif minor.class == "Bit" > + return new Boolean(holder.value != 0); +<#else> + ${friendlyType} value = new ${friendlyType}(this.holder.value); + return value; + + + } + + @Override + public Object readObject() { +<#if holderMode == "Repeated" > + List valList = Lists.newArrayList(); + for (int i = repeatedHolder.start; i < repeatedHolder.end; i++) { + valList.add(repeatedHolder.vector.getAccessor().getObject(i)); + } + return valList; +<#else> + return readSingleObject(); + + } + + private Object readSingleObject() { +<#if nullMode == "Nullable"> + if (!isSet()) { + return null; + } + + +<#if type.major == "VarLen"> + int length = holder.end - holder.start; + byte[] value = new byte [length]; + holder.buffer.getBytes(holder.start, value, 0, length); + +<#if minor.class == "VarBinary"> + return value; +<#elseif minor.class == "Var16Char"> + return new String(value); +<#elseif minor.class == "VarChar"> + Text text = new Text(); + text.set(value); + return text; + + +<#elseif minor.class == "Interval"> + Period p = new Period(); + return p.plusMonths(holder.months).plusDays(holder.days).plusMillis(holder.milliseconds); + +<#elseif minor.class == "IntervalDay"> + Period p = new Period(); + return p.plusDays(holder.days).plusMillis(holder.milliseconds); + +<#elseif 
minor.class == "Decimal9" || + minor.class == "Decimal18" > + BigInteger value = BigInteger.valueOf(holder.value); + return new BigDecimal(value, holder.scale); + +<#elseif minor.class == "Decimal28Dense" || + minor.class == "Decimal38Dense"> + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(holder.buffer, + holder.start, + holder.nDecimalDigits, + holder.scale, + holder.maxPrecision, + holder.WIDTH); + +<#elseif minor.class == "Decimal28Sparse" || + minor.class == "Decimal38Sparse"> + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromSparse(holder.buffer, + holder.start, + holder.nDecimalDigits, + holder.scale); + +<#elseif minor.class == "Bit" > + return new Boolean(holder.value != 0); +<#else> + ${friendlyType} value = new ${friendlyType}(this.holder.value); + return value; + + } + +<#if holderMode != "Repeated" && nullMode != "Nullable"> + public void copyAsValue(${minor.class?cap_first}Writer writer){ + writer.write(holder); + } + +} + + + + diff --git a/java/vector/src/main/codegen/templates/ListWriters.java b/java/vector/src/main/codegen/templates/ListWriters.java new file mode 100644 index 00000000000..cf9fa30fa47 --- /dev/null +++ b/java/vector/src/main/codegen/templates/ListWriters.java @@ -0,0 +1,234 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> + +<#list ["Single", "Repeated"] as mode> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}ListWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; +<#if mode == "Single"> + <#assign containerClass = "AbstractContainerVector" /> + <#assign index = "idx()"> +<#else> + <#assign containerClass = "RepeatedListVector" /> + <#assign index = "currentChildIndex"> + + + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using FreeMarker and the ${.template_name} template. 
+ */ +@SuppressWarnings("unused") +public class ${mode}ListWriter extends AbstractFieldWriter { + private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${mode}ListWriter.class); + + static enum Mode { INIT, IN_MAP, IN_LIST <#list vv.types as type><#list type.minor as minor>, IN_${minor.class?upper_case} } + + private final String name; + protected final ${containerClass} container; + private Mode mode = Mode.INIT; + private FieldWriter writer; + protected RepeatedValueVector innerVector; + + <#if mode == "Repeated">private int currentChildIndex = 0; + public ${mode}ListWriter(String name, ${containerClass} container, FieldWriter parent){ + super(parent); + this.name = name; + this.container = container; + } + + public ${mode}ListWriter(${containerClass} container, FieldWriter parent){ + super(parent); + this.name = null; + this.container = container; + } + + @Override + public void allocate() { + if(writer != null) { + writer.allocate(); + } + + <#if mode == "Repeated"> + container.allocateNew(); + + } + + @Override + public void clear() { + if (writer != null) { + writer.clear(); + } + } + + @Override + public void close() { + clear(); + container.close(); + if (innerVector != null) { + innerVector.close(); + } + } + + @Override + public int getValueCapacity() { + return innerVector == null ? 0 : innerVector.getValueCapacity(); + } + + public void setValueCount(int count){ + if(innerVector != null) innerVector.getMutator().setValueCount(count); + } + + @Override + public MapWriter map() { + switch(mode) { + case INIT: + int vectorCount = container.size(); + final RepeatedMapVector vector = container.addOrGet(name, RepeatedMapVector.TYPE, RepeatedMapVector.class); + innerVector = vector; + writer = new RepeatedMapWriter(vector, this); + if(vectorCount != container.size()) { + writer.allocate(); + } + writer.setPosition(${index}); + mode = Mode.IN_MAP; + return writer; + case IN_MAP: + return writer; + } + + throw new RuntimeException(getUnsupportedErrorMsg("MAP", mode.name())); + + } + + @Override + public ListWriter list() { + switch(mode) { + case INIT: + final int vectorCount = container.size(); + final RepeatedListVector vector = container.addOrGet(name, RepeatedListVector.TYPE, RepeatedListVector.class); + innerVector = vector; + writer = new RepeatedListWriter(null, vector, this); + if(vectorCount != container.size()) { + writer.allocate(); + } + writer.setPosition(${index}); + mode = Mode.IN_LIST; + return writer; + case IN_LIST: + return writer; + } + + throw new RuntimeException(getUnsupportedErrorMsg("LIST", mode.name())); + + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + + private static final MajorType ${upperName}_TYPE = Types.repeated(MinorType.${upperName}); + + @Override + public ${capName}Writer ${lowerName}() { + switch(mode) { + case INIT: + final int vectorCount = container.size(); + final Repeated${capName}Vector vector = container.addOrGet(name, ${upperName}_TYPE, Repeated${capName}Vector.class); + innerVector = vector; + writer = new Repeated${capName}WriterImpl(vector, this); + if(vectorCount != container.size()) { + writer.allocate(); + } + writer.setPosition(${index}); + mode = Mode.IN_${upperName}; + return writer; + case IN_${upperName}: + return writer; + } + + throw new 
RuntimeException(getUnsupportedErrorMsg("${upperName}", mode.name())); + + } + + + public MaterializedField getField() { + return container.getField(); + } + + <#if mode == "Repeated"> + + public void startList() { + final RepeatedListVector list = (RepeatedListVector) container; + final RepeatedListVector.RepeatedMutator mutator = list.getMutator(); + + // make sure that the current vector can support the end position of this list. + if(container.getValueCapacity() <= idx()) { + mutator.setValueCount(idx()+1); + } + + // update the repeated vector to state that there is current+1 objects. + final RepeatedListHolder h = new RepeatedListHolder(); + list.getAccessor().get(idx(), h); + if (h.start >= h.end) { + mutator.startNewValue(idx()); + } + currentChildIndex = container.getMutator().add(idx()); + if(writer != null) { + writer.setPosition(currentChildIndex); + } + } + + public void endList() { + // noop, we initialize state at start rather than end. + } + <#else> + + public void setPosition(int index) { + super.setPosition(index); + if(writer != null) { + writer.setPosition(index); + } + } + + public void startList() { + // noop + } + + public void endList() { + // noop + } + + + private String getUnsupportedErrorMsg(String expected, String found) { + final String f = found.substring(3); + return String.format("In a list of type %s, encountered a value of type %s. "+ + "Drill does not support lists of different types.", + f, expected + ); + } +} + diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java new file mode 100644 index 00000000000..7001367bb37 --- /dev/null +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -0,0 +1,240 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<#list ["Single", "Repeated"] as mode> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}MapWriter.java" /> +<#if mode == "Single"> +<#assign containerClass = "MapVector" /> +<#assign index = "idx()"> +<#else> +<#assign containerClass = "RepeatedMapVector" /> +<#assign index = "currentChildIndex"> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> +import java.util.Map; + +import org.apache.arrow.vector.holders.RepeatedMapHolder; +import org.apache.arrow.vector.AllocationHelper; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; + +import com.google.common.collect.Maps; + +/* + * This class is generated using FreeMarker and the ${.template_name} template. 
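+ * In "Single" mode the writer wraps a MapVector and positions child writers at idx(); in
+ * "Repeated" mode it wraps a RepeatedMapVector and positions them at currentChildIndex.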
+ */ +@SuppressWarnings("unused") +public class ${mode}MapWriter extends AbstractFieldWriter { + + protected final ${containerClass} container; + private final Map fields = Maps.newHashMap(); + <#if mode == "Repeated">private int currentChildIndex = 0; + + private final boolean unionEnabled; + + public ${mode}MapWriter(${containerClass} container, FieldWriter parent, boolean unionEnabled) { + super(parent); + this.container = container; + this.unionEnabled = unionEnabled; + } + + public ${mode}MapWriter(${containerClass} container, FieldWriter parent) { + this(container, parent, false); + } + + @Override + public int getValueCapacity() { + return container.getValueCapacity(); + } + + @Override + public boolean isEmptyMap() { + return 0 == container.size(); + } + + @Override + public MaterializedField getField() { + return container.getField(); + } + + @Override + public MapWriter map(String name) { + FieldWriter writer = fields.get(name.toLowerCase()); + if(writer == null){ + int vectorCount=container.size(); + MapVector vector = container.addOrGet(name, MapVector.TYPE, MapVector.class); + if(!unionEnabled){ + writer = new SingleMapWriter(vector, this); + } else { + writer = new PromotableWriter(vector, container); + } + if(vectorCount != container.size()) { + writer.allocate(); + } + writer.setPosition(${index}); + fields.put(name.toLowerCase(), writer); + } + return writer; + } + + @Override + public void close() throws Exception { + clear(); + container.close(); + } + + @Override + public void allocate() { + container.allocateNew(); + for(final FieldWriter w : fields.values()) { + w.allocate(); + } + } + + @Override + public void clear() { + container.clear(); + for(final FieldWriter w : fields.values()) { + w.clear(); + } + } + + @Override + public ListWriter list(String name) { + FieldWriter writer = fields.get(name.toLowerCase()); + int vectorCount = container.size(); + if(writer == null) { + if (!unionEnabled){ + writer = new SingleListWriter(name,container,this); + } else{ + writer = new PromotableWriter(container.addOrGet(name, Types.optional(MinorType.LIST), ListVector.class), container); + } + if (container.size() > vectorCount) { + writer.allocate(); + } + writer.setPosition(${index}); + fields.put(name.toLowerCase(), writer); + } + return writer; + } + + <#if mode == "Repeated"> + public void start() { + // update the repeated vector to state that there is current+1 objects. + final RepeatedMapHolder h = new RepeatedMapHolder(); + final RepeatedMapVector map = (RepeatedMapVector) container; + final RepeatedMapVector.Mutator mutator = map.getMutator(); + + // Make sure that the current vector can support the end position of this list. 
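+      // idx() is this record's position in the container; growing the value count first keeps
+      // the offsets addressable before add() reserves a slot for the new child entry.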
+ if(container.getValueCapacity() <= idx()) { + mutator.setValueCount(idx()+1); + } + + map.getAccessor().get(idx(), h); + if (h.start >= h.end) { + container.getMutator().startNewValue(idx()); + } + currentChildIndex = container.getMutator().add(idx()); + for(final FieldWriter w : fields.values()) { + w.setPosition(currentChildIndex); + } + } + + + public void end() { + // noop + } + <#else> + + public void setValueCount(int count) { + container.getMutator().setValueCount(count); + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + for(final FieldWriter w: fields.values()) { + w.setPosition(index); + } + } + + @Override + public void start() { + } + + @Override + public void end() { + } + + + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#assign vectName = capName /> + <#assign vectName = "Nullable${capName}" /> + + <#if minor.class?starts_with("Decimal") > + public ${minor.class}Writer ${lowerName}(String name) { + // returns existing writer + final FieldWriter writer = fields.get(name.toLowerCase()); + assert writer != null; + return writer; + } + + public ${minor.class}Writer ${lowerName}(String name, int scale, int precision) { + final MajorType ${upperName}_TYPE = new MajorType(MinorType.${upperName}, DataMode.OPTIONAL, scale, precision, null, null); + <#else> + private static final MajorType ${upperName}_TYPE = Types.optional(MinorType.${upperName}); + @Override + public ${minor.class}Writer ${lowerName}(String name) { + + FieldWriter writer = fields.get(name.toLowerCase()); + if(writer == null) { + ValueVector vector; + ValueVector currentVector = container.getChild(name); + if (unionEnabled){ + ${vectName}Vector v = container.addOrGet(name, ${upperName}_TYPE, ${vectName}Vector.class); + writer = new PromotableWriter(v, container); + vector = v; + } else { + ${vectName}Vector v = container.addOrGet(name, ${upperName}_TYPE, ${vectName}Vector.class); + writer = new ${vectName}WriterImpl(v, this); + vector = v; + } + if (currentVector == null || currentVector != vector) { + vector.allocateNewSafe(); + } + writer.setPosition(${index}); + fields.put(name.toLowerCase(), writer); + } + return writer; + } + + + +} + diff --git a/java/vector/src/main/codegen/templates/NullReader.java b/java/vector/src/main/codegen/templates/NullReader.java new file mode 100644 index 00000000000..3ef6c7dcc49 --- /dev/null +++ b/java/vector/src/main/codegen/templates/NullReader.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/NullReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + + +@SuppressWarnings("unused") +public class NullReader extends AbstractBaseReader implements FieldReader{ + + public static final NullReader INSTANCE = new NullReader(); + public static final NullReader EMPTY_LIST_INSTANCE = new NullReader(Types.repeated(MinorType.NULL)); + public static final NullReader EMPTY_MAP_INSTANCE = new NullReader(Types.required(MinorType.MAP)); + private MajorType type; + + private NullReader(){ + super(); + type = Types.required(MinorType.NULL); + } + + private NullReader(MajorType type){ + super(); + this.type = type; + } + + @Override + public MajorType getType() { + return type; + } + + public void copyAsValue(MapWriter writer) {} + + public void copyAsValue(ListWriter writer) {} + + public void copyAsValue(UnionWriter writer) {} + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + public void read(${name}Holder holder){ + throw new UnsupportedOperationException("NullReader cannot write into non-nullable holder"); + } + + public void read(Nullable${name}Holder holder){ + holder.isSet = 0; + } + + public void read(int arrayIndex, ${name}Holder holder){ + throw new ArrayIndexOutOfBoundsException(); + } + + public void copyAsValue(${minor.class}Writer writer){} + public void copyAsField(String name, ${minor.class}Writer writer){} + + public void read(int arrayIndex, Nullable${name}Holder holder){ + throw new ArrayIndexOutOfBoundsException(); + } + + + public int size(){ + return 0; + } + + public boolean isSet(){ + return false; + } + + public boolean next(){ + return false; + } + + public RepeatedMapReader map(){ + return this; + } + + public RepeatedListReader list(){ + return this; + } + + public MapReader map(String name){ + return this; + } + + public ListReader list(String name){ + return this; + } + + public FieldReader reader(String name){ + return this; + } + + public FieldReader reader(){ + return this; + } + + private void fail(String name){ + throw new IllegalArgumentException(String.format("You tried to read a %s type when you are using a ValueReader of type %s.", name, this.getClass().getSimpleName())); + } + + <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", + "Character", "DateTime", "Period", "Double", "Float", + "Text", "String", "Byte", "Short", "byte[]"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + + public ${friendlyType} read${safeType}(int arrayIndex){ + return null; + } + + public ${friendlyType} read${safeType}(){ + return null; + } + + +} + + + diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java new file mode 100644 index 00000000000..6893a25efbe --- /dev/null +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -0,0 +1,630 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +<@pp.dropOutputFile /> +<#list vv.types as type> +<#list type.minor as minor> + +<#assign className = "Nullable${minor.class}Vector" /> +<#assign valuesName = "${minor.class}Vector" /> +<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + +<@pp.changeOutputFile name="/org/apache/arrow/vector/${className}.java" /> + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector; + +<#include "/@includes/vv_imports.ftl" /> + +/** + * Nullable${minor.class} implements a vector of values which could be null. Elements in the vector + * are first checked against a fixed length vector of boolean values. Then the element is retrieved + * from the base class (if not null). + * + * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. + */ +@SuppressWarnings("unused") +public final class ${className} extends BaseDataValueVector implements <#if type.major == "VarLen">VariableWidth<#else>FixedWidthVector, NullableVector{ + private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); + + private final FieldReader reader = new Nullable${minor.class}ReaderImpl(Nullable${minor.class}Vector.this); + + private final MaterializedField bitsField = MaterializedField.create("$bits$", new MajorType(MinorType.UINT1, DataMode.REQUIRED)); + private final UInt1Vector bits = new UInt1Vector(bitsField, allocator); + private final ${valuesName} values = new ${minor.class}Vector(field, allocator); + + private final Mutator mutator = new Mutator(); + private final Accessor accessor = new Accessor(); + + public ${className}(MaterializedField field, BufferAllocator allocator) { + super(field, allocator); + } + + @Override + public FieldReader getReader(){ + return reader; + } + + @Override + public int getValueCapacity(){ + return Math.min(bits.getValueCapacity(), values.getValueCapacity()); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers = ObjectArrays.concat(bits.getBuffers(false), values.getBuffers(false), ArrowBuf.class); + if (clear) { + for (final ArrowBuf buffer:buffers) { + buffer.retain(1); + } + clear(); + } + return buffers; + } + + @Override + public void close() { + bits.close(); + values.close(); + super.close(); + } + + @Override + public void clear() { + bits.clear(); + values.clear(); + super.clear(); + } + + @Override + public int getBufferSize(){ + return values.getBufferSize() + bits.getBufferSize(); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + return values.getBufferSizeFor(valueCount) + + bits.getBufferSizeFor(valueCount); + } + + @Override + public ArrowBuf getBuffer() { + return values.getBuffer(); + } + + @Override + public ${valuesName} getValuesVector() { + return values; + } + + @Override + public void setInitialCapacity(int numRecords) { + bits.setInitialCapacity(numRecords); + values.setInitialCapacity(numRecords); + } + +// @Override +// public SerializedField.Builder getMetadataBuilder() { +// return super.getMetadataBuilder() 
+// .addChild(bits.getMetadata()) +// .addChild(values.getMetadata()); +// } + + @Override + public void allocateNew() { + if(!allocateNewSafe()){ + throw new OutOfMemoryException("Failure while allocating buffer."); + } + } + + @Override + public boolean allocateNewSafe() { + /* Boolean to keep track if all the memory allocations were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + success = values.allocateNewSafe() && bits.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + bits.zeroVector(); + mutator.reset(); + accessor.reset(); + return success; + } + + <#if type.major == "VarLen"> + @Override + public void allocateNew(int totalBytes, int valueCount) { + try { + values.allocateNew(totalBytes, valueCount); + bits.allocateNew(valueCount); + } catch(RuntimeException e) { + clear(); + throw e; + } + bits.zeroVector(); + mutator.reset(); + accessor.reset(); + } + + public void reset() { + bits.zeroVector(); + mutator.reset(); + accessor.reset(); + super.reset(); + } + + @Override + public int getByteCapacity(){ + return values.getByteCapacity(); + } + + @Override + public int getCurrentSizeInBytes(){ + return values.getCurrentSizeInBytes(); + } + + <#else> + @Override + public void allocateNew(int valueCount) { + try { + values.allocateNew(valueCount); + bits.allocateNew(valueCount+1); + } catch(OutOfMemoryException e) { + clear(); + throw e; + } + bits.zeroVector(); + mutator.reset(); + accessor.reset(); + } + + @Override + public void reset() { + bits.zeroVector(); + mutator.reset(); + accessor.reset(); + super.reset(); + } + + /** + * {@inheritDoc} + */ + @Override + public void zeroVector() { + bits.zeroVector(); + values.zeroVector(); + } + + + +// @Override +// public void load(SerializedField metadata, ArrowBuf buffer) { +// clear(); + // the bits vector is the first child (the order in which the children are added in getMetadataBuilder is significant) +// final SerializedField bitsField = metadata.getChild(0); +// bits.load(bitsField, buffer); +// +// final int capacity = buffer.capacity(); +// final int bitsLength = bitsField.getBufferLength(); +// final SerializedField valuesField = metadata.getChild(1); +// values.load(valuesField, buffer.slice(bitsLength, capacity - bitsLength)); +// } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator){ + return new TransferImpl(getField(), allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator){ + return new TransferImpl(getField().withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((Nullable${minor.class}Vector) to); + } + + public void transferTo(Nullable${minor.class}Vector target){ + bits.transferTo(target.bits); + values.transferTo(target.values); + <#if type.major == "VarLen"> + target.mutator.lastSet = mutator.lastSet; + + clear(); + } + + public void splitAndTransferTo(int startIndex, int length, Nullable${minor.class}Vector target) { + bits.splitAndTransferTo(startIndex, length, target.bits); + values.splitAndTransferTo(startIndex, length, target.values); + <#if type.major == "VarLen"> + target.mutator.lastSet = length - 1; + + } + + private class TransferImpl implements TransferPair { + Nullable${minor.class}Vector to; + + public TransferImpl(MaterializedField 
field, BufferAllocator allocator){ + to = new Nullable${minor.class}Vector(field, allocator); + } + + public TransferImpl(Nullable${minor.class}Vector to){ + this.to = to; + } + + @Override + public Nullable${minor.class}Vector getTo(){ + return to; + } + + @Override + public void transfer(){ + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Nullable${minor.class}Vector.this); + } + } + + @Override + public Accessor getAccessor(){ + return accessor; + } + + @Override + public Mutator getMutator(){ + return mutator; + } + + public ${minor.class}Vector convertToRequiredVector(){ + ${minor.class}Vector v = new ${minor.class}Vector(getField().getOtherNullableVersion(), allocator); + if (v.data != null) { + v.data.release(1); + } + v.data = values.data; + v.data.retain(1); + clear(); + return v; + } + + public void copyFrom(int fromIndex, int thisIndex, Nullable${minor.class}Vector from){ + final Accessor fromAccessor = from.getAccessor(); + if (!fromAccessor.isNull(fromIndex)) { + mutator.set(thisIndex, fromAccessor.get(fromIndex)); + } + } + + public void copyFromSafe(int fromIndex, int thisIndex, ${minor.class}Vector from){ + <#if type.major == "VarLen"> + mutator.fillEmpties(thisIndex); + + values.copyFromSafe(fromIndex, thisIndex, from); + bits.getMutator().setSafe(thisIndex, 1); + } + + public void copyFromSafe(int fromIndex, int thisIndex, Nullable${minor.class}Vector from){ + <#if type.major == "VarLen"> + mutator.fillEmpties(thisIndex); + + bits.copyFromSafe(fromIndex, thisIndex, from.bits); + values.copyFromSafe(fromIndex, thisIndex, from.values); + } + + public final class Accessor extends BaseDataValueVector.BaseAccessor <#if type.major = "VarLen">implements VariableWidthVector.VariableWidthAccessor { + final UInt1Vector.Accessor bAccessor = bits.getAccessor(); + final ${valuesName}.Accessor vAccessor = values.getAccessor(); + + /** + * Get the element at the specified position. 
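+   * The validity ($bits$) vector is consulted first, so reading a null position fails fast.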
+ * + * @param index position of the value + * @return value of the element, if not null + * @throws NullValueException if the value is null + */ + public <#if type.major == "VarLen">byte[]<#else>${minor.javaType!type.javaType} get(int index) { + if (isNull(index)) { + throw new IllegalStateException("Can't get a null value"); + } + return vAccessor.get(index); + } + + @Override + public boolean isNull(int index) { + return isSet(index) == 0; + } + + public int isSet(int index){ + return bAccessor.get(index); + } + + <#if type.major == "VarLen"> + public long getStartEnd(int index){ + return vAccessor.getStartEnd(index); + } + + @Override + public int getValueLength(int index) { + return values.getAccessor().getValueLength(index); + } + + + public void get(int index, Nullable${minor.class}Holder holder){ + vAccessor.get(index, holder); + holder.isSet = bAccessor.get(index); + + <#if minor.class.startsWith("Decimal")> + holder.scale = getField().getScale(); + holder.precision = getField().getPrecision(); + + } + + @Override + public ${friendlyType} getObject(int index) { + if (isNull(index)) { + return null; + }else{ + return vAccessor.getObject(index); + } + } + + <#if minor.class == "Interval" || minor.class == "IntervalDay" || minor.class == "IntervalYear"> + public StringBuilder getAsStringBuilder(int index) { + if (isNull(index)) { + return null; + }else{ + return vAccessor.getAsStringBuilder(index); + } + } + + + @Override + public int getValueCount(){ + return bits.getAccessor().getValueCount(); + } + + public void reset(){} + } + + public final class Mutator extends BaseDataValueVector.BaseMutator implements NullableVectorDefinitionSetter<#if type.major = "VarLen">, VariableWidthVector.VariableWidthMutator { + private int setCount; + <#if type.major = "VarLen"> private int lastSet = -1; + + private Mutator(){ + } + + public ${valuesName} getVectorWithValues(){ + return values; + } + + @Override + public void setIndexDefined(int index){ + bits.getMutator().set(index, 1); + } + + /** + * Set the variable length element at the specified index to the supplied byte array. 
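+   * Writing also flips the corresponding $bits$ entry to 1; for variable-width types, any
+   * positions skipped since the last write are first back-filled with empty byte arrays.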
+ * + * @param index position of the bit to set + * @param bytes array of bytes to write + */ + public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { + setCount++; + final ${valuesName}.Mutator valuesMutator = values.getMutator(); + final UInt1Vector.Mutator bitsMutator = bits.getMutator(); + <#if type.major == "VarLen"> + for (int i = lastSet + 1; i < index; i++) { + valuesMutator.set(i, emptyByteArray); + } + + bitsMutator.set(index, 1); + valuesMutator.set(index, value); + <#if type.major == "VarLen">lastSet = index; + } + + <#if type.major == "VarLen"> + + private void fillEmpties(int index){ + final ${valuesName}.Mutator valuesMutator = values.getMutator(); + for (int i = lastSet; i < index; i++) { + valuesMutator.setSafe(i + 1, emptyByteArray); + } + while(index > bits.getValueCapacity()) { + bits.reAlloc(); + } + lastSet = index; + } + + @Override + public void setValueLengthSafe(int index, int length) { + values.getMutator().setValueLengthSafe(index, length); + lastSet = index; + } + + + public void setSafe(int index, byte[] value, int start, int length) { + <#if type.major != "VarLen"> + throw new UnsupportedOperationException(); + <#else> + fillEmpties(index); + + bits.getMutator().setSafe(index, 1); + values.getMutator().setSafe(index, value, start, length); + setCount++; + <#if type.major == "VarLen">lastSet = index; + + } + + public void setSafe(int index, ByteBuffer value, int start, int length) { + <#if type.major != "VarLen"> + throw new UnsupportedOperationException(); + <#else> + fillEmpties(index); + + bits.getMutator().setSafe(index, 1); + values.getMutator().setSafe(index, value, start, length); + setCount++; + <#if type.major == "VarLen">lastSet = index; + + } + + public void setNull(int index){ + bits.getMutator().setSafe(index, 0); + } + + public void setSkipNull(int index, ${minor.class}Holder holder){ + values.getMutator().set(index, holder); + } + + public void setSkipNull(int index, Nullable${minor.class}Holder holder){ + values.getMutator().set(index, holder); + } + + + public void set(int index, Nullable${minor.class}Holder holder){ + final ${valuesName}.Mutator valuesMutator = values.getMutator(); + <#if type.major == "VarLen"> + for (int i = lastSet + 1; i < index; i++) { + valuesMutator.set(i, emptyByteArray); + } + + bits.getMutator().set(index, holder.isSet); + valuesMutator.set(index, holder); + <#if type.major == "VarLen">lastSet = index; + } + + public void set(int index, ${minor.class}Holder holder){ + final ${valuesName}.Mutator valuesMutator = values.getMutator(); + <#if type.major == "VarLen"> + for (int i = lastSet + 1; i < index; i++) { + valuesMutator.set(i, emptyByteArray); + } + + bits.getMutator().set(index, 1); + valuesMutator.set(index, holder); + <#if type.major == "VarLen">lastSet = index; + } + + public boolean isSafe(int outIndex) { + return outIndex < Nullable${minor.class}Vector.this.getValueCapacity(); + } + + <#assign fields = minor.fields!type.fields /> + public void set(int index, int isSet<#list fields as field><#if field.include!true >, ${field.type} ${field.name}Field ){ + final ${valuesName}.Mutator valuesMutator = values.getMutator(); + <#if type.major == "VarLen"> + for (int i = lastSet + 1; i < index; i++) { + valuesMutator.set(i, emptyByteArray); + } + + bits.getMutator().set(index, isSet); + valuesMutator.set(index<#list fields as field><#if field.include!true >, ${field.name}Field); + <#if type.major == "VarLen">lastSet = index; + } + + 
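+    // setSafe() is the bounds-checked variant of set(): for variable-width types it first
+    // back-fills skipped positions via fillEmpties(), and the underlying setSafe() calls
+    // re-allocate as needed so the write cannot land past the allocated capacity.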
public void setSafe(int index, int isSet<#list fields as field><#if field.include!true >, ${field.type} ${field.name}Field ) { + <#if type.major == "VarLen"> + fillEmpties(index); + + + bits.getMutator().setSafe(index, isSet); + values.getMutator().setSafe(index<#list fields as field><#if field.include!true >, ${field.name}Field); + setCount++; + <#if type.major == "VarLen">lastSet = index; + } + + + public void setSafe(int index, Nullable${minor.class}Holder value) { + + <#if type.major == "VarLen"> + fillEmpties(index); + + bits.getMutator().setSafe(index, value.isSet); + values.getMutator().setSafe(index, value); + setCount++; + <#if type.major == "VarLen">lastSet = index; + } + + public void setSafe(int index, ${minor.class}Holder value) { + + <#if type.major == "VarLen"> + fillEmpties(index); + + bits.getMutator().setSafe(index, 1); + values.getMutator().setSafe(index, value); + setCount++; + <#if type.major == "VarLen">lastSet = index; + } + + <#if !(type.major == "VarLen" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense" || minor.class == "Interval" || minor.class == "IntervalDay")> + public void setSafe(int index, ${minor.javaType!type.javaType} value) { + <#if type.major == "VarLen"> + fillEmpties(index); + + bits.getMutator().setSafe(index, 1); + values.getMutator().setSafe(index, value); + setCount++; + } + + + + @Override + public void setValueCount(int valueCount) { + assert valueCount >= 0; + <#if type.major == "VarLen"> + fillEmpties(valueCount); + + values.getMutator().setValueCount(valueCount); + bits.getMutator().setValueCount(valueCount); + } + + @Override + public void generateTestData(int valueCount){ + bits.getMutator().generateTestDataAlt(valueCount); + values.getMutator().generateTestData(valueCount); + <#if type.major = "VarLen">lastSet = valueCount; + setValueCount(valueCount); + } + + @Override + public void reset(){ + setCount = 0; + <#if type.major = "VarLen">lastSet = -1; + } + } +} + + diff --git a/java/vector/src/main/codegen/templates/RepeatedValueVectors.java b/java/vector/src/main/codegen/templates/RepeatedValueVectors.java new file mode 100644 index 00000000000..5ac80f57737 --- /dev/null +++ b/java/vector/src/main/codegen/templates/RepeatedValueVectors.java @@ -0,0 +1,421 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+<@pp.dropOutputFile />
+<#list vv.types as type>
+<#list type.minor as minor>
+<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) />
+<#assign fields = minor.fields!type.fields />
+
+<@pp.changeOutputFile name="/org/apache/arrow/vector/Repeated${minor.class}Vector.java" />
+<#include "/@includes/license.ftl" />
+
+package org.apache.arrow.vector;
+
+<#include "/@includes/vv_imports.ftl" />
+
+/**
+ * Repeated${minor.class} implements a vector with multiple values per row (e.g. JSON array or
+ * repeated protobuf field). The implementation uses an additional offsets vector to map each
+ * record index to its range of values in the underlying data vector.
+ *
+ * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker.
+ */
+
+public final class Repeated${minor.class}Vector extends BaseRepeatedValueVector implements Repeated<#if type.major == "VarLen">VariableWidth<#else>FixedWidth</#if>VectorLike {
+  //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Repeated${minor.class}Vector.class);
+
+  // we maintain local reference to concrete vector type for performance reasons.
+  private ${minor.class}Vector values;
+  private final FieldReader reader = new Repeated${minor.class}ReaderImpl(Repeated${minor.class}Vector.this);
+  private final Mutator mutator = new Mutator();
+  private final Accessor accessor = new Accessor();
+
+  public Repeated${minor.class}Vector(MaterializedField field, BufferAllocator allocator) {
+    super(field, allocator);
+    addOrGetVector(VectorDescriptor.create(new MajorType(field.getType().getMinorType(), DataMode.REQUIRED)));
+  }
+
+  @Override
+  public Mutator getMutator() {
+    return mutator;
+  }
+
+  @Override
+  public Accessor getAccessor() {
+    return accessor;
+  }
+
+  @Override
+  public FieldReader getReader() {
+    return reader;
+  }
+
+  @Override
+  public ${minor.class}Vector getDataVector() {
+    return values;
+  }
+
+  @Override
+  public TransferPair getTransferPair(BufferAllocator allocator) {
+    return new TransferImpl(getField(), allocator);
+  }
+
+  @Override
+  public TransferPair getTransferPair(String ref, BufferAllocator allocator){
+    return new TransferImpl(getField().withPath(ref), allocator);
+  }
+
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
+    return new TransferImpl((Repeated${minor.class}Vector) to);
+  }
+
+  @Override
+  public AddOrGetResult<${minor.class}Vector> addOrGetVector(VectorDescriptor descriptor) {
+    final AddOrGetResult<${minor.class}Vector> result = super.addOrGetVector(descriptor);
+    if (result.isCreated()) {
+      values = result.getVector();
+    }
+    return result;
+  }
+
+  public void transferTo(Repeated${minor.class}Vector target) {
+    target.clear();
+    offsets.transferTo(target.offsets);
+    values.transferTo(target.values);
+    clear();
+  }
+
+  public void splitAndTransferTo(final int startIndex, final int groups, Repeated${minor.class}Vector to) {
+    final UInt4Vector.Accessor a = offsets.getAccessor();
+    final UInt4Vector.Mutator m = to.offsets.getMutator();
+
+    final int startPos = a.get(startIndex);
+    final int endPos = a.get(startIndex + groups);
+    final int valuesToCopy = endPos - startPos;
+
+    values.splitAndTransferTo(startPos, valuesToCopy, to.values);
+    to.offsets.clear();
+    to.offsets.allocateNew(groups + 1);
+    int normalizedPos = 0;
+    for (int i = 0; i < groups + 1; i++) {
+      normalizedPos = a.get(startIndex + i) - startPos;
+      m.set(i, normalizedPos);
+    }
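+    // Illustrative example: offsets [0, 3, 5, 9] with startIndex == 1 and groups == 2
+    // copies values [3, 9) and writes rebased offsets [0, 2, 6] into the target.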
m.setValueCount(groups == 0 ? 0 : groups + 1); + } + + private class TransferImpl implements TransferPair { + final Repeated${minor.class}Vector to; + + public TransferImpl(MaterializedField field, BufferAllocator allocator) { + this.to = new Repeated${minor.class}Vector(field, allocator); + } + + public TransferImpl(Repeated${minor.class}Vector to) { + this.to = to; + } + + @Override + public Repeated${minor.class}Vector getTo() { + return to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, Repeated${minor.class}Vector.this); + } + } + + public void copyFrom(int inIndex, int outIndex, Repeated${minor.class}Vector v) { + final Accessor vAccessor = v.getAccessor(); + final int count = vAccessor.getInnerValueCountAt(inIndex); + mutator.startNewValue(outIndex); + for (int i = 0; i < count; i++) { + mutator.add(outIndex, vAccessor.get(inIndex, i)); + } + } + + public void copyFromSafe(int inIndex, int outIndex, Repeated${minor.class}Vector v) { + final Accessor vAccessor = v.getAccessor(); + final int count = vAccessor.getInnerValueCountAt(inIndex); + mutator.startNewValue(outIndex); + for (int i = 0; i < count; i++) { + mutator.addSafe(outIndex, vAccessor.get(inIndex, i)); + } + } + + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + if(!offsets.allocateNewSafe()) return false; + if(!values.allocateNewSafe()) return false; + success = true; + } finally { + if (!success) { + clear(); + } + } + offsets.zeroVector(); + mutator.reset(); + return true; + } + + @Override + public void allocateNew() { + try { + offsets.allocateNew(); + values.allocateNew(); + } catch (OutOfMemoryException e) { + clear(); + throw e; + } + offsets.zeroVector(); + mutator.reset(); + } + + <#if type.major == "VarLen"> +// @Override +// protected SerializedField.Builder getMetadataBuilder() { +// return super.getMetadataBuilder() +// .setVarByteLength(values.getVarByteLength()); +// } + + public void allocateNew(int totalBytes, int valueCount, int innerValueCount) { + try { + offsets.allocateNew(valueCount + 1); + values.allocateNew(totalBytes, innerValueCount); + } catch (OutOfMemoryException e) { + clear(); + throw e; + } + offsets.zeroVector(); + mutator.reset(); + } + + public int getByteCapacity(){ + return values.getByteCapacity(); + } + + <#else> + + @Override + public void allocateNew(int valueCount, int innerValueCount) { + clear(); + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. 
If one of the allocations failed we need to
+     * clear all the memory that we allocated
+     */
+    boolean success = false;
+    try {
+      offsets.allocateNew(valueCount + 1);
+      values.allocateNew(innerValueCount);
+    } catch (OutOfMemoryException e) {
+      clear();
+      throw e;
+    }
+    offsets.zeroVector();
+    mutator.reset();
+  }
+
+  </#if>
+
+  // This is declared as a subclass of the accessor declared inside of FixedWidthVector. It is also used for
+  // variable length vectors, as they should have as consistent an interface as possible. If they need to diverge
+  // in the future, the interface should be declared in the respective value vector superclasses for fixed and
+  // variable width, and we should refer to each in the generation template.
+  public final class Accessor extends BaseRepeatedValueVector.BaseRepeatedAccessor {
+    @Override
+    public List<${friendlyType}> getObject(int index) {
+      final List<${friendlyType}> vals = new JsonStringArrayList<>();
+      final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor();
+      final int start = offsetsAccessor.get(index);
+      final int end = offsetsAccessor.get(index + 1);
+      final ${minor.class}Vector.Accessor valuesAccessor = values.getAccessor();
+      for (int i = start; i < end; i++) {
+        vals.add(valuesAccessor.getObject(i));
+      }
+      return vals;
+    }
+
+    public ${friendlyType} getSingleObject(int index, int arrayIndex) {
+      final int start = offsets.getAccessor().get(index);
+      return values.getAccessor().getObject(start + arrayIndex);
+    }
+
+    /**
+     * Get a value for the given record. Each element in the repeated field is accessed by
+     * the positionIndex param.
+     *
+     * @param index record containing the repeated field
+     * @param positionIndex position within the repeated field
+     * @return element at the given position in the given record
+     */
+    public <#if type.major == "VarLen">byte[]
+           <#else>${minor.javaType!type.javaType}
+           </#if> get(int index, int positionIndex) {
+      return values.getAccessor().get(offsets.getAccessor().get(index) + positionIndex);
+    }
+
+    public void get(int index, Repeated${minor.class}Holder holder) {
+      holder.start = offsets.getAccessor().get(index);
+      holder.end = offsets.getAccessor().get(index + 1);
+      holder.vector = values;
+    }
+
+    public void get(int index, int positionIndex, ${minor.class}Holder holder) {
+      final int offset = offsets.getAccessor().get(index);
+      assert offset >= 0;
+      assert positionIndex < getInnerValueCountAt(index);
+      values.getAccessor().get(offset + positionIndex, holder);
+    }
+
+    public void get(int index, int positionIndex, Nullable${minor.class}Holder holder) {
+      final int offset = offsets.getAccessor().get(index);
+      assert offset >= 0;
+      if (positionIndex >= getInnerValueCountAt(index)) {
+        holder.isSet = 0;
+        return;
+      }
+      values.getAccessor().get(offset + positionIndex, holder);
+    }
+  }
+
+  public final class Mutator extends BaseRepeatedValueVector.BaseRepeatedMutator implements RepeatedMutator {
+    private Mutator() {}
+
+    /**
+     * Add an element at the given record index. This is similar to the set() method on other
+     * value vectors, except that it permits setting multiple values for a single record.
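+     * For example (illustrative; v1 and v2 stand for values of the element type):
+     * <pre>
+     *   mutator.startNewValue(0);
+     *   mutator.addSafe(0, v1);
+     *   mutator.addSafe(0, v2);  // record 0 now spans two positions in the values vector
+     * </pre>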
+ * + * @param index record of the element to add + * @param value value to add to the given row + */ + public void add(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { + int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().set(nextOffset, value); + offsets.getMutator().set(index+1, nextOffset+1); + } + + <#if type.major == "VarLen"> + public void addSafe(int index, byte[] bytes) { + addSafe(index, bytes, 0, bytes.length); + } + + public void addSafe(int index, byte[] bytes, int start, int length) { + final int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().setSafe(nextOffset, bytes, start, length); + offsets.getMutator().setSafe(index+1, nextOffset+1); + } + + <#else> + + public void addSafe(int index, ${minor.javaType!type.javaType} srcValue) { + final int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().setSafe(nextOffset, srcValue); + offsets.getMutator().setSafe(index+1, nextOffset+1); + } + + + + public void setSafe(int index, Repeated${minor.class}Holder h) { + final ${minor.class}Holder ih = new ${minor.class}Holder(); + final ${minor.class}Vector.Accessor hVectorAccessor = h.vector.getAccessor(); + mutator.startNewValue(index); + for(int i = h.start; i < h.end; i++){ + hVectorAccessor.get(i, ih); + mutator.addSafe(index, ih); + } + } + + public void addSafe(int index, ${minor.class}Holder holder) { + int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().setSafe(nextOffset, holder); + offsets.getMutator().setSafe(index+1, nextOffset+1); + } + + public void addSafe(int index, Nullable${minor.class}Holder holder) { + final int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().setSafe(nextOffset, holder); + offsets.getMutator().setSafe(index+1, nextOffset+1); + } + + <#if (fields?size > 1) && !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + public void addSafe(int arrayIndex, <#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + int nextOffset = offsets.getAccessor().get(arrayIndex+1); + values.getMutator().setSafe(nextOffset, <#list fields as field>${field.name}<#if field_has_next>, ); + offsets.getMutator().setSafe(arrayIndex+1, nextOffset+1); + } + + + protected void add(int index, ${minor.class}Holder holder) { + int nextOffset = offsets.getAccessor().get(index+1); + values.getMutator().set(nextOffset, holder); + offsets.getMutator().set(index+1, nextOffset+1); + } + + public void add(int index, Repeated${minor.class}Holder holder) { + + ${minor.class}Vector.Accessor accessor = holder.vector.getAccessor(); + ${minor.class}Holder innerHolder = new ${minor.class}Holder(); + + for(int i = holder.start; i < holder.end; i++) { + accessor.get(i, innerHolder); + add(index, innerHolder); + } + } + + @Override + public void generateTestData(final int valCount) { + final int[] sizes = {1, 2, 0, 6}; + int size = 0; + int runningOffset = 0; + final UInt4Vector.Mutator offsetsMutator = offsets.getMutator(); + for(int i = 1; i < valCount + 1; i++, size++) { + runningOffset += sizes[size % sizes.length]; + offsetsMutator.set(i, runningOffset); + } + values.getMutator().generateTestData(valCount * 9); + setValueCount(size); + } + + @Override + public void reset() { + } + } +} + + diff --git 
a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java new file mode 100644 index 00000000000..9a6b08fc561 --- /dev/null +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -0,0 +1,185 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.lang.UnsupportedOperationException; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionListWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ + +@SuppressWarnings("unused") +public class UnionListWriter extends AbstractFieldWriter { + + private ListVector vector; + private UInt4Vector offsets; + private PromotableWriter writer; + private boolean inMap = false; + private String mapName; + private int lastIndex = 0; + + public UnionListWriter(ListVector vector) { + super(null); + this.vector = vector; + this.writer = new PromotableWriter(vector.getDataVector(), vector); + this.offsets = vector.getOffsetVector(); + } + + public UnionListWriter(ListVector vector, AbstractFieldWriter parent) { + this(vector); + } + + @Override + public void allocate() { + vector.allocateNew(); + } + + @Override + public void clear() { + vector.clear(); + } + + @Override + public MaterializedField getField() { + return null; + } + + @Override + public int getValueCapacity() { + return vector.getValueCapacity(); + } + + @Override + public void close() throws Exception { + + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + + <#if !minor.class?starts_with("Decimal")> + + @Override + public ${name}Writer <#if uncappedName == "int">integer<#else>${uncappedName}() { + return this; + } + + @Override + public ${name}Writer <#if uncappedName == "int">integer<#else>${uncappedName}(String name) { + assert inMap; + mapName = name; + final int nextOffset = offsets.getAccessor().get(idx() + 1); + vector.getMutator().setNotNull(idx()); + writer.setPosition(nextOffset); + ${name}Writer ${uncappedName}Writer = writer.<#if uncappedName == "int">integer<#else>${uncappedName}(name); + return ${uncappedName}Writer; + } + + + + + + @Override + public MapWriter map() { + inMap = true; + return this; + } + + @Override + public ListWriter list() { + final int nextOffset = offsets.getAccessor().get(idx() + 1); + vector.getMutator().setNotNull(idx()); + offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); + writer.setPosition(nextOffset); + return writer; + } + + @Override 
+ public ListWriter list(String name) { + final int nextOffset = offsets.getAccessor().get(idx() + 1); + vector.getMutator().setNotNull(idx()); + writer.setPosition(nextOffset); + ListWriter listWriter = writer.list(name); + return listWriter; + } + + @Override + public MapWriter map(String name) { + MapWriter mapWriter = writer.map(name); + return mapWriter; + } + + @Override + public void startList() { + vector.getMutator().startNewValue(idx()); + } + + @Override + public void endList() { + + } + + @Override + public void start() { + assert inMap; + final int nextOffset = offsets.getAccessor().get(idx() + 1); + vector.getMutator().setNotNull(idx()); + offsets.getMutator().setSafe(idx() + 1, nextOffset); + writer.setPosition(nextOffset); + } + + @Override + public void end() { + if (inMap) { + inMap = false; + final int nextOffset = offsets.getAccessor().get(idx() + 1); + offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); + } + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + + <#if !minor.class?starts_with("Decimal")> + + @Override + public void write${name}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + assert !inMap; + final int nextOffset = offsets.getAccessor().get(idx() + 1); + vector.getMutator().setNotNull(idx()); + writer.setPosition(nextOffset); + writer.write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); + offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); + } + + + + + +} diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java new file mode 100644 index 00000000000..44c3e55dcc6 --- /dev/null +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -0,0 +1,194 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionReader.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +@SuppressWarnings("unused") +public class UnionReader extends AbstractFieldReader { + + private BaseReader[] readers = new BaseReader[43]; + public UnionVector data; + + public UnionReader(UnionVector data) { + this.data = data; + } + + private static MajorType[] TYPES = new MajorType[43]; + + static { + for (MinorType minorType : MinorType.values()) { + TYPES[minorType.ordinal()] = new MajorType(minorType, DataMode.OPTIONAL); + } + } + + public MajorType getType() { + return TYPES[data.getTypeValue(idx())]; + } + + public boolean isSet(){ + return !data.getAccessor().isNull(idx()); + } + + public void read(UnionHolder holder) { + holder.reader = this; + holder.isSet = this.isSet() ? 1 : 0; + } + + public void read(int index, UnionHolder holder) { + getList().read(index, holder); + } + + private FieldReader getReaderForIndex(int index) { + int typeValue = data.getTypeValue(index); + FieldReader reader = (FieldReader) readers[typeValue]; + if (reader != null) { + return reader; + } + switch (MinorType.values()[typeValue]) { + case LATE: + return NullReader.INSTANCE; + case MAP: + return (FieldReader) getMap(); + case LIST: + return (FieldReader) getList(); + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return (FieldReader) get${name}(); + + + default: + throw new UnsupportedOperationException("Unsupported type: " + MinorType.values()[typeValue]); + } + } + + private SingleMapReaderImpl mapReader; + + private MapReader getMap() { + if (mapReader == null) { + mapReader = (SingleMapReaderImpl) data.getMap().getReader(); + mapReader.setPosition(idx()); + readers[MinorType.MAP.ordinal()] = mapReader; + } + return mapReader; + } + + private UnionListReader listReader; + + private FieldReader getList() { + if (listReader == null) { + listReader = new UnionListReader(data.getList()); + listReader.setPosition(idx()); + readers[MinorType.LIST.ordinal()] = listReader; + } + return listReader; + } + + @Override + public java.util.Iterator iterator() { + return getMap().iterator(); + } + + @Override + public void copyAsValue(UnionWriter writer) { + writer.data.copyFrom(idx(), writer.idx(), data); + } + + <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", + "Character", "DateTime", "Period", "Double", "Float", + "Text", "String", "Byte", "Short", "byte[]"] as friendlyType> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + + @Override + public ${friendlyType} read${safeType}() { + return getReaderForIndex(idx()).read${safeType}(); + } + + + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign uncappedName = name?uncap_first/> + <#assign boxedType = (minor.boxedType!type.boxedType) /> + <#assign javaType = (minor.javaType!type.javaType) /> + <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> + <#assign safeType=friendlyType /> + <#if safeType=="byte[]"><#assign safeType="ByteArray" /> + <#if !minor.class?starts_with("Decimal")> + + private Nullable${name}ReaderImpl ${uncappedName}Reader; + + private Nullable${name}ReaderImpl get${name}() { + if 
(${uncappedName}Reader == null) { + ${uncappedName}Reader = new Nullable${name}ReaderImpl(data.get${name}Vector()); + ${uncappedName}Reader.setPosition(idx()); + readers[MinorType.${name?upper_case}.ordinal()] = ${uncappedName}Reader; + } + return ${uncappedName}Reader; + } + + public void read(Nullable${name}Holder holder){ + getReaderForIndex(idx()).read(holder); + } + + public void copyAsValue(${name}Writer writer){ + getReaderForIndex(idx()).copyAsValue(writer); + } + + + + @Override + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + for (BaseReader reader : readers) { + if (reader != null) { + reader.setPosition(index); + } + } + } + + public FieldReader reader(String name){ + return getMap().reader(name); + } + + public FieldReader reader() { + return getList().reader(); + } + + public boolean next() { + return getReaderForIndex(idx()).next(); + } +} + + + diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java new file mode 100644 index 00000000000..ba94ac22a05 --- /dev/null +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -0,0 +1,467 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/UnionVector.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex; + +<#include "/@includes/vv_imports.ftl" /> +import java.util.ArrayList; +import java.util.Iterator; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.BasicTypeHelper; + +/* + * This class is generated using freemarker and the ${.template_name} template. + */ +@SuppressWarnings("unused") + + +/** + * A vector which can hold values of different types. It does so by using a MapVector which contains a vector for each + * primitive type that is stored. MapVector is used in order to take advantage of its serialization/deserialization methods, + * as well as the addOrGet method. + * + * For performance reasons, UnionVector stores a cached reference to each subtype vector, to avoid having to do the map lookup + * each time the vector is accessed. 
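+ *
+ * A UInt1 "types" child vector records, for each row, the ordinal of the MinorType whose
+ * subtype vector holds that row's value; Mutator.setSafe() first records the type via
+ * setType() and then delegates to that subtype vector's own mutator.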
+ */ +public class UnionVector implements ValueVector { + + private MaterializedField field; + private BufferAllocator allocator; + private Accessor accessor = new Accessor(); + private Mutator mutator = new Mutator(); + private int valueCount; + + private MapVector internalMap; + private UInt1Vector typeVector; + + private MapVector mapVector; + private ListVector listVector; + + private FieldReader reader; + private NullableBitVector bit; + + private int singleType = 0; + private ValueVector singleVector; + private MajorType majorType; + + private final CallBack callBack; + + public UnionVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { + this.field = field.clone(); + this.allocator = allocator; + this.internalMap = new MapVector("internal", allocator, callBack); + this.typeVector = internalMap.addOrGet("types", new MajorType(MinorType.UINT1, DataMode.REQUIRED), UInt1Vector.class); + this.field.addChild(internalMap.getField().clone()); + this.majorType = field.getType(); + this.callBack = callBack; + } + + public BufferAllocator getAllocator() { + return allocator; + } + + public List getSubTypes() { + return majorType.getSubTypes(); + } + + public void addSubType(MinorType type) { + if (majorType.getSubTypes().contains(type)) { + return; + } + List subTypes = this.majorType.getSubTypes(); + List newSubTypes = new ArrayList<>(subTypes); + newSubTypes.add(type); + majorType = new MajorType(this.majorType.getMinorType(), this.majorType.getMode(), this.majorType.getPrecision(), + this.majorType.getScale(), this.majorType.getTimezone(), newSubTypes); + field = MaterializedField.create(field.getName(), majorType); + if (callBack != null) { + callBack.doWork(); + } + } + + private static final MajorType MAP_TYPE = new MajorType(MinorType.MAP, DataMode.OPTIONAL); + + public MapVector getMap() { + if (mapVector == null) { + int vectorCount = internalMap.size(); + mapVector = internalMap.addOrGet("map", MAP_TYPE, MapVector.class); + addSubType(MinorType.MAP); + if (internalMap.size() > vectorCount) { + mapVector.allocateNew(); + } + } + return mapVector; + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + + private Nullable${name}Vector ${uncappedName}Vector; + private static final MajorType ${name?upper_case}_TYPE = new MajorType(MinorType.${name?upper_case}, DataMode.OPTIONAL); + + public Nullable${name}Vector get${name}Vector() { + if (${uncappedName}Vector == null) { + int vectorCount = internalMap.size(); + ${uncappedName}Vector = internalMap.addOrGet("${uncappedName}", ${name?upper_case}_TYPE, Nullable${name}Vector.class); + addSubType(MinorType.${name?upper_case}); + if (internalMap.size() > vectorCount) { + ${uncappedName}Vector.allocateNew(); + } + } + return ${uncappedName}Vector; + } + + + + + + private static final MajorType LIST_TYPE = new MajorType(MinorType.LIST, DataMode.OPTIONAL); + + public ListVector getList() { + if (listVector == null) { + int vectorCount = internalMap.size(); + listVector = internalMap.addOrGet("list", LIST_TYPE, ListVector.class); + addSubType(MinorType.LIST); + if (internalMap.size() > vectorCount) { + listVector.allocateNew(); + } + } + return listVector; + } + + public int getTypeValue(int index) { + return typeVector.getAccessor().get(index); + } + + public UInt1Vector getTypeVector() { + return typeVector; + } + + @Override + public void 
allocateNew() throws OutOfMemoryException { + internalMap.allocateNew(); + if (typeVector != null) { + typeVector.zeroVector(); + } + } + + @Override + public boolean allocateNewSafe() { + boolean safe = internalMap.allocateNewSafe(); + if (safe) { + if (typeVector != null) { + typeVector.zeroVector(); + } + } + return safe; + } + + @Override + public void setInitialCapacity(int numRecords) { + } + + @Override + public int getValueCapacity() { + return Math.min(typeVector.getValueCapacity(), internalMap.getValueCapacity()); + } + + @Override + public void close() { + } + + @Override + public void clear() { + internalMap.clear(); + } + + @Override + public MaterializedField getField() { + return field; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new TransferImpl(field, allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(field.withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((UnionVector) target); + } + + public void transferTo(UnionVector target) { + internalMap.makeTransferPair(target.internalMap).transfer(); + target.valueCount = valueCount; + target.majorType = majorType; + } + + public void copyFrom(int inIndex, int outIndex, UnionVector from) { + from.getReader().setPosition(inIndex); + getWriter().setPosition(outIndex); + ComplexCopier.copy(from.reader, mutator.writer); + } + + public void copyFromSafe(int inIndex, int outIndex, UnionVector from) { + copyFrom(inIndex, outIndex, from); + } + + public ValueVector addVector(ValueVector v) { + String name = v.getField().getType().getMinorType().name().toLowerCase(); + MajorType type = v.getField().getType(); + Preconditions.checkState(internalMap.getChild(name) == null, String.format("%s vector already exists", name)); + final ValueVector newVector = internalMap.addOrGet(name, type, (Class) BasicTypeHelper.getValueVectorClass(type.getMinorType(), type.getMode())); + v.makeTransferPair(newVector).transfer(); + internalMap.putChild(name, newVector); + addSubType(v.getField().getType().getMinorType()); + return newVector; + } + + private class TransferImpl implements TransferPair { + + UnionVector to; + + public TransferImpl(MaterializedField field, BufferAllocator allocator) { + to = new UnionVector(field, allocator, null); + } + + public TransferImpl(UnionVector to) { + this.to = to; + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, UnionVector.this); + } + } + + @Override + public Accessor getAccessor() { + return accessor; + } + + @Override + public Mutator getMutator() { + return mutator; + } + + @Override + public FieldReader getReader() { + if (reader == null) { + reader = new UnionReader(this); + } + return reader; + } + + public FieldWriter getWriter() { + if (mutator.writer == null) { + mutator.writer = new UnionWriter(this); + } + return mutator.writer; + } + +// @Override +// public UserBitShared.SerializedField getMetadata() { +// SerializedField.Builder b = getField() // +// .getAsBuilder() // +// .setBufferLength(getBufferSize()) // +// .setValueCount(valueCount); +// +// b.addChild(internalMap.getMetadata()); +// return b.build(); +// } + + @Override + public int 
getBufferSize() { + return internalMap.getBufferSize(); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + long bufferSize = 0; + for (final ValueVector v : (Iterable) this) { + bufferSize += v.getBufferSizeFor(valueCount); + } + + return (int) bufferSize; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return internalMap.getBuffers(clear); + } + + @Override + public Iterator iterator() { + List vectors = Lists.newArrayList(internalMap.iterator()); + vectors.add(typeVector); + return vectors.iterator(); + } + + public class Accessor extends BaseValueVector.BaseAccessor { + + + @Override + public Object getObject(int index) { + int type = typeVector.getAccessor().get(index); + switch (MinorType.values()[type]) { + case LATE: + return null; + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return get${name}Vector().getAccessor().getObject(index); + + + + case MAP: + return getMap().getAccessor().getObject(index); + case LIST: + return getList().getAccessor().getObject(index); + default: + throw new UnsupportedOperationException("Cannot support type: " + MinorType.values()[type]); + } + } + + public byte[] get(int index) { + return null; + } + + public void get(int index, ComplexHolder holder) { + } + + public void get(int index, UnionHolder holder) { + FieldReader reader = new UnionReader(UnionVector.this); + reader.setPosition(index); + holder.reader = reader; + } + + @Override + public int getValueCount() { + return valueCount; + } + + @Override + public boolean isNull(int index) { + return typeVector.getAccessor().get(index) == 0; + } + + public int isSet(int index) { + return isNull(index) ? 
0 : 1; + } + } + + public class Mutator extends BaseValueVector.BaseMutator { + + UnionWriter writer; + + @Override + public void setValueCount(int valueCount) { + UnionVector.this.valueCount = valueCount; + internalMap.getMutator().setValueCount(valueCount); + } + + public void setSafe(int index, UnionHolder holder) { + FieldReader reader = holder.reader; + if (writer == null) { + writer = new UnionWriter(UnionVector.this); + } + writer.setPosition(index); + MinorType type = reader.getType().getMinorType(); + switch (type) { + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + Nullable${name}Holder ${uncappedName}Holder = new Nullable${name}Holder(); + reader.read(${uncappedName}Holder); + setSafe(index, ${uncappedName}Holder); + break; + + + case MAP: { + ComplexCopier.copy(reader, writer); + break; + } + case LIST: { + ComplexCopier.copy(reader, writer); + break; + } + default: + throw new UnsupportedOperationException(); + } + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + public void setSafe(int index, Nullable${name}Holder holder) { + setType(index, MinorType.${name?upper_case}); + get${name}Vector().getMutator().setSafe(index, holder); + } + + + + + public void setType(int index, MinorType type) { + typeVector.getMutator().setSafe(index, type.ordinal()); + } + + @Override + public void reset() { } + + @Override + public void generateTestData(int values) { } + } +} diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java new file mode 100644 index 00000000000..c9c29e0dd5f --- /dev/null +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -0,0 +1,228 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionWriter.java" /> + + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.complex.impl; + +<#include "/@includes/vv_imports.ftl" /> + +/* + * This class is generated using freemarker and the ${.template_name} template. 
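+ *
+ * Each write records the value's MinorType in the underlying UnionVector's type vector
+ * (via setType) and then delegates to the writer for that subtype.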
+ */ +@SuppressWarnings("unused") +public class UnionWriter extends AbstractFieldWriter implements FieldWriter { + + UnionVector data; + private MapWriter mapWriter; + private UnionListWriter listWriter; + private List writers = Lists.newArrayList(); + + public UnionWriter(BufferAllocator allocator) { + super(null); + } + + public UnionWriter(UnionVector vector) { + super(null); + data = vector; + } + + public UnionWriter(UnionVector vector, FieldWriter parent) { + super(null); + data = vector; + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + for (BaseWriter writer : writers) { + writer.setPosition(index); + } + } + + + @Override + public void start() { + data.getMutator().setType(idx(), MinorType.MAP); + getMapWriter().start(); + } + + @Override + public void end() { + getMapWriter().end(); + } + + @Override + public void startList() { + getListWriter().startList(); + data.getMutator().setType(idx(), MinorType.LIST); + } + + @Override + public void endList() { + getListWriter().endList(); + } + + private MapWriter getMapWriter() { + if (mapWriter == null) { + mapWriter = new SingleMapWriter(data.getMap(), null, true); + mapWriter.setPosition(idx()); + writers.add(mapWriter); + } + return mapWriter; + } + + public MapWriter asMap() { + data.getMutator().setType(idx(), MinorType.MAP); + return getMapWriter(); + } + + private ListWriter getListWriter() { + if (listWriter == null) { + listWriter = new UnionListWriter(data.getList()); + listWriter.setPosition(idx()); + writers.add(listWriter); + } + return listWriter; + } + + public ListWriter asList() { + data.getMutator().setType(idx(), MinorType.LIST); + return getListWriter(); + } + + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + + <#if !minor.class?starts_with("Decimal")> + + private ${name}Writer ${name?uncap_first}Writer; + + private ${name}Writer get${name}Writer() { + if (${uncappedName}Writer == null) { + ${uncappedName}Writer = new Nullable${name}WriterImpl(data.get${name}Vector(), null); + ${uncappedName}Writer.setPosition(idx()); + writers.add(${uncappedName}Writer); + } + return ${uncappedName}Writer; + } + + public ${name}Writer as${name}() { + data.getMutator().setType(idx(), MinorType.${name?upper_case}); + return get${name}Writer(); + } + + @Override + public void write(${name}Holder holder) { + data.getMutator().setType(idx(), MinorType.${name?upper_case}); + get${name}Writer().setPosition(idx()); + get${name}Writer().write${name}(<#list fields as field>holder.${field.name}<#if field_has_next>, ); + } + + public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + data.getMutator().setType(idx(), MinorType.${name?upper_case}); + get${name}Writer().setPosition(idx()); + get${name}Writer().write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); + } + + + + + public void writeNull() { + } + + @Override + public MapWriter map() { + data.getMutator().setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().map(); + } + + @Override + public ListWriter list() { + data.getMutator().setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().list(); + } + + @Override + public ListWriter list(String name) { + data.getMutator().setType(idx(), MinorType.MAP); + getMapWriter().setPosition(idx()); + return 
getMapWriter().list(name); + } + + @Override + public MapWriter map(String name) { + data.getMutator().setType(idx(), MinorType.MAP); + getMapWriter().setPosition(idx()); + return getMapWriter().map(name); + } + + <#list vv.types as type><#list type.minor as minor> + <#assign lowerName = minor.class?uncap_first /> + <#if lowerName == "int" ><#assign lowerName = "integer" /> + <#assign upperName = minor.class?upper_case /> + <#assign capName = minor.class?cap_first /> + <#if !minor.class?starts_with("Decimal")> + @Override + public ${capName}Writer ${lowerName}(String name) { + data.getMutator().setType(idx(), MinorType.MAP); + getMapWriter().setPosition(idx()); + return getMapWriter().${lowerName}(name); + } + + @Override + public ${capName}Writer ${lowerName}() { + data.getMutator().setType(idx(), MinorType.LIST); + getListWriter().setPosition(idx()); + return getListWriter().${lowerName}(); + } + + + + @Override + public void allocate() { + data.allocateNew(); + } + + @Override + public void clear() { + data.clear(); + } + + @Override + public void close() throws Exception { + data.close(); + } + + @Override + public MaterializedField getField() { + return data.getField(); + } + + @Override + public int getValueCapacity() { + return data.getValueCapacity(); + } +} diff --git a/java/vector/src/main/codegen/templates/ValueHolders.java b/java/vector/src/main/codegen/templates/ValueHolders.java new file mode 100644 index 00000000000..2b14194574a --- /dev/null +++ b/java/vector/src/main/codegen/templates/ValueHolders.java @@ -0,0 +1,116 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +<@pp.dropOutputFile /> +<#list vv.modes as mode> +<#list vv.types as type> +<#list type.minor as minor> + +<#assign className="${mode.prefix}${minor.class}Holder" /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/holders/${className}.java" /> + +<#include "/@includes/license.ftl" /> + +package org.apache.arrow.vector.holders; + +<#include "/@includes/vv_imports.ftl" /> + +public final class ${className} implements ValueHolder{ + + public static final MajorType TYPE = new MajorType(MinorType.${minor.class?upper_case}, DataMode.${mode.name?upper_case}); + + public MajorType getType() {return TYPE;} + + <#if mode.name == "Repeated"> + + /** The first index (inclusive) into the Vector. **/ + public int start; + + /** The last index (exclusive) into the Vector. **/ + public int end; + + /** The Vector holding the actual values. 
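+      Entries for this holder's record occupy positions [start, end) of this vector.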
**/ + public ${minor.class}Vector vector; + + <#else> + public static final int WIDTH = ${type.width}; + + <#if mode.name == "Optional">public int isSet; + <#assign fields = minor.fields!type.fields /> + <#list fields as field> + public ${field.type} ${field.name}; + + + <#if minor.class.startsWith("Decimal")> + public static final int maxPrecision = ${minor.maxPrecisionDigits}; + <#if minor.class.startsWith("Decimal28") || minor.class.startsWith("Decimal38")> + public static final int nDecimalDigits = ${minor.nDecimalDigits}; + + public static int getInteger(int index, int start, ArrowBuf buffer) { + int value = buffer.getInt(start + (index * 4)); + + if (index == 0) { + /* the first byte contains sign bit, return value without it */ + <#if minor.class.endsWith("Sparse")> + value = (value & 0x7FFFFFFF); + <#elseif minor.class.endsWith("Dense")> + value = (value & 0x0000007F); + + } + return value; + } + + public static void setInteger(int index, int value, int start, ArrowBuf buffer) { + buffer.setInt(start + (index * 4), value); + } + + public static void setSign(boolean sign, int start, ArrowBuf buffer) { + // Set MSB to 1 if sign is negative + if (sign == true) { + int value = getInteger(0, start, buffer); + setInteger(0, (value | 0x80000000), start, buffer); + } + } + + public static boolean getSign(int start, ArrowBuf buffer) { + return ((buffer.getInt(start) & 0x80000000) != 0); + } + + + @Deprecated + public int hashCode(){ + throw new UnsupportedOperationException(); + } + + /* + * Reason for deprecation is that ValueHolders are potential scalar replacements + * and hence we don't want any methods to be invoked on them. + */ + @Deprecated + public String toString(){ + throw new UnsupportedOperationException(); + } + + + + + +} + + + + \ No newline at end of file diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java new file mode 100644 index 00000000000..13d53b8e846 --- /dev/null +++ b/java/vector/src/main/codegen/templates/VariableLengthVectors.java @@ -0,0 +1,644 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+import java.lang.Override;
+
+import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.vector.BaseDataValueVector;
+import org.apache.arrow.vector.BaseValueVector;
+import org.apache.arrow.vector.VariableWidthVector;
+
+<@pp.dropOutputFile />
+<#list vv.types as type>
+<#list type.minor as minor>
+
+<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) />
+
+<#if type.major == "VarLen">
+<@pp.changeOutputFile name="/org/apache/arrow/vector/${minor.class}Vector.java" />
+
+<#include "/@includes/license.ftl" />
+
+package org.apache.arrow.vector;
+
+<#include "/@includes/vv_imports.ftl" />
+
+/**
+ * ${minor.class}Vector implements a vector of variable width values. Elements in the vector
+ * are accessed by position from the logical start of the vector. A fixed width offsetVector
+ * is used to convert an element's position to its offset from the start of the (0-based)
+ * ArrowBuf. Size is inferred from adjacent elements.
+ *   The width of each element is ${type.width} byte(s)
+ *   The equivalent Java primitive is '${minor.javaType!type.javaType}'
+ *
+ * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker.
+ */
+public final class ${minor.class}Vector extends BaseDataValueVector implements VariableWidthVector {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${minor.class}Vector.class);
+
+  private static final int DEFAULT_RECORD_BYTE_COUNT = 8;
+  private static final int INITIAL_BYTE_COUNT = 4096 * DEFAULT_RECORD_BYTE_COUNT;
+  private static final int MIN_BYTE_COUNT = 4096;
+
+  public static final String OFFSETS_VECTOR_NAME = "$offsets$";
+  private final MaterializedField offsetsField = MaterializedField.create(OFFSETS_VECTOR_NAME, new MajorType(MinorType.UINT4, DataMode.REQUIRED));
+  private final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(offsetsField, allocator);
+  private final FieldReader reader = new ${minor.class}ReaderImpl(${minor.class}Vector.this);
+
+  private final Accessor accessor;
+  private final Mutator mutator;
+
+  private final UInt${type.width}Vector.Accessor oAccessor;
+
+  private int allocationSizeInBytes = INITIAL_BYTE_COUNT;
+  private int allocationMonitor = 0;
+
+  public ${minor.class}Vector(MaterializedField field, BufferAllocator allocator) {
+    super(field, allocator);
+    this.oAccessor = offsetVector.getAccessor();
+    this.accessor = new Accessor();
+    this.mutator = new Mutator();
+  }
+
+  @Override
+  public FieldReader getReader(){
+    return reader;
+  }
+
+  @Override
+  public int getBufferSize(){
+    if (getAccessor().getValueCount() == 0) {
+      return 0;
+    }
+    return offsetVector.getBufferSize() + data.writerIndex();
+  }
+
+  @Override
+  public int getBufferSizeFor(final int valueCount) {
+    if (valueCount == 0) {
+      return 0;
+    }
+
+    final int idx = offsetVector.getAccessor().get(valueCount);
+    return offsetVector.getBufferSizeFor(valueCount + 1) + idx;
+  }
+
+  @Override
+  public int getValueCapacity(){
+    return Math.max(offsetVector.getValueCapacity() - 1, 0);
+  }
+
+  @Override
+  public int getByteCapacity(){
+    return data.capacity();
+  }
+
+  @Override
+  public int getCurrentSizeInBytes() {
+    return offsetVector.getAccessor().get(getAccessor().getValueCount());
+  }
+
+  /**
+   * Return the number of bytes contained in the current var len byte vector.
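+   * For example, after storing "a" and "bb" the vector holds three value bytes, so this returns 3.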
+ * @return + */ + public int getVarByteLength(){ + final int valueCount = getAccessor().getValueCount(); + if(valueCount == 0) { + return 0; + } + return offsetVector.getAccessor().get(valueCount); + } + +// @Override +// public SerializedField getMetadata() { +// return getMetadataBuilder() // +// .addChild(offsetVector.getMetadata()) +// .setValueCount(getAccessor().getValueCount()) // +// .setBufferLength(getBufferSize()) // +// .build(); +// } +// +// @Override +// public void load(SerializedField metadata, ArrowBuf buffer) { +// the bits vector is the first child (the order in which the children are added in getMetadataBuilder is significant) +// final SerializedField offsetField = metadata.getChild(0); +// offsetVector.load(offsetField, buffer); +// +// final int capacity = buffer.capacity(); +// final int offsetsLength = offsetField.getBufferLength(); +// data = buffer.slice(offsetsLength, capacity - offsetsLength); +// data.retain(); +// } + + @Override + public void clear() { + super.clear(); + offsetVector.clear(); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers = ObjectArrays.concat(offsetVector.getBuffers(false), super.getBuffers(false), ArrowBuf.class); + if (clear) { + // does not make much sense but we have to retain buffers even when clear is set. refactor this interface. + for (final ArrowBuf buffer:buffers) { + buffer.retain(1); + } + clear(); + } + return buffers; + } + + public long getOffsetAddr(){ + return offsetVector.getBuffer().memoryAddress(); + } + + public UInt${type.width}Vector getOffsetVector(){ + return offsetVector; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator){ + return new TransferImpl(getField(), allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator){ + return new TransferImpl(getField().withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((${minor.class}Vector) to); + } + + public void transferTo(${minor.class}Vector target){ + target.clear(); + this.offsetVector.transferTo(target.offsetVector); + target.data = data.transferOwnership(target.allocator).buffer; + target.data.writerIndex(data.writerIndex()); + clear(); + } + + public void splitAndTransferTo(int startIndex, int length, ${minor.class}Vector target) { + UInt${type.width}Vector.Accessor offsetVectorAccessor = this.offsetVector.getAccessor(); + final int startPoint = offsetVectorAccessor.get(startIndex); + final int sliceLength = offsetVectorAccessor.get(startIndex + length) - startPoint; + target.clear(); + target.offsetVector.allocateNew(length + 1); + offsetVectorAccessor = this.offsetVector.getAccessor(); + final UInt4Vector.Mutator targetOffsetVectorMutator = target.offsetVector.getMutator(); + for (int i = 0; i < length + 1; i++) { + targetOffsetVectorMutator.set(i, offsetVectorAccessor.get(startIndex + i) - startPoint); + } + target.data = data.slice(startPoint, sliceLength).transferOwnership(target.allocator).buffer; + target.getMutator().setValueCount(length); +} + + protected void copyFrom(int fromIndex, int thisIndex, ${minor.class}Vector from){ + final UInt4Vector.Accessor fromOffsetVectorAccessor = from.offsetVector.getAccessor(); + final int start = fromOffsetVectorAccessor.get(fromIndex); + final int end = fromOffsetVectorAccessor.get(fromIndex + 1); + final int len = end - start; + + final int outputStart = 
offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}); + from.data.getBytes(start, data, outputStart, len); + offsetVector.data.set${(minor.javaType!type.javaType)?cap_first}( (thisIndex+1) * ${type.width}, outputStart + len); + } + + public boolean copyFromSafe(int fromIndex, int thisIndex, ${minor.class}Vector from){ + final UInt${type.width}Vector.Accessor fromOffsetVectorAccessor = from.offsetVector.getAccessor(); + final int start = fromOffsetVectorAccessor.get(fromIndex); + final int end = fromOffsetVectorAccessor.get(fromIndex + 1); + final int len = end - start; + final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}); + + while(data.capacity() < outputStart + len) { + reAlloc(); + } + + offsetVector.getMutator().setSafe(thisIndex + 1, outputStart + len); + from.data.getBytes(start, data, outputStart, len); + return true; + } + + private class TransferImpl implements TransferPair{ + ${minor.class}Vector to; + + public TransferImpl(MaterializedField field, BufferAllocator allocator){ + to = new ${minor.class}Vector(field, allocator); + } + + public TransferImpl(${minor.class}Vector to){ + this.to = to; + } + + @Override + public ${minor.class}Vector getTo(){ + return to; + } + + @Override + public void transfer(){ + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + splitAndTransferTo(startIndex, length, to); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + to.copyFromSafe(fromIndex, toIndex, ${minor.class}Vector.this); + } + } + + @Override + public void setInitialCapacity(final int valueCount) { + final long size = 1L * valueCount * ${type.width}; + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + } + allocationSizeInBytes = (int)size; + offsetVector.setInitialCapacity(valueCount + 1); + } + + @Override + public void allocateNew() { + if(!allocateNewSafe()){ + throw new OutOfMemoryException("Failure while allocating buffer."); + } + } + + @Override + public boolean allocateNewSafe() { + long curAllocationSize = allocationSizeInBytes; + if (allocationMonitor > 10) { + curAllocationSize = Math.max(MIN_BYTE_COUNT, curAllocationSize / 2); + allocationMonitor = 0; + } else if (allocationMonitor < -2) { + curAllocationSize = curAllocationSize * 2L; + allocationMonitor = 0; + } + + if (curAllocationSize > MAX_ALLOCATION_SIZE) { + return false; + } + + clear(); + /* Boolean to keep track if all the memory allocations were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. 
If one of the allocations fails, we need to
+     * clear all the memory that we allocated.
+     */
+    try {
+      final int requestedSize = (int)curAllocationSize;
+      data = allocator.buffer(requestedSize);
+      allocationSizeInBytes = requestedSize;
+      offsetVector.allocateNew();
+    } catch (OutOfMemoryException e) {
+      clear();
+      return false;
+    }
+    data.readerIndex(0);
+    offsetVector.zeroVector();
+    return true;
+  }
+
+  @Override
+  public void allocateNew(int totalBytes, int valueCount) {
+    clear();
+    assert totalBytes >= 0;
+    try {
+      data = allocator.buffer(totalBytes);
+      offsetVector.allocateNew(valueCount + 1);
+    } catch (RuntimeException e) {
+      clear();
+      throw e;
+    }
+    data.readerIndex(0);
+    allocationSizeInBytes = totalBytes;
+    offsetVector.zeroVector();
+  }
+
+  @Override
+  public void reset() {
+    allocationSizeInBytes = INITIAL_BYTE_COUNT;
+    allocationMonitor = 0;
+    data.readerIndex(0);
+    offsetVector.zeroVector();
+    super.reset();
+  }
+
+  public void reAlloc() {
+    final long newAllocationSize = allocationSizeInBytes * 2L;
+    if (newAllocationSize > MAX_ALLOCATION_SIZE) {
+      throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached.");
+    }
+
+    final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize);
+    newBuf.setBytes(0, data, 0, data.capacity());
+    data.release();
+    data = newBuf;
+    allocationSizeInBytes = (int)newAllocationSize;
+  }
+
+  public void decrementAllocationMonitor() {
+    if (allocationMonitor > 0) {
+      allocationMonitor = 0;
+    }
+    --allocationMonitor;
+  }
+
+  private void incrementAllocationMonitor() {
+    ++allocationMonitor;
+  }
+
+  @Override
+  public Accessor getAccessor(){
+    return accessor;
+  }
+
+  @Override
+  public Mutator getMutator() {
+    return mutator;
+  }
+
+  public final class Accessor extends BaseValueVector.BaseAccessor implements VariableWidthAccessor {
+    final UInt${type.width}Vector.Accessor oAccessor = offsetVector.getAccessor();
+
+    public long getStartEnd(int index){
+      return oAccessor.getTwoAsLong(index);
+    }
+
+    public byte[] get(int index) {
+      assert index >= 0;
+      final int startIdx = oAccessor.get(index);
+      final int length = oAccessor.get(index + 1) - startIdx;
+      assert length >= 0;
+      final byte[] dst = new byte[length];
+      data.getBytes(startIdx, dst, 0, length);
+      return dst;
+    }
+
+    @Override
+    public int getValueLength(int index) {
+      final UInt${type.width}Vector.Accessor offsetVectorAccessor = offsetVector.getAccessor();
+      return offsetVectorAccessor.get(index + 1) - offsetVectorAccessor.get(index);
+    }
+
+    public void get(int index, ${minor.class}Holder holder){
+      holder.start = oAccessor.get(index);
+      holder.end = oAccessor.get(index + 1);
+      holder.buffer = data;
+    }
+
+    public void get(int index, Nullable${minor.class}Holder holder){
+      holder.isSet = 1;
+      holder.start = oAccessor.get(index);
+      holder.end = oAccessor.get(index + 1);
+      holder.buffer = data;
+    }
+
+    <#switch minor.class>
+    <#case "VarChar">
+    @Override
+    public ${friendlyType} getObject(int index) {
+      Text text = new Text();
+      text.set(get(index));
+      return text;
+    }
+    <#break>
+    <#case "Var16Char">
+    @Override
+    public ${friendlyType} getObject(int index) {
+      return new String(get(index), Charsets.UTF_16);
+    }
+    <#break>
+    <#default>
+    @Override
+    public ${friendlyType} getObject(int index) {
+      return get(index);
+    }
+    </#switch>
+
+    @Override
+    public int getValueCount() {
+      return Math.max(offsetVector.getAccessor().getValueCount() - 1, 0);
+    }
+
+    @Override
+    public boolean isNull(int index){
+      return false;
+    }
+
+    public UInt${type.width}Vector getOffsetVector(){
+      return offsetVector;
+    }
+  }
+
+  /**
+   * Mutable${minor.class} implements a vector of variable width values. Elements in the vector
+   * are accessed by position from the logical start of the vector. A fixed width offsetVector
+   * is used to convert an element's position to its offset from the start of the (0-based)
+   * ArrowBuf. The length of each element is inferred from the difference between two adjacent offsets.
+   * Each offset is ${type.width} byte(s) wide.
+   * The equivalent Java primitive is '${minor.javaType!type.javaType}'
+   *
+   * NB: this class is automatically generated from ValueVectorTypes.tdd using FreeMarker.
+   */
+  public final class Mutator extends BaseValueVector.BaseMutator implements VariableWidthVector.VariableWidthMutator {
+
+    /**
+     * Set the variable length element at the specified index to the supplied byte array.
+     *
+     * @param index   position of the element to set
+     * @param bytes   array of bytes to write
+     */
+    protected void set(int index, byte[] bytes) {
+      assert index >= 0;
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      offsetVector.getMutator().set(index + 1, currentOffset + bytes.length);
+      data.setBytes(currentOffset, bytes, 0, bytes.length);
+    }
+
+    public void setSafe(int index, byte[] bytes) {
+      assert index >= 0;
+
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      while (data.capacity() < currentOffset + bytes.length) {
+        reAlloc();
+      }
+      offsetVector.getMutator().setSafe(index + 1, currentOffset + bytes.length);
+      data.setBytes(currentOffset, bytes, 0, bytes.length);
+    }
+
+    /**
+     * Set the variable length element at the specified index to the supplied byte array.
+     *
+     * @param index   position of the element to set
+     * @param bytes   array of bytes to write
+     * @param start   start index of bytes to write
+     * @param length  length of bytes to write
+     */
+    protected void set(int index, byte[] bytes, int start, int length) {
+      assert index >= 0;
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      offsetVector.getMutator().set(index + 1, currentOffset + length);
+      data.setBytes(currentOffset, bytes, start, length);
+    }
+
+    public void setSafe(int index, ByteBuffer bytes, int start, int length) {
+      assert index >= 0;
+
+      int currentOffset = offsetVector.getAccessor().get(index);
+
+      while (data.capacity() < currentOffset + length) {
+        reAlloc();
+      }
+      offsetVector.getMutator().setSafe(index + 1, currentOffset + length);
+      data.setBytes(currentOffset, bytes, start, length);
+    }
+
+    public void setSafe(int index, byte[] bytes, int start, int length) {
+      assert index >= 0;
+
+      final int currentOffset = offsetVector.getAccessor().get(index);
+
+      while (data.capacity() < currentOffset + length) {
+        reAlloc();
+      }
+      offsetVector.getMutator().setSafe(index + 1, currentOffset + length);
+      data.setBytes(currentOffset, bytes, start, length);
+    }
+
+    @Override
+    public void setValueLengthSafe(int index, int length) {
+      final int offset = offsetVector.getAccessor().get(index);
+      while (data.capacity() < offset + length) {
+        reAlloc();
+      }
+      offsetVector.getMutator().setSafe(index + 1, offsetVector.getAccessor().get(index) + length);
+    }
+
+    public void setSafe(int index, int start, int end, ArrowBuf buffer){
+      final int len = end - start;
+      final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width});
+
+      while (data.capacity() < outputStart + len) {
+        reAlloc();
+      }
+
+      offsetVector.getMutator().setSafe(index + 1, outputStart + len);
+      buffer.getBytes(start, data, outputStart, len);
+    }
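+
+    /* Illustrative sketch (not part of the generated API surface): writing two
+     * values through this Mutator and reading them back via the offset vector.
+     * The vector "v" is assumed to be allocated by the caller.
+     *
+     *   v.getMutator().setSafe(0, "ab".getBytes());   // offsets become [0, 2]
+     *   v.getMutator().setSafe(1, "cde".getBytes());  // offsets become [0, 2, 5]
+     *   v.getMutator().setValueCount(2);
+     *   v.getAccessor().get(1);                       // returns the bytes of "cde"
+     */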
+    public void setSafe(int index, Nullable${minor.class}Holder holder){
+      assert holder.isSet == 1;
+
+      final int start = holder.start;
+      final int end = holder.end;
+      final int len = end - start;
+
+      int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width});
+
+      while (data.capacity() < outputStart + len) {
+        reAlloc();
+      }
+
+      holder.buffer.getBytes(start, data, outputStart, len);
+      offsetVector.getMutator().setSafe(index + 1, outputStart + len);
+    }
+
+    public void setSafe(int index, ${minor.class}Holder holder){
+      final int start = holder.start;
+      final int end = holder.end;
+      final int len = end - start;
+      final int outputStart = offsetVector.data.get${(minor.javaType!type.javaType)?cap_first}(index * ${type.width});
+
+      while (data.capacity() < outputStart + len) {
+        reAlloc();
+      }
+
+      holder.buffer.getBytes(start, data, outputStart, len);
+      offsetVector.getMutator().setSafe(index + 1, outputStart + len);
+    }
+
+    protected void set(int index, int start, int length, ArrowBuf buffer){
+      assert index >= 0;
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      offsetVector.getMutator().set(index + 1, currentOffset + length);
+      final ArrowBuf bb = buffer.slice(start, length);
+      data.setBytes(currentOffset, bb);
+    }
+
+    protected void set(int index, Nullable${minor.class}Holder holder){
+      final int length = holder.end - holder.start;
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      offsetVector.getMutator().set(index + 1, currentOffset + length);
+      data.setBytes(currentOffset, holder.buffer, holder.start, length);
+    }
+
+    protected void set(int index, ${minor.class}Holder holder){
+      final int length = holder.end - holder.start;
+      final int currentOffset = offsetVector.getAccessor().get(index);
+      offsetVector.getMutator().set(index + 1, currentOffset + length);
+      data.setBytes(currentOffset, holder.buffer, holder.start, length);
+    }
+
+    @Override
+    public void setValueCount(int valueCount) {
+      final int currentByteCapacity = getByteCapacity();
+      final int idx = offsetVector.getAccessor().get(valueCount);
+      data.writerIndex(idx);
+      if (valueCount > 0 && currentByteCapacity > idx * 2) {
+        incrementAllocationMonitor();
+      } else if (allocationMonitor > 0) {
+        allocationMonitor = 0;
+      }
+      VectorTrimmer.trim(data, idx);
+      offsetVector.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount + 1);
+    }
+
+    @Override
+    public void generateTestData(int size){
+      boolean even = true;
+      <#switch minor.class>
+      <#case "Var16Char">
+      final java.nio.charset.Charset charset = Charsets.UTF_16;
+      <#break>
+      <#case "VarChar">
+      <#default>
+      final java.nio.charset.Charset charset = Charsets.UTF_8;
+      </#switch>
+      final byte[] evenValue = "aaaaa".getBytes(charset);
+      final byte[] oddValue = "bbbbbbbbbb".getBytes(charset);
+      for (int i = 0; i < size; i++, even = !even){
+        set(i, even ? evenValue : oddValue);
+      }
+      setValueCount(size);
+    }
+  }
+}
+
+</#if> <#-- type.major -->
+</#list>
+</#list>
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java
new file mode 100644
index 00000000000..388eb9c4479
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/AddOrGetResult.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import com.google.common.base.Preconditions;
+
+public class AddOrGetResult<V extends ValueVector> {
+  private final V vector;
+  private final boolean created;
+
+  public AddOrGetResult(V vector, boolean created) {
+    this.vector = Preconditions.checkNotNull(vector);
+    this.created = created;
+  }
+
+  public V getVector() {
+    return vector;
+  }
+
+  public boolean isCreated() {
+    return created;
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java
new file mode 100644
index 00000000000..54c3cd7331e
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import org.apache.arrow.vector.complex.RepeatedFixedWidthVectorLike;
+import org.apache.arrow.vector.complex.RepeatedVariableWidthVectorLike;
+
+public class AllocationHelper {
+//  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AllocationHelper.class);
+
+  public static void allocate(ValueVector v, int valueCount, int bytesPerValue) {
+    allocate(v, valueCount, bytesPerValue, 5);
+  }
+
+  public static void allocatePrecomputedChildCount(ValueVector v, int valueCount, int bytesPerValue, int childValCount){
+    if (v instanceof FixedWidthVector) {
+      ((FixedWidthVector) v).allocateNew(valueCount);
+    } else if (v instanceof VariableWidthVector) {
+      ((VariableWidthVector) v).allocateNew(valueCount * bytesPerValue, valueCount);
+    } else if (v instanceof RepeatedFixedWidthVectorLike) {
+      ((RepeatedFixedWidthVectorLike) v).allocateNew(valueCount, childValCount);
+    } else if (v instanceof RepeatedVariableWidthVectorLike) {
+      ((RepeatedVariableWidthVectorLike) v).allocateNew(childValCount * bytesPerValue, valueCount, childValCount);
+    } else {
+      v.allocateNew();
+    }
+  }
+
+  public static void allocate(ValueVector v, int valueCount, int bytesPerValue, int repeatedPerTop){
+    allocatePrecomputedChildCount(v, valueCount, bytesPerValue, repeatedPerTop * valueCount);
+  }
+
+  /**
+   * Allocates the exact amount if v is fixed width, otherwise falls back to dynamic allocation
+   * @param v value vector we are trying to allocate
+   * @param valueCount size we are trying to allocate
+   * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory
+   */
+  public static void allocateNew(ValueVector v, int valueCount) {
+    if (v instanceof FixedWidthVector) {
+      ((FixedWidthVector) v).allocateNew(valueCount);
+    } else {
+      v.allocateNew();
+    }
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java
new file mode 100644
index 00000000000..b129ea9bcb9
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import io.netty.buffer.ArrowBuf;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.types.MaterializedField;
+
+
+public abstract class BaseDataValueVector extends BaseValueVector {
+
+  protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this
+
+  protected ArrowBuf data;
+
+  public BaseDataValueVector(MaterializedField field, BufferAllocator allocator) {
+    super(field, allocator);
+    data = allocator.getEmpty();
+  }
+
+  @Override
+  public void clear() {
+    if (data != null) {
+      data.release();
+    }
+    data = allocator.getEmpty();
+    super.clear();
+  }
+
+  @Override
+  public void close() {
+    clear();
+    if (data != null) {
+      data.release();
+      data = null;
+    }
+    super.close();
+  }
+
+  @Override
+  public ArrowBuf[] getBuffers(boolean clear) {
+    ArrowBuf[] out;
+    if (getBufferSize() == 0) {
+      out = new ArrowBuf[0];
+    } else {
+      out = new ArrowBuf[]{data};
+      data.readerIndex(0);
+      if (clear) {
+        data.retain(1);
+      }
+    }
+    if (clear) {
+      clear();
+    }
+    return out;
+  }
+
+  @Override
+  public int getBufferSize() {
+    if (getAccessor().getValueCount() == 0) {
+      return 0;
+    }
+    return data.writerIndex();
+  }
+
+  public ArrowBuf getBuffer() {
+    return data;
+  }
+
+  /**
+   * This method has a similar effect to allocateNew(), without actually clearing and
+   * reallocating the value vector. The purpose is to move the value vector to a "mutate" state.
+   */
+  public void reset() {}
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
new file mode 100644
index 00000000000..8bca3c00537
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import io.netty.buffer.ArrowBuf;
+
+import java.util.Iterator;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Iterators;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.types.MaterializedField;
+import org.apache.arrow.vector.util.TransferPair;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public abstract class BaseValueVector implements ValueVector {
+  private static final Logger logger = LoggerFactory.getLogger(BaseValueVector.class);
+
+  public static final int MAX_ALLOCATION_SIZE = Integer.MAX_VALUE;
+  public static final int INITIAL_VALUE_ALLOCATION = 4096;
+
+  protected final BufferAllocator allocator;
+  protected final MaterializedField field;
+
+  protected BaseValueVector(MaterializedField field, BufferAllocator allocator) {
+    this.field = Preconditions.checkNotNull(field, "field cannot be null");
+    this.allocator = Preconditions.checkNotNull(allocator, "allocator cannot be null");
+  }
+
+  @Override
+  public String toString() {
+    return super.toString() + "[field = " + field + ", ...]";
+  }
+
+  @Override
+  public void clear() {
+    getMutator().reset();
+  }
+
+  @Override
+  public void close() {
+    clear();
+  }
+
+  @Override
+  public MaterializedField getField() {
+    return field;
+  }
+
+  public MaterializedField getField(String ref){
+    return getField().withPath(ref);
+  }
+
+  @Override
+  public TransferPair getTransferPair(BufferAllocator allocator) {
+    return getTransferPair(getField().getPath(), allocator);
+  }
+
+//  public static SerializedField getMetadata(BaseValueVector vector) {
+//    return getMetadataBuilder(vector).build();
+//  }
+//
+//  protected static SerializedField.Builder getMetadataBuilder(BaseValueVector vector) {
+//    return SerializedFieldHelper.getAsBuilder(vector.getField())
+//        .setValueCount(vector.getAccessor().getValueCount())
+//        .setBufferLength(vector.getBufferSize());
+//  }
+
+  public abstract static class BaseAccessor implements ValueVector.Accessor {
+    protected BaseAccessor() { }
+
+    @Override
+    public boolean isNull(int index) {
+      return false;
+    }
+  }
+
+  public abstract static class BaseMutator implements ValueVector.Mutator {
+    protected BaseMutator() { }
+
+    @Override
+    public void generateTestData(int values) {}
+
+    //TODO: consider making the mutator stateless (if possible) in another issue.
+    public void reset() {}
+  }
+
+  @Override
+  public Iterator<ValueVector> iterator() {
+    return Iterators.emptyIterator();
+  }
+
+  public static boolean checkBufRefs(final ValueVector vv) {
+    for (final ArrowBuf buffer : vv.getBuffers(false)) {
+      if (buffer.refCnt() <= 0) {
+        throw new IllegalStateException("zero refcount");
+      }
+    }
+
+    return true;
+  }
+
+  @Override
+  public BufferAllocator getAllocator() {
+    return allocator;
+  }
}

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
new file mode 100644
index 00000000000..952e9028e06
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java
@@ -0,0 +1,450 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.complex.impl.BitReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.BitHolder; +import org.apache.arrow.vector.holders.NullableBitHolder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.util.TransferPair; + +/** + * Bit implements a vector of bit-width values. Elements in the vector are accessed by position from the logical start + * of the vector. The width of each element is 1 bit. The equivalent Java primitive is an int containing the value '0' + * or '1'. + */ +public final class BitVector extends BaseDataValueVector implements FixedWidthVector { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BitVector.class); + + private final FieldReader reader = new BitReaderImpl(BitVector.this); + private final Accessor accessor = new Accessor(); + private final Mutator mutator = new Mutator(); + + private int valueCount; + private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; + private int allocationMonitor = 0; + + public BitVector(MaterializedField field, BufferAllocator allocator) { + super(field, allocator); + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public int getBufferSize() { + return getSizeFromCount(valueCount); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + return getSizeFromCount(valueCount); + } + + private int getSizeFromCount(int valueCount) { + return (int) Math.ceil(valueCount / 8.0); + } + + @Override + public int getValueCapacity() { + return (int)Math.min((long)Integer.MAX_VALUE, data.capacity() * 8L); + } + + private int getByteIndex(int index) { + return (int) Math.floor(index / 8.0); + } + + @Override + public void setInitialCapacity(final int valueCount) { + allocationSizeInBytes = getSizeFromCount(valueCount); + } + + @Override + public void allocateNew() { + if (!allocateNewSafe()) { + throw new OutOfMemoryException(); + } + } + + @Override + public boolean allocateNewSafe() { + long curAllocationSize = allocationSizeInBytes; + if (allocationMonitor > 10) { + curAllocationSize = Math.max(8, allocationSizeInBytes / 2); + allocationMonitor = 0; + } else if (allocationMonitor < -2) { + curAllocationSize = allocationSizeInBytes * 2L; + allocationMonitor = 0; + } + + try { + allocateBytes(curAllocationSize); + } catch (OutOfMemoryException ex) { + return false; + } + return true; + } + + @Override + public void reset() { + valueCount = 0; + allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; + allocationMonitor = 0; + zeroVector(); + super.reset(); + } + + /** + * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. + * + * @param valueCount + * The number of values which can be contained within this vector. 
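+   *          For example (illustrative only), {@code allocateNew(4096)} reserves
+   *          ceil(4096 / 8.0) = 512 bytes in the underlying buffer.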
+ */ + @Override + public void allocateNew(int valueCount) { + final int size = getSizeFromCount(valueCount); + allocateBytes(size); + } + + private void allocateBytes(final long size) { + if (size > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + } + + final int curSize = (int) size; + clear(); + data = allocator.buffer(curSize); + zeroVector(); + allocationSizeInBytes = curSize; + } + + /** + * Allocate new buffer with double capacity, and copy data into the new buffer. Replace vector's buffer with new buffer, and release old one + */ + public void reAlloc() { + final long newAllocationSize = allocationSizeInBytes * 2L; + if (newAllocationSize > MAX_ALLOCATION_SIZE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed allocation size"); + } + + final int curSize = (int)newAllocationSize; + final ArrowBuf newBuf = allocator.buffer(curSize); + newBuf.setZero(0, newBuf.capacity()); + newBuf.setBytes(0, data, 0, data.capacity()); + data.release(); + data = newBuf; + allocationSizeInBytes = curSize; + } + + /** + * {@inheritDoc} + */ + @Override + public void zeroVector() { + data.setZero(0, data.capacity()); + } + + public void copyFrom(int inIndex, int outIndex, BitVector from) { + this.mutator.set(outIndex, from.accessor.get(inIndex)); + } + + public boolean copyFromSafe(int inIndex, int outIndex, BitVector from) { + if (outIndex >= this.getValueCapacity()) { + decrementAllocationMonitor(); + return false; + } + copyFrom(inIndex, outIndex, from); + return true; + } + +// @Override +// public void load(SerializedField metadata, DrillBuf buffer) { +// Preconditions.checkArgument(this.field.getPath().equals(metadata.getNamePart().getName()), "The field %s doesn't match the provided metadata %s.", this.field, metadata); +// final int valueCount = metadata.getValueCount(); +// final int expectedLength = getSizeFromCount(valueCount); +// final int actualLength = metadata.getBufferLength(); +// assert expectedLength == actualLength: "expected and actual buffer sizes do not match"; +// +// clear(); +// data = buffer.slice(0, actualLength); +// data.retain(); +// this.valueCount = valueCount; +// } + + @Override + public Mutator getMutator() { + return new Mutator(); + } + + @Override + public Accessor getAccessor() { + return new Accessor(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new TransferImpl(getField(), allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new TransferImpl(getField().withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new TransferImpl((BitVector) to); + } + + + public void transferTo(BitVector target) { + target.clear(); + if (target.data != null) { + target.data.release(); + } + target.data = data; + target.data.retain(1); + target.valueCount = valueCount; + clear(); + } + + public void splitAndTransferTo(int startIndex, int length, BitVector target) { + assert startIndex + length <= valueCount; + int firstByte = getByteIndex(startIndex); + int byteSize = getSizeFromCount(length); + int offset = startIndex % 8; + if (offset == 0) { + target.clear(); + // slice + if (target.data != null) { + target.data.release(); + } + target.data = (ArrowBuf) data.slice(firstByte, byteSize); + target.data.retain(1); + } else { + // Copy data + // When the first bit starts from the 
middle of a byte (offset != 0), copy data from the source BitVector.
+      // Each byte in the target is composed of a part of the i-th byte and a part of the (i+1)-th byte.
+      // The last byte copied to the target is a bit tricky:
+      // 1) if the length is not a multiple of 8 (length % 8 != 0), copy the remaining bits only.
+      // 2) otherwise, copy the last byte in the same way as the prior bytes.
+      target.clear();
+      target.allocateNew(length);
+      // TODO maybe do this one word at a time, rather than byte?
+      for (int i = 0; i < byteSize - 1; i++) {
+        target.data.setByte(i, (((this.data.getByte(firstByte + i) & 0xFF) >>> offset) + (this.data.getByte(firstByte + i + 1) << (8 - offset))));
+      }
+      if (length % 8 != 0) {
+        target.data.setByte(byteSize - 1, ((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset));
+      } else {
+        target.data.setByte(byteSize - 1,
+            (((this.data.getByte(firstByte + byteSize - 1) & 0xFF) >>> offset) + (this.data.getByte(firstByte + byteSize) << (8 - offset))));
+      }
+    }
+    target.getMutator().setValueCount(length);
+  }
+
+  private class TransferImpl implements TransferPair {
+    BitVector to;
+
+    public TransferImpl(MaterializedField field, BufferAllocator allocator) {
+      this.to = new BitVector(field, allocator);
+    }
+
+    public TransferImpl(BitVector to) {
+      this.to = to;
+    }
+
+    @Override
+    public BitVector getTo() {
+      return to;
+    }
+
+    @Override
+    public void transfer() {
+      transferTo(to);
+    }
+
+    @Override
+    public void splitAndTransfer(int startIndex, int length) {
+      splitAndTransferTo(startIndex, length, to);
+    }
+
+    @Override
+    public void copyValueSafe(int fromIndex, int toIndex) {
+      to.copyFromSafe(fromIndex, toIndex, BitVector.this);
+    }
+  }
+
+  private void decrementAllocationMonitor() {
+    if (allocationMonitor > 0) {
+      allocationMonitor = 0;
+    }
+    --allocationMonitor;
+  }
+
+  private void incrementAllocationMonitor() {
+    ++allocationMonitor;
+  }
+
+  public class Accessor extends BaseAccessor {
+
+    /**
+     * Get the byte holding the desired bit, then mask all other bits. Iff the result is 0, the bit was not set.
+     *
+     * @param index
+     *          position of the bit in the vector
+     * @return 1 if set, otherwise 0
+     */
+    public final int get(int index) {
+      int byteIndex = index >> 3;
+      byte b = data.getByte(byteIndex);
+      int bitIndex = index & 7;
+      return Long.bitCount(b & (1L << bitIndex));
+    }
+
+    @Override
+    public boolean isNull(int index) {
+      return false;
+    }
+
+    @Override
+    public final Boolean getObject(int index) {
+      return get(index) != 0;
+    }
+
+    @Override
+    public final int getValueCount() {
+      return valueCount;
+    }
+
+    public final void get(int index, BitHolder holder) {
+      holder.value = get(index);
+    }
+
+    public final void get(int index, NullableBitHolder holder) {
+      holder.isSet = 1;
+      holder.value = get(index);
+    }
+  }
+
+  /**
+   * MutableBit implements a vector of bit-width values. Elements in the vector are accessed by position from the
+   * logical start of the vector. Values should be pushed onto the vector sequentially, but may be randomly accessed.
+   *
+   * NB: this class is automatically generated from ValueVectorTypes.tdd using FreeMarker.
+   */
+  public class Mutator extends BaseMutator {
+
+    private Mutator() {
+    }
+
+    /**
+     * Set the bit at the given index to the specified value.
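+     *
+     * For example (illustrative only), {@code set(10, 1)} turns on bit 2 of byte 1,
+     * since byteIndex = 10 >> 3 = 1 and bitIndex = 10 & 7 = 2.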
+ * + * @param index + * position of the bit to set + * @param value + * value to set (either 1 or 0) + */ + public final void set(int index, int value) { + int byteIndex = index >> 3; + int bitIndex = index & 7; + byte currentByte = data.getByte(byteIndex); + byte bitMask = (byte) (1L << bitIndex); + if (value != 0) { + currentByte |= bitMask; + } else { + currentByte -= (bitMask & currentByte); + } + + data.setByte(byteIndex, currentByte); + } + + public final void set(int index, BitHolder holder) { + set(index, holder.value); + } + + final void set(int index, NullableBitHolder holder) { + set(index, holder.value); + } + + public void setSafe(int index, int value) { + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, value); + } + + public void setSafe(int index, BitHolder holder) { + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, holder.value); + } + + public void setSafe(int index, NullableBitHolder holder) { + while(index >= getValueCapacity()) { + reAlloc(); + } + set(index, holder.value); + } + + @Override + public final void setValueCount(int valueCount) { + int currentValueCapacity = getValueCapacity(); + BitVector.this.valueCount = valueCount; + int idx = getSizeFromCount(valueCount); + while(valueCount > getValueCapacity()) { + reAlloc(); + } + if (valueCount > 0 && currentValueCapacity > valueCount * 2) { + incrementAllocationMonitor(); + } else if (allocationMonitor > 0) { + allocationMonitor = 0; + } + VectorTrimmer.trim(data, idx); + } + + @Override + public final void generateTestData(int values) { + boolean even = true; + for(int i = 0; i < values; i++, even = !even) { + if (even) { + set(i, 1); + } + } + setValueCount(values); + } + + } + + @Override + public void clear() { + this.valueCount = 0; + super.clear(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java new file mode 100644 index 00000000000..59057000bbc --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/FixedWidthVector.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + + +public interface FixedWidthVector extends ValueVector{ + + /** + * Allocate a new memory space for this vector. Must be called prior to using the ValueVector. + * + * @param valueCount Number of values in the vector. + */ + void allocateNew(int valueCount); + +/** + * Zero out the underlying buffer backing this vector. 
+ */ + void zeroVector(); + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java new file mode 100644 index 00000000000..00c33fc2d6e --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java @@ -0,0 +1,23 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +public interface NullableVector extends ValueVector{ + + ValueVector getValuesVector(); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java new file mode 100644 index 00000000000..b819c5d39e9 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVectorDefinitionSetter.java @@ -0,0 +1,23 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +public interface NullableVectorDefinitionSetter { + + public void setIndexDefined(int index); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java new file mode 100644 index 00000000000..b806b180e70 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import io.netty.buffer.ArrowBuf;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.holders.ObjectHolder;
+import org.apache.arrow.vector.types.MaterializedField;
+import org.apache.arrow.vector.util.TransferPair;
+
+public class ObjectVector extends BaseValueVector {
+  private final Accessor accessor = new Accessor();
+  private final Mutator mutator = new Mutator();
+  private int maxCount = 0;
+  private int count = 0;
+  private int allocationSize = 4096;
+
+  private List<Object[]> objectArrayList = new ArrayList<>();
+
+  public ObjectVector(MaterializedField field, BufferAllocator allocator) {
+    super(field, allocator);
+  }
+
+  public void addNewArray() {
+    objectArrayList.add(new Object[allocationSize]);
+    maxCount += allocationSize;
+  }
+
+  @Override
+  public FieldReader getReader() {
+    throw new UnsupportedOperationException("ObjectVector does not support this");
+  }
+
+  public final class Mutator implements ValueVector.Mutator {
+
+    public void set(int index, Object obj) {
+      int listOffset = index / allocationSize;
+      if (listOffset >= objectArrayList.size()) {
+        addNewArray();
+      }
+      objectArrayList.get(listOffset)[index % allocationSize] = obj;
+    }
+
+    public boolean setSafe(int index, long value) {
+      set(index, value);
+      return true;
+    }
+
+    protected void set(int index, ObjectHolder holder) {
+      set(index, holder.obj);
+    }
+
+    public boolean setSafe(int index, ObjectHolder holder){
+      set(index, holder);
+      return true;
+    }
+
+    @Override
+    public void setValueCount(int valueCount) {
+      count = valueCount;
+    }
+
+    @Override
+    public void reset() {
+      count = 0;
+      maxCount = 0;
+      objectArrayList = new ArrayList<>();
+      addNewArray();
+    }
+
+    @Override
+    public void generateTestData(int values) {
+    }
+  }
+
+  @Override
+  public void setInitialCapacity(int numRecords) {
+    // NoOp
+  }
+
+  @Override
+  public void allocateNew() throws OutOfMemoryException {
+    addNewArray();
+  }
+
+  public void allocateNew(int valueCount) throws OutOfMemoryException {
+    while (maxCount < valueCount) {
+      addNewArray();
+    }
+  }
+
+  @Override
+  public boolean allocateNewSafe() {
+    allocateNew();
+    return true;
+  }
+
+  @Override
+  public int getBufferSize() {
+    throw new UnsupportedOperationException("ObjectVector does not support this");
+  }
+
+  @Override
+  public int getBufferSizeFor(final int valueCount) {
+    throw new UnsupportedOperationException("ObjectVector does not support this");
+  }
+
+  @Override
+  public void close() {
+    clear();
+  }
+
+  @Override
+  public void clear() {
+    objectArrayList.clear();
+    maxCount = 0;
+    count = 0;
+  }
+
+  @Override
+  public MaterializedField getField() {
+    return field;
+  }
+
+  @Override
+  public TransferPair getTransferPair(BufferAllocator allocator) {
+    throw new UnsupportedOperationException("ObjectVector does not support this");
+  }
+
+  @Override
+  public TransferPair makeTransferPair(ValueVector to) {
throw new UnsupportedOperationException("ObjectVector does not support this"); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + throw new UnsupportedOperationException("ObjectVector does not support this"); + } + + @Override + public int getValueCapacity() { + return maxCount; + } + + @Override + public Accessor getAccessor() { + return accessor; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + throw new UnsupportedOperationException("ObjectVector does not support this"); + } + +// @Override +// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { +// throw new UnsupportedOperationException("ObjectVector does not support this"); +// } +// +// @Override +// public UserBitShared.SerializedField getMetadata() { +// throw new UnsupportedOperationException("ObjectVector does not support this"); +// } + + @Override + public Mutator getMutator() { + return mutator; + } + + @Override + public Iterator iterator() { + throw new UnsupportedOperationException("ObjectVector does not support this"); + } + + public final class Accessor extends BaseAccessor { + @Override + public Object getObject(int index) { + int listOffset = index / allocationSize; + if (listOffset >= objectArrayList.size()) { + addNewArray(); + } + return objectArrayList.get(listOffset)[index % allocationSize]; + } + + @Override + public int getValueCount() { + return count; + } + + public Object get(int index) { + return getObject(index); + } + + public void get(int index, ObjectHolder holder){ + holder.obj = getObject(index); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java new file mode 100644 index 00000000000..fc0a066749a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/SchemaChangeCallBack.java @@ -0,0 +1,52 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.vector.util.CallBack; + + +public class SchemaChangeCallBack implements CallBack { + private boolean schemaChanged = false; + + /** + * Constructs a schema-change callback with the schema-changed state set to + * {@code false}. + */ + public SchemaChangeCallBack() { + } + + /** + * Sets the schema-changed state to {@code true}. + */ + @Override + public void doWork() { + schemaChanged = true; + } + + /** + * Returns the value of schema-changed state, resetting the + * schema-changed state to {@code false}. 
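+   *
+   * Illustrative usage (the surrounding container and rebuild step are assumed,
+   * not part of this class):
+   * <pre>
+   *   SchemaChangeCallBack callBack = new SchemaChangeCallBack();
+   *   // ... hand callBack to child vectors, which invoke doWork() on change ...
+   *   if (callBack.getSchemaChangedAndReset()) {
+   *     // rebuild the schema (hypothetical response)
+   *   }
+   * </pre>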
+ */ + public boolean getSchemaChangedAndReset() { + final boolean current = schemaChanged; + schemaChanged = false; + return current; + } +} + diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java new file mode 100644 index 00000000000..61ce285d61b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java @@ -0,0 +1,203 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; + +import java.math.BigDecimal; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.holders.BigIntHolder; +import org.apache.arrow.vector.holders.BitHolder; +import org.apache.arrow.vector.holders.DateHolder; +import org.apache.arrow.vector.holders.Decimal18Holder; +import org.apache.arrow.vector.holders.Decimal28SparseHolder; +import org.apache.arrow.vector.holders.Decimal38SparseHolder; +import org.apache.arrow.vector.holders.Decimal9Holder; +import org.apache.arrow.vector.holders.Float4Holder; +import org.apache.arrow.vector.holders.Float8Holder; +import org.apache.arrow.vector.holders.IntHolder; +import org.apache.arrow.vector.holders.IntervalDayHolder; +import org.apache.arrow.vector.holders.IntervalYearHolder; +import org.apache.arrow.vector.holders.NullableBitHolder; +import org.apache.arrow.vector.holders.TimeHolder; +import org.apache.arrow.vector.holders.TimeStampHolder; +import org.apache.arrow.vector.holders.VarCharHolder; +import org.apache.arrow.vector.util.DecimalUtility; + +import com.google.common.base.Charsets; + + +public class ValueHolderHelper { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ValueHolderHelper.class); + + public static IntHolder getIntHolder(int value) { + IntHolder holder = new IntHolder(); + holder.value = value; + + return holder; + } + + public static BigIntHolder getBigIntHolder(long value) { + BigIntHolder holder = new BigIntHolder(); + holder.value = value; + + return holder; + } + + public static Float4Holder getFloat4Holder(float value) { + Float4Holder holder = new Float4Holder(); + holder.value = value; + + return holder; + } + + public static Float8Holder getFloat8Holder(double value) { + Float8Holder holder = new Float8Holder(); + holder.value = value; + + return holder; + } + + public static DateHolder getDateHolder(long value) { + DateHolder holder = new DateHolder(); + holder.value = value; + return holder; + } + + public static TimeHolder getTimeHolder(int value) { + TimeHolder holder = new TimeHolder(); + holder.value = value; + return holder; + } + + public static TimeStampHolder getTimeStampHolder(long value) { + TimeStampHolder holder = new 
TimeStampHolder();
+    holder.value = value;
+    return holder;
+  }
+
+  public static BitHolder getBitHolder(int value) {
+    BitHolder holder = new BitHolder();
+    holder.value = value;
+
+    return holder;
+  }
+
+  public static NullableBitHolder getNullableBitHolder(boolean isNull, int value) {
+    NullableBitHolder holder = new NullableBitHolder();
+    holder.isSet = isNull ? 0 : 1;
+    if (!isNull) {
+      holder.value = value;
+    }
+
+    return holder;
+  }
+
+  public static VarCharHolder getVarCharHolder(ArrowBuf buf, String s){
+    VarCharHolder vch = new VarCharHolder();
+
+    byte[] b = s.getBytes(Charsets.UTF_8);
+    vch.start = 0;
+    vch.end = b.length;
+    vch.buffer = buf.reallocIfNeeded(b.length);
+    vch.buffer.setBytes(0, b);
+    return vch;
+  }
+
+  public static VarCharHolder getVarCharHolder(BufferAllocator a, String s){
+    VarCharHolder vch = new VarCharHolder();
+
+    byte[] b = s.getBytes(Charsets.UTF_8);
+    vch.start = 0;
+    vch.end = b.length;
+    vch.buffer = a.buffer(b.length); //
+    vch.buffer.setBytes(0, b);
+    return vch;
+  }
+
+  public static IntervalYearHolder getIntervalYearHolder(int intervalYear) {
+    IntervalYearHolder holder = new IntervalYearHolder();
+
+    holder.value = intervalYear;
+    return holder;
+  }
+
+  public static IntervalDayHolder getIntervalDayHolder(int days, int millis) {
+    IntervalDayHolder dch = new IntervalDayHolder();
+
+    dch.days = days;
+    dch.milliseconds = millis;
+    return dch;
+  }
+
+  public static Decimal9Holder getDecimal9Holder(int decimal, int scale, int precision) {
+    Decimal9Holder dch = new Decimal9Holder();
+
+    dch.scale = scale;
+    dch.precision = precision;
+    dch.value = decimal;
+
+    return dch;
+  }
+
+  public static Decimal18Holder getDecimal18Holder(long decimal, int scale, int precision) {
+    Decimal18Holder dch = new Decimal18Holder();
+
+    dch.scale = scale;
+    dch.precision = precision;
+    dch.value = decimal;
+
+    return dch;
+  }
+
+  public static Decimal28SparseHolder getDecimal28Holder(ArrowBuf buf, String decimal) {
+
+    Decimal28SparseHolder dch = new Decimal28SparseHolder();
+
+    BigDecimal bigDecimal = new BigDecimal(decimal);
+
+    dch.scale = bigDecimal.scale();
+    dch.precision = bigDecimal.precision();
+    dch.start = 0;
+    dch.buffer = buf.reallocIfNeeded(5 * DecimalUtility.INTEGER_SIZE);
+    // The buffer must be allocated before the sign bit can be written into it.
+    Decimal28SparseHolder.setSign(bigDecimal.signum() == -1, dch.start, dch.buffer);
+    DecimalUtility
+        .getSparseFromBigDecimal(bigDecimal, dch.buffer, dch.start, dch.scale, dch.precision, dch.nDecimalDigits);
+
+    return dch;
+  }
+
+  public static Decimal38SparseHolder getDecimal38Holder(ArrowBuf buf, String decimal) {
+
+    Decimal38SparseHolder dch = new Decimal38SparseHolder();
+
+    BigDecimal bigDecimal = new BigDecimal(decimal);
+
+    dch.scale = bigDecimal.scale();
+    dch.precision = bigDecimal.precision();
+    dch.start = 0;
+    dch.buffer = buf.reallocIfNeeded(dch.maxPrecision * DecimalUtility.INTEGER_SIZE);
+    // The buffer must be allocated before the sign bit can be written into it.
+    Decimal38SparseHolder.setSign(bigDecimal.signum() == -1, dch.start, dch.buffer);
+    DecimalUtility
+        .getSparseFromBigDecimal(bigDecimal, dch.buffer, dch.start, dch.scale, dch.precision, dch.nDecimalDigits);
+
+    return dch;
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
new file mode 100644
index 00000000000..c05f0e7c50f
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -0,0 +1,222 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import java.io.Closeable;
+
+import io.netty.buffer.ArrowBuf;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.OutOfMemoryException;
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.types.MaterializedField;
+import org.apache.arrow.vector.util.TransferPair;
+
+/**
+ * An abstraction that is used to store a sequence of values in an individual column.
+ *
+ * A {@link ValueVector value vector} stores underlying data in-memory in a columnar fashion that is compact and
+ * efficient. The column whose data is stored is referred to by {@link #getField()}.
+ *
+ * A vector, when instantiated, relies on a dead (empty) buffer. It is important
+ * that the vector is allocated before attempting to read or write.
+ *
+ * There are a few "rules" around vectors:
+ *
+ * <ul>
+ *   <li>values need to be written in order (e.g. index 0, 1, 2, 5)</li>
+ *   <li>null vectors start with all values as null before writing anything</li>
+ *   <li>for variable width types, the offset vector should be all zeros before writing</li>
+ *   <li>you must call setValueCount before a vector can be read</li>
+ *   <li>you should never write to a vector once it has been read.</li>
+ * </ul>
+ *
+ * Please note that the current implementation doesn't enforce those rules, hence we may find a few places that
+ * deviate from these rules (e.g. offset vectors in Variable Length and Repeated vectors).
+ *
+ * This interface "should" strive to guarantee this order of operation:
+ * <blockquote>
+ * allocate > mutate > setvaluecount > access > clear (or allocate to start the process over).
+ * </blockquote>
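+ *
+ * For example (an illustrative sketch only; IntVector is just one concrete
+ * implementation, and none of the names below are defined in this file):
+ * <pre>
+ *   vector.allocateNew();                         // allocate
+ *   vector.getMutator().setSafe(0, 42);           // mutate
+ *   vector.getMutator().setValueCount(1);         // set value count
+ *   Object v = vector.getAccessor().getObject(0); // access
+ *   vector.clear();                               // clear, ready to start over
+ * </pre>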
+ */
+public interface ValueVector extends Closeable, Iterable<ValueVector> {
+  /**
+   * Allocate new buffers. ValueVector implements logic to determine how much to allocate.
+   * @throws OutOfMemoryException Thrown if no memory can be allocated.
+   */
+  void allocateNew() throws OutOfMemoryException;
+
+  /**
+   * Allocates new buffers. ValueVector implements logic to determine how much to allocate.
+   * @return Returns true if allocation was successful.
+   */
+  boolean allocateNewSafe();
+
+  BufferAllocator getAllocator();
+
+  /**
+   * Set the initial record capacity.
+   * @param numRecords
+   */
+  void setInitialCapacity(int numRecords);
+
+  /**
+   * Returns the maximum number of values that can be stored in this vector instance.
+   */
+  int getValueCapacity();
+
+  /**
+   * Alternative to clear(). Allows use as an AutoCloseable in try-with-resources.
+   */
+  @Override
+  void close();
+
+  /**
+   * Release the underlying ArrowBuf and reset the ValueVector to empty.
+   */
+  void clear();
+
+  /**
+   * Get information about how this field is materialized.
+   */
+  MaterializedField getField();
+
+  /**
+   * Returns a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new target vector of
+   * the same type.
+   */
+  TransferPair getTransferPair(BufferAllocator allocator);
+
+  TransferPair getTransferPair(String ref, BufferAllocator allocator);
+
+  /**
+   * Returns a new {@link org.apache.arrow.vector.util.TransferPair transfer pair} that is used to transfer underlying
+   * buffers into the target vector.
+   */
+  TransferPair makeTransferPair(ValueVector target);
+
+  /**
+   * Returns an {@link org.apache.arrow.vector.ValueVector.Accessor accessor} that is used to read from this vector
+   * instance.
+   */
+  Accessor getAccessor();
+
+  /**
+   * Returns an {@link org.apache.arrow.vector.ValueVector.Mutator mutator} that is used to write to this vector
+   * instance.
+   */
+  Mutator getMutator();
+
+  /**
+   * Returns a {@link org.apache.arrow.vector.complex.reader.FieldReader field reader} that supports reading values
+   * from this vector.
+   */
+  FieldReader getReader();
+
+  /**
+   * Get the metadata for this field. Used in serialization.
+   *
+   * @return FieldMetadata for this field.
+   */
+//  SerializedField getMetadata();
+
+  /**
+   * Returns the number of bytes that is used by this vector instance.
+   */
+  int getBufferSize();
+
+  /**
+   * Returns the number of bytes that is used by this vector if it holds the given number
+   * of values. The result will be the same as if Mutator.setValueCount() were called, followed
+   * by calling getBufferSize(), but without any of the closing side-effects that setValueCount()
+   * implies wrt finishing off the population of a vector. Some operations might wish to use
+   * this to determine how much memory has been used by a vector so far, even though it is
+   * not finished being populated.
+   *
+   * @param valueCount the number of values to assume this vector contains
+   * @return the buffer size if this vector is holding valueCount values
+   */
+  int getBufferSizeFor(int valueCount);
+
+  /**
+   * Return the underlying buffers associated with this vector. Note that this doesn't impact the reference counts for
+   * this buffer so it only should be used for in-context access. Also note that this buffer changes regularly thus
+   * external classes shouldn't hold a reference to it (unless they change it).
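+   *
+   * Illustrative only: {@code getBuffers(false)} lets a caller inspect the buffers in
+   * place, while {@code getBuffers(true)} also clears this vector, so releasing the
+   * returned buffers becomes the caller's responsibility.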
+   * @param clear Whether to clear vector before returning; the buffers will still be refcounted,
+   *   but the returned array will be the only reference to them
+   *
+   * @return The underlying {@link io.netty.buffer.ArrowBuf buffers} that is used by this vector instance.
+   */
+  ArrowBuf[] getBuffers(boolean clear);
+
+  /**
+   * Load the data provided in the buffer. Typically used when deserializing from the wire.
+   *
+   * @param metadata
+   *          Metadata used to decode the incoming buffer.
+   * @param buffer
+   *          The buffer that contains the ValueVector.
+   */
+//  void load(SerializedField metadata, DrillBuf buffer);
+
+  /**
+   * An abstraction that is used to read from this vector instance.
+   */
+  interface Accessor {
+    /**
+     * Get the Java Object representation of the element at the specified position. Useful for testing.
+     *
+     * @param index
+     *          Index of the value to get
+     */
+    Object getObject(int index);
+
+    /**
+     * Returns the number of values that are stored in this vector.
+     */
+    int getValueCount();
+
+    /**
+     * Returns true if the value at the given index is null, false otherwise.
+     */
+    boolean isNull(int index);
+  }
+
+  /**
+   * An abstraction that is used to write into this vector instance.
+   */
+  interface Mutator {
+    /**
+     * Sets the number of values that is stored in this vector to the given value count.
+     *
+     * @param valueCount value count to set.
+     */
+    void setValueCount(int valueCount);
+
+    /**
+     * Resets the mutator to pristine state.
+     */
+    void reset();
+
+    /**
+     * @deprecated this has nothing to do with value vector abstraction and should be removed.
+     */
+    @Deprecated
+    void generateTestData(int values);
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
new file mode 100644
index 00000000000..e227bb4c417
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector;
+
+import io.netty.buffer.ArrowBuf;
+
+public interface VariableWidthVector extends ValueVector {
+
+  /**
+   * Allocate a new memory space for this vector. Must be called prior to using the ValueVector.
+   *
+   * @param totalBytes Desired size of the underlying data buffer.
+   * @param valueCount Number of values in the vector.
+   */
+  void allocateNew(int totalBytes, int valueCount);
+
+  /**
+   * Provide the maximum amount of variable width bytes that can be stored in this vector.
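+   * For example, after {@code allocateNew(32 * 1024, 4096)} the byte capacity is at
+   * least 32768 (illustrative only; an allocator may round allocation sizes up).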
+ * @return + */ + int getByteCapacity(); + + VariableWidthMutator getMutator(); + + VariableWidthAccessor getAccessor(); + + interface VariableWidthAccessor extends Accessor { + int getValueLength(int index); + } + + int getCurrentSizeInBytes(); + + interface VariableWidthMutator extends Mutator { + void setValueLengthSafe(int index, int length); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java new file mode 100644 index 00000000000..fdad99a3332 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java @@ -0,0 +1,83 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import java.util.Collection; + +import com.google.common.base.Preconditions; + +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.MajorType; + +public class VectorDescriptor { + private static final String DEFAULT_NAME = "NONE"; + + private final MaterializedField field; + + public VectorDescriptor(final MajorType type) { + this(DEFAULT_NAME, type); + } + + public VectorDescriptor(final String name, final MajorType type) { + this(MaterializedField.create(name, type)); + } + + public VectorDescriptor(final MaterializedField field) { + this.field = Preconditions.checkNotNull(field, "field cannot be null"); + } + + public MaterializedField getField() { + return field; + } + + public MajorType getType() { + return field.getType(); + } + + public String getName() { + return field.getLastName(); + } + + public Collection getChildren() { + return field.getChildren(); + } + + public boolean hasName() { + return getName() != DEFAULT_NAME; + } + + public VectorDescriptor withName(final String name) { + return new VectorDescriptor(field.withPath(name)); + } + + public VectorDescriptor withType(final MajorType type) { + return new VectorDescriptor(field.withType(type)); + } + + public static VectorDescriptor create(final String name, final MajorType type) { + return new VectorDescriptor(name, type); + } + + public static VectorDescriptor create(final MajorType type) { + return new VectorDescriptor(type); + } + + public static VectorDescriptor create(final MaterializedField field) { + return new VectorDescriptor(field); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java new file mode 100644 index 00000000000..055857e9560 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorTrimmer.java @@ -0,0 +1,33 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import io.netty.buffer.ByteBuf; +import io.netty.buffer.ArrowBuf; + +public class VectorTrimmer { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(VectorTrimmer.class); + + public static void trim(ByteBuf data, int idx) { + data.writerIndex(idx); + if (data instanceof ArrowBuf) { + // data.capacity(idx); + data.writerIndex(idx); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java new file mode 100644 index 00000000000..78de8706fb7 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -0,0 +1,181 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; + +import java.util.Iterator; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.complex.impl.NullReader; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.TransferPair; + +import com.google.common.collect.Iterators; + +public class ZeroVector implements ValueVector { + public final static ZeroVector INSTANCE = new ZeroVector(); + + private final MaterializedField field = MaterializedField.create("[DEFAULT]", Types.required(MinorType.LATE)); + + private final TransferPair defaultPair = new TransferPair() { + @Override + public void transfer() { } + + @Override + public void splitAndTransfer(int startIndex, int length) { } + + @Override + public ValueVector getTo() { + return ZeroVector.this; + } + + @Override + public void copyValueSafe(int from, int to) { } + }; + + private final Accessor defaultAccessor = new Accessor() { + @Override + public Object getObject(int index) { + return null; + } + + @Override + public int getValueCount() { + return 0; + } + + @Override + public boolean isNull(int index) { + return true; + } + }; + + private final Mutator defaultMutator = new Mutator() { + @Override + public void setValueCount(int valueCount) { } + + @Override + public void reset() { } + + @Override + public void generateTestData(int values) { } + }; + + public ZeroVector() { } + + @Override + public void close() { } + + @Override + public void clear() { } + + @Override + public MaterializedField getField() { + return field; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return defaultPair; + } + +// @Override +// public UserBitShared.SerializedField getMetadata() { +// return getField() +// .getAsBuilder() +// .setBufferLength(getBufferSize()) +// .setValueCount(getAccessor().getValueCount()) +// .build(); +// } + + @Override + public Iterator iterator() { + return Iterators.emptyIterator(); + } + + @Override + public int getBufferSize() { + return 0; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + return 0; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return new ArrowBuf[0]; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + allocateNewSafe(); + } + + @Override + public boolean allocateNewSafe() { + return true; + } + + @Override + public BufferAllocator getAllocator() { + throw new UnsupportedOperationException("Tried to get allocator from ZeroVector"); + } + + @Override + public void setInitialCapacity(int numRecords) { } + + @Override + public int getValueCapacity() { + return 0; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return defaultPair; + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return defaultPair; + } + + @Override + public Accessor getAccessor() { + return defaultAccessor; + } + + @Override + public Mutator getMutator() { + return defaultMutator; + } + + @Override + public FieldReader getReader() { + return NullReader.INSTANCE; + } + +// @Override +// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { } +} diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java new file mode 100644 index 00000000000..c671c9e0b3c --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +import java.util.Collection; + +import javax.annotation.Nullable; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.CallBack; + +import com.google.common.base.Function; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; + +/** + * Base class for composite vectors. + * + * This class implements common functionality of composite vectors. + */ +public abstract class AbstractContainerVector implements ValueVector { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); + + protected MaterializedField field; + protected final BufferAllocator allocator; + protected final CallBack callBack; + + protected AbstractContainerVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { + this.field = Preconditions.checkNotNull(field); + this.allocator = allocator; + this.callBack = callBack; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException(); + } + } + + public BufferAllocator getAllocator() { + return allocator; + } + + /** + * Returns the field definition of this instance. + */ + @Override + public MaterializedField getField() { + return field; + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. + */ + public ValueVector getChild(String name) { + return getChild(name, ValueVector.class); + } + + /** + * Returns a sequence of field names in the order that they show up in the schema. + */ + protected Collection getChildFieldNames() { + return Sets.newLinkedHashSet(Iterables.transform(field.getChildren(), new Function() { + @Nullable + @Override + public String apply(MaterializedField field) { + return Preconditions.checkNotNull(field).getLastName(); + } + })); + } + + /** + * Clears out all underlying child vectors. 
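+   * <p>
+   * Editor's note: because containers iterate over their children, a container can be managed with
+   * try-with-resources just like a scalar vector (sketch; names illustrative):
+   * <pre>
+   *   try (MapVector map = new MapVector("m", allocator, null)) {
+   *     // ... populate children via addOrGet(...) ...
+   *   } // close() walks and closes every child vector here
+   * </pre>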
+   */
+  @Override
+  public void close() {
+    for (ValueVector vector : (Iterable<ValueVector>) this) {
+      vector.close();
+    }
+  }
+
+  protected <T extends ValueVector> T typeify(ValueVector v, Class<T> clazz) {
+    if (clazz.isAssignableFrom(v.getClass())) {
+      return (T) v;
+    }
+    throw new IllegalStateException(String.format("Vector requested [%s] was different from type stored [%s]. Drill doesn't yet support heterogeneous types.", clazz.getSimpleName(), v.getClass().getSimpleName()));
+  }
+
+  MajorType getLastPathType() {
+    if ((this.getField().getType().getMinorType() == MinorType.LIST &&
+        this.getField().getType().getMode() == DataMode.REPEATED)) {  // Use Repeated scalar type instead of Required List.
+      VectorWithOrdinal vord = getChildVectorWithOrdinal(null);
+      ValueVector v = vord.vector;
+      if (!(v instanceof AbstractContainerVector)) {
+        return v.getField().getType();
+      }
+    } else if (this.getField().getType().getMinorType() == MinorType.MAP &&
+        this.getField().getType().getMode() == DataMode.REPEATED) {  // Use Required Map.
+      return new MajorType(MinorType.MAP, DataMode.REQUIRED);
+    }
+
+    return this.getField().getType();
+  }
+
+  protected boolean supportsDirectRead() {
+    return false;
+  }
+
+  // return the number of child vectors
+  public abstract int size();
+
+  // add a new vector with the input MajorType or return the existing vector if we already added one with the same type
+  public abstract <T extends ValueVector> T addOrGet(String name, MajorType type, Class<T> clazz);
+
+  // return the child vector with the input name
+  public abstract <T extends ValueVector> T getChild(String name, Class<T> clazz);
+
+  // return the child vector's ordinal in the composite container
+  public abstract VectorWithOrdinal getChildVectorWithOrdinal(String name);
+}
\ No newline at end of file
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java
new file mode 100644
index 00000000000..d4189b2314a
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java
@@ -0,0 +1,278 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex;
+
+import io.netty.buffer.ArrowBuf;
+
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.ValueVector;
+import org.apache.arrow.vector.types.MaterializedField;
+import org.apache.arrow.vector.types.Types.MajorType;
+import org.apache.arrow.vector.util.BasicTypeHelper;
+import org.apache.arrow.vector.util.CallBack;
+import org.apache.arrow.vector.util.MapWithOrdinal;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
+/*
+ * Base class for MapVectors.
+ * Currently used by RepeatedMapVector and MapVector.
+ */
+public abstract class AbstractMapVector extends AbstractContainerVector {
+  private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractMapVector.class);
+
+  // Maintains a map with the field name as key and the vector itself as value.
+  private final MapWithOrdinal<String, ValueVector> vectors = new MapWithOrdinal<>();
+
+  protected AbstractMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) {
+    super(field.clone(), allocator, callBack);
+    MaterializedField clonedField = field.clone();
+    // create the hierarchy of the child vectors based on the materialized field
+    for (MaterializedField child : clonedField.getChildren()) {
+      if (!child.equals(BaseRepeatedValueVector.OFFSETS_FIELD)) {
+        final String fieldName = child.getLastName();
+        final ValueVector v = BasicTypeHelper.getNewVector(child, allocator, callBack);
+        putVector(fieldName, v);
+      }
+    }
+  }
+
+  @Override
+  public void close() {
+    for (final ValueVector valueVector : vectors.values()) {
+      valueVector.close();
+    }
+    vectors.clear();
+
+    super.close();
+  }
+
+  @Override
+  public boolean allocateNewSafe() {
+    /* Tracks whether all memory allocations were successful. Used for composite vectors
+     * that need to allocate multiple buffers for multiple child vectors: if one of the
+     * allocations fails, we need to clear all the memory that we allocated so far.
+     */
+    boolean success = false;
+    try {
+      for (final ValueVector v : vectors.values()) {
+        if (!v.allocateNewSafe()) {
+          return false;
+        }
+      }
+      success = true;
+    } finally {
+      if (!success) {
+        clear();
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Adds a new field with the given parameters, or replaces the existing one, and consequently returns the resultant
+   * {@link org.apache.arrow.vector.ValueVector}.
+   *
+   * Execution takes place in the following order:
+   * <ul>
+   *   <li>if field is new, create and insert a new vector of desired type.</li>
+   *   <li>if field exists and existing vector is of desired vector type, return the vector.</li>
+   *   <li>if field exists and is null filled, clear the existing vector; create and insert a new vector of desired type.</li>
+   *   <li>otherwise, throw an {@link java.lang.IllegalStateException}</li>
+   * </ul>
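+   * <p>
+   * Editor's sketch (the generated vector class and type constants are illustrative):
+   * <pre>
+   *   NullableIntVector ints = mapVector.addOrGet("age",
+   *       new MajorType(MinorType.INT, DataMode.OPTIONAL), NullableIntVector.class);
+   * </pre>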
+ * + * @param name name of the field + * @param type type of the field + * @param clazz class of expected vector type + * @param class type of expected vector type + * @throws java.lang.IllegalStateException raised if there is a hard schema change + * + * @return resultant {@link org.apache.arrow.vector.ValueVector} + */ + @Override + public T addOrGet(String name, MajorType type, Class clazz) { + final ValueVector existing = getChild(name); + boolean create = false; + if (existing == null) { + create = true; + } else if (clazz.isAssignableFrom(existing.getClass())) { + return (T) existing; + } else if (nullFilled(existing)) { + existing.clear(); + create = true; + } + if (create) { + final T vector = (T) BasicTypeHelper.getNewVector(name, allocator, type, callBack); + putChild(name, vector); + if (callBack!=null) { + callBack.doWork(); + } + return vector; + } + final String message = "Drill does not support schema change yet. Existing[%s] and desired[%s] vector types mismatch"; + throw new IllegalStateException(String.format(message, existing.getClass().getSimpleName(), clazz.getSimpleName())); + } + + private boolean nullFilled(ValueVector vector) { + for (int r = 0; r < vector.getAccessor().getValueCount(); r++) { + if (!vector.getAccessor().isNull(r)) { + return false; + } + } + return true; + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given ordinal identifier. + */ + public ValueVector getChildByOrdinal(int id) { + return vectors.getByOrdinal(id); + } + + /** + * Returns a {@link org.apache.arrow.vector.ValueVector} instance of subtype of corresponding to the given + * field name if exists or null. + */ + @Override + public T getChild(String name, Class clazz) { + final ValueVector v = vectors.get(name.toLowerCase()); + if (v == null) { + return null; + } + return typeify(v, clazz); + } + + /** + * Inserts the vector with the given name if it does not exist else replaces it with the new value. + * + * Note that this method does not enforce any vector type check nor throws a schema change exception. + */ + protected void putChild(String name, ValueVector vector) { + putVector(name, vector); + field.addChild(vector.getField()); + } + + /** + * Inserts the input vector into the map if it does not exist, replaces if it exists already + * @param name field name + * @param vector vector to be inserted + */ + protected void putVector(String name, ValueVector vector) { + final ValueVector old = vectors.put( + Preconditions.checkNotNull(name, "field name cannot be null").toLowerCase(), + Preconditions.checkNotNull(vector, "vector cannot be null") + ); + if (old != null && old != vector) { + logger.debug("Field [{}] mutated from [{}] to [{}]", name, old.getClass().getSimpleName(), + vector.getClass().getSimpleName()); + } + } + + /** + * Returns a sequence of underlying child vectors. + */ + protected Collection getChildren() { + return vectors.values(); + } + + /** + * Returns the number of underlying child vectors. + */ + @Override + public int size() { + return vectors.size(); + } + + @Override + public Iterator iterator() { + return vectors.values().iterator(); + } + + /** + * Returns a list of scalar child vectors recursing the entire vector hierarchy. 
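+   * <p>
+   * Editor's note: for a map with scalar child {@code a} and map child {@code b} containing scalar
+   * {@code c}, this returns the vectors for {@code a} and {@code b.c}; the intermediate map vector
+   * {@code b} itself is not included.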
+ */ + public List getPrimitiveVectors() { + final List primitiveVectors = Lists.newArrayList(); + for (final ValueVector v : vectors.values()) { + if (v instanceof AbstractMapVector) { + AbstractMapVector mapVector = (AbstractMapVector) v; + primitiveVectors.addAll(mapVector.getPrimitiveVectors()); + } else { + primitiveVectors.add(v); + } + } + return primitiveVectors; + } + + /** + * Returns a vector with its corresponding ordinal mapping if field exists or null. + */ + @Override + public VectorWithOrdinal getChildVectorWithOrdinal(String name) { + final int ordinal = vectors.getOrdinal(name.toLowerCase()); + if (ordinal < 0) { + return null; + } + final ValueVector vector = vectors.getByOrdinal(ordinal); + return new VectorWithOrdinal(vector, ordinal); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final List buffers = Lists.newArrayList(); + + for (final ValueVector vector : vectors.values()) { + for (final ArrowBuf buf : vector.getBuffers(false)) { + buffers.add(buf); + if (clear) { + buf.retain(1); + } + } + if (clear) { + vector.clear(); + } + } + + return buffers.toArray(new ArrowBuf[buffers.size()]); + } + + @Override + public int getBufferSize() { + int actualBufSize = 0 ; + + for (final ValueVector v : vectors.values()) { + for (final ArrowBuf buf : v.getBuffers(false)) { + actualBufSize += buf.writerIndex(); + } + } + return actualBufSize; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java new file mode 100644 index 00000000000..6518897fb78 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -0,0 +1,260 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex; + +import io.netty.buffer.ArrowBuf; + +import java.util.Collections; +import java.util.Iterator; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ObjectArrays; + +public abstract class BaseRepeatedValueVector extends BaseValueVector implements RepeatedValueVector { + + public final static ValueVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public final static String OFFSETS_VECTOR_NAME = "$offsets$"; + public final static String DATA_VECTOR_NAME = "$data$"; + + public final static MaterializedField OFFSETS_FIELD = + MaterializedField.create(OFFSETS_VECTOR_NAME, new MajorType(MinorType.UINT4, DataMode.REQUIRED)); + + protected final UInt4Vector offsets; + protected ValueVector vector; + + protected BaseRepeatedValueVector(MaterializedField field, BufferAllocator allocator) { + this(field, allocator, DEFAULT_DATA_VECTOR); + } + + protected BaseRepeatedValueVector(MaterializedField field, BufferAllocator allocator, ValueVector vector) { + super(field, allocator); + this.offsets = new UInt4Vector(OFFSETS_FIELD, allocator); + this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); + } + + @Override + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. 
If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + if (!offsets.allocateNewSafe()) { + return false; + } + success = vector.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + offsets.zeroVector(); + return success; + } + + + @Override + public UInt4Vector getOffsetVector() { + return offsets; + } + + @Override + public ValueVector getDataVector() { + return vector; + } + + @Override + public void setInitialCapacity(int numRecords) { + offsets.setInitialCapacity(numRecords + 1); + vector.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } + + @Override + public int getValueCapacity() { + final int offsetValueCapacity = Math.max(offsets.getValueCapacity() - 1, 0); + if (vector == DEFAULT_DATA_VECTOR) { + return offsetValueCapacity; + } + return Math.min(vector.getValueCapacity(), offsetValueCapacity); + } + +// @Override +// protected UserBitShared.SerializedField.Builder getMetadataBuilder() { +// return super.getMetadataBuilder() +// .addChild(offsets.getMetadata()) +// .addChild(vector.getMetadata()); +// } + + @Override + public int getBufferSize() { + if (getAccessor().getValueCount() == 0) { + return 0; + } + return offsets.getBufferSize() + vector.getBufferSize(); + } + + @Override + public int getBufferSizeFor(int valueCount) { + if (valueCount == 0) { + return 0; + } + + return offsets.getBufferSizeFor(valueCount + 1) + vector.getBufferSizeFor(valueCount); + } + + @Override + public Iterator iterator() { + return Collections.singleton(getDataVector()).iterator(); + } + + @Override + public void clear() { + offsets.clear(); + vector.clear(); + super.clear(); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), vector.getBuffers(false), ArrowBuf.class); + if (clear) { + for (ArrowBuf buffer:buffers) { + buffer.retain(); + } + clear(); + } + return buffers; + } + +// @Override +// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { +// final UserBitShared.SerializedField offsetMetadata = metadata.getChild(0); +// offsets.load(offsetMetadata, buffer); +// +// final UserBitShared.SerializedField vectorMetadata = metadata.getChild(1); +// if (getDataVector() == DEFAULT_DATA_VECTOR) { +// addOrGetVector(VectorDescriptor.create(vectorMetadata.getMajorType())); +// } +// +// final int offsetLength = offsetMetadata.getBufferLength(); +// final int vectorLength = vectorMetadata.getBufferLength(); +// vector.load(vectorMetadata, buffer.slice(offsetLength, vectorLength)); +// } + + /** + * Returns 1 if inner vector is explicitly set via #addOrGetVector else 0 + * + * @see {@link ContainerVectorLike#size} + */ + @Override + public int size() { + return vector == DEFAULT_DATA_VECTOR ? 0:1; + } + + @Override + public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { + boolean created = false; + if (vector == DEFAULT_DATA_VECTOR && descriptor.getType().getMinorType() != MinorType.LATE) { + final MaterializedField field = descriptor.withName(DATA_VECTOR_NAME).getField(); + vector = BasicTypeHelper.getNewVector(field, allocator); + // returned vector must have the same field + assert field.equals(vector.getField()); + getField().addChild(field); + created = true; + } + + final MajorType actual = vector.getField().getType(); + if (!actual.equals(descriptor.getType())) { + final String msg = String.format("Inner vector type mismatch. 
Requested type: [%s], actual type: [%s]", + descriptor.getType(), actual); + throw new SchemaChangeRuntimeException(msg); + } + + return new AddOrGetResult<>((T)vector, created); + } + + protected void replaceDataVector(ValueVector v) { + vector.clear(); + vector = v; + } + + public abstract class BaseRepeatedAccessor extends BaseValueVector.BaseAccessor implements RepeatedAccessor { + + @Override + public int getValueCount() { + return Math.max(offsets.getAccessor().getValueCount() - 1, 0); + } + + @Override + public int getInnerValueCount() { + return vector.getAccessor().getValueCount(); + } + + @Override + public int getInnerValueCountAt(int index) { + return offsets.getAccessor().get(index+1) - offsets.getAccessor().get(index); + } + + @Override + public boolean isNull(int index) { + return false; + } + + @Override + public boolean isEmpty(int index) { + return false; + } + } + + public abstract class BaseRepeatedMutator extends BaseValueVector.BaseMutator implements RepeatedMutator { + + @Override + public void startNewValue(int index) { + while (offsets.getValueCapacity() <= index) { + offsets.reAlloc(); + } + offsets.getMutator().setSafe(index+1, offsets.getAccessor().get(index)); + setValueCount(index+1); + } + + @Override + public void setValueCount(int valueCount) { + // TODO: populate offset end points + offsets.getMutator().setValueCount(valueCount == 0 ? 0 : valueCount+1); + final int childValueCount = valueCount == 0 ? 0 : offsets.getAccessor().get(valueCount); + vector.getMutator().setValueCount(childValueCount); + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java new file mode 100644 index 00000000000..e50b0d0d0a5 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java @@ -0,0 +1,43 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; + +/** + * A mix-in used for introducing container vector-like behaviour. + */ +public interface ContainerVectorLike { + + /** + * Creates and adds a child vector if none with the same name exists, else returns the vector instance. + * + * @param descriptor vector descriptor + * @return result of operation wrapping vector corresponding to the given descriptor and whether it's newly created + * @throws org.apache.drill.common.exceptions.DrillRuntimeException + * if schema change is not permissible between the given and existing data vector types. 
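+   * <p>
+   * Editor's sketch (assuming {@code AddOrGetResult} exposes the vector and a created flag; the
+   * accessor names are illustrative):
+   * <pre>
+   *   AddOrGetResult&lt;ValueVector&gt; result = listVector.addOrGetVector(VectorDescriptor.create(valueType));
+   *   if (result.isCreated()) {           // hypothetical accessor
+   *     result.getVector().allocateNew(); // hypothetical accessor
+   *   }
+   * </pre>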
+ */ + AddOrGetResult addOrGetVector(VectorDescriptor descriptor); + + /** + * Returns the number of child vectors in this container vector-like instance. + */ + int size(); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java new file mode 100644 index 00000000000..df699755770 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/EmptyValuePopulator.java @@ -0,0 +1,54 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +import org.apache.arrow.vector.UInt4Vector; + +import com.google.common.base.Preconditions; + +/** + * A helper class that is used to track and populate empty values in repeated value vectors. + */ +public class EmptyValuePopulator { + private final UInt4Vector offsets; + + public EmptyValuePopulator(UInt4Vector offsets) { + this.offsets = Preconditions.checkNotNull(offsets, "offsets cannot be null"); + } + + /** + * Marks all values since the last set as empty. The last set value is obtained from underlying offsets vector. + * + * @param lastIndex the last index (inclusive) in the offsets vector until which empty population takes place + * @throws java.lang.IndexOutOfBoundsException if lastIndex is negative or greater than offsets capacity. + */ + public void populate(int lastIndex) { + if (lastIndex < 0) { + throw new IndexOutOfBoundsException("index cannot be negative"); + } + final UInt4Vector.Accessor accessor = offsets.getAccessor(); + final UInt4Vector.Mutator mutator = offsets.getMutator(); + final int lastSet = Math.max(accessor.getValueCount() - 1, 0); + final int previousEnd = accessor.get(lastSet);//0 ? 0 : accessor.get(lastSet); + for (int i = lastSet; i < lastIndex; i++) { + mutator.setSafe(i + 1, previousEnd); + } + mutator.setValueCount(lastIndex+1); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java new file mode 100644 index 00000000000..8387c9e5ba6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -0,0 +1,321 @@ +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.arrow.vector.complex; + +import io.netty.buffer.ArrowBuf; + +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.TransferPair; + +import com.google.common.collect.ObjectArrays; + +public class ListVector extends BaseRepeatedValueVector { + + private UInt4Vector offsets; + private final UInt1Vector bits; + private Mutator mutator = new Mutator(); + private Accessor accessor = new Accessor(); + private UnionListWriter writer; + private UnionListReader reader; + private CallBack callBack; + + public ListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { + super(field, allocator); + this.bits = new UInt1Vector(MaterializedField.create("$bits$", new MajorType(MinorType.UINT1, DataMode.REQUIRED)), allocator); + offsets = getOffsetVector(); + this.field.addChild(getDataVector().getField()); + this.writer = new UnionListWriter(this); + this.reader = new UnionListReader(this); + this.callBack = callBack; + } + + public UnionListWriter getWriter() { + return writer; + } + + @Override + public void allocateNew() throws OutOfMemoryException { + super.allocateNewSafe(); + } + + public void transferTo(ListVector target) { + offsets.makeTransferPair(target.offsets).transfer(); + bits.makeTransferPair(target.bits).transfer(); + if (target.getDataVector() instanceof ZeroVector) { + target.addOrGetVector(new VectorDescriptor(vector.getField().getType())); + } + getDataVector().makeTransferPair(target.getDataVector()).transfer(); + } + + public void copyFromSafe(int inIndex, int outIndex, ListVector from) { + copyFrom(inIndex, outIndex, from); + } + + public void copyFrom(int inIndex, int outIndex, ListVector from) { + FieldReader in = from.getReader(); + in.setPosition(inIndex); + FieldWriter out = getWriter(); + out.setPosition(outIndex); + ComplexCopier.copy(in, out); + } + + @Override + public ValueVector getDataVector() { + return vector; + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new 
TransferImpl(field.withPath(ref), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new TransferImpl((ListVector) target); + } + + private class TransferImpl implements TransferPair { + + ListVector to; + + public TransferImpl(MaterializedField field, BufferAllocator allocator) { + to = new ListVector(field, allocator, null); + to.addOrGetVector(new VectorDescriptor(vector.getField().getType())); + } + + public TransferImpl(ListVector to) { + this.to = to; + to.addOrGetVector(new VectorDescriptor(vector.getField().getType())); + } + + @Override + public void transfer() { + transferTo(to); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + to.allocateNew(); + for (int i = 0; i < length; i++) { + copyValueSafe(startIndex + i, i); + } + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + this.to.copyFrom(from, to, ListVector.this); + } + } + + @Override + public Accessor getAccessor() { + return accessor; + } + + @Override + public Mutator getMutator() { + return mutator; + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + if (!offsets.allocateNewSafe()) { + return false; + } + success = vector.allocateNewSafe(); + success = success && bits.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + if (success) { + offsets.zeroVector(); + bits.zeroVector(); + } + return success; + } + +// @Override +// protected UserBitShared.SerializedField.Builder getMetadataBuilder() { +// return getField().getAsBuilder() +// .setValueCount(getAccessor().getValueCount()) +// .setBufferLength(getBufferSize()) +// .addChild(offsets.getMetadata()) +// .addChild(bits.getMetadata()) +// .addChild(vector.getMetadata()); +// } + public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { + AddOrGetResult result = super.addOrGetVector(descriptor); + reader = new UnionListReader(this); + return result; + } + + @Override + public int getBufferSize() { + if (getAccessor().getValueCount() == 0) { + return 0; + } + return offsets.getBufferSize() + bits.getBufferSize() + vector.getBufferSize(); + } + + @Override + public void clear() { + offsets.clear(); + vector.clear(); + bits.clear(); + lastSet = 0; + super.clear(); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final ArrowBuf[] buffers = ObjectArrays.concat(offsets.getBuffers(false), ObjectArrays.concat(bits.getBuffers(false), + vector.getBuffers(false), ArrowBuf.class), ArrowBuf.class); + if (clear) { + for (ArrowBuf buffer:buffers) { + buffer.retain(); + } + clear(); + } + return buffers; + } + +// @Override +// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { +// final UserBitShared.SerializedField offsetMetadata = metadata.getChild(0); +// offsets.load(offsetMetadata, buffer); +// +// final int offsetLength = offsetMetadata.getBufferLength(); +// final UserBitShared.SerializedField bitMetadata = metadata.getChild(1); +// final int bitLength = bitMetadata.getBufferLength(); +// bits.load(bitMetadata, buffer.slice(offsetLength, bitLength)); +// +// final 
UserBitShared.SerializedField vectorMetadata = metadata.getChild(2); +// if (getDataVector() == DEFAULT_DATA_VECTOR) { +// addOrGetVector(VectorDescriptor.create(vectorMetadata.getMajorType())); +// } +// +// final int vectorLength = vectorMetadata.getBufferLength(); +// vector.load(vectorMetadata, buffer.slice(offsetLength + bitLength, vectorLength)); +// } + + public UnionVector promoteToUnion() { + MaterializedField newField = MaterializedField.create(getField().getPath(), new MajorType(MinorType.UNION, DataMode.OPTIONAL)); + UnionVector vector = new UnionVector(newField, allocator, null); + replaceDataVector(vector); + reader = new UnionListReader(this); + return vector; + } + + private int lastSet; + + public class Accessor extends BaseRepeatedAccessor { + + @Override + public Object getObject(int index) { + if (isNull(index)) { + return null; + } + final List vals = new JsonStringArrayList<>(); + final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor(); + final int start = offsetsAccessor.get(index); + final int end = offsetsAccessor.get(index + 1); + final ValueVector.Accessor valuesAccessor = getDataVector().getAccessor(); + for(int i = start; i < end; i++) { + vals.add(valuesAccessor.getObject(i)); + } + return vals; + } + + @Override + public boolean isNull(int index) { + return bits.getAccessor().get(index) == 0; + } + } + + public class Mutator extends BaseRepeatedMutator { + public void setNotNull(int index) { + bits.getMutator().setSafe(index, 1); + lastSet = index + 1; + } + + @Override + public void startNewValue(int index) { + for (int i = lastSet; i <= index; i++) { + offsets.getMutator().setSafe(i + 1, offsets.getAccessor().get(i)); + } + setNotNull(index); + lastSet = index + 1; + } + + @Override + public void setValueCount(int valueCount) { + // TODO: populate offset end points + if (valueCount == 0) { + offsets.getMutator().setValueCount(0); + } else { + for (int i = lastSet; i < valueCount; i++) { + offsets.getMutator().setSafe(i + 1, offsets.getAccessor().get(i)); + } + offsets.getMutator().setValueCount(valueCount + 1); + } + final int childValueCount = valueCount == 0 ? 0 : offsets.getAccessor().get(valueCount); + vector.getMutator().setValueCount(childValueCount); + bits.getMutator().setValueCount(valueCount); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java new file mode 100644 index 00000000000..1bbce73d6ff --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -0,0 +1,374 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex; + +import io.netty.buffer.ArrowBuf; + +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; + +import javax.annotation.Nullable; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.RepeatedMapVector.MapSingleCopier; +import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringHashMap; +import org.apache.arrow.vector.util.TransferPair; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Ordering; +import com.google.common.primitives.Ints; + +public class MapVector extends AbstractMapVector { + //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MapVector.class); + + public final static MajorType TYPE = new MajorType(MinorType.MAP, DataMode.OPTIONAL); + + private final SingleMapReaderImpl reader = new SingleMapReaderImpl(MapVector.this); + private final Accessor accessor = new Accessor(); + private final Mutator mutator = new Mutator(); + private int valueCount; + + public MapVector(String path, BufferAllocator allocator, CallBack callBack){ + this(MaterializedField.create(path, TYPE), allocator, callBack); + } + + public MapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ + super(field, allocator, callBack); + } + + @Override + public FieldReader getReader() { + //return new SingleMapReaderImpl(MapVector.this); + return reader; + } + + transient private MapTransferPair ephPair; + transient private MapSingleCopier ephPair2; + + public void copyFromSafe(int fromIndex, int thisIndex, MapVector from) { + if(ephPair == null || ephPair.from != from) { + ephPair = (MapTransferPair) from.makeTransferPair(this); + } + ephPair.copyValueSafe(fromIndex, thisIndex); + } + + public void copyFromSafe(int fromSubIndex, int thisIndex, RepeatedMapVector from) { + if(ephPair2 == null || ephPair2.from != from) { + ephPair2 = from.makeSingularCopier(this); + } + ephPair2.copySafe(fromSubIndex, thisIndex); + } + + @Override + protected boolean supportsDirectRead() { + return true; + } + + public Iterator fieldNameIterator() { + return getChildFieldNames().iterator(); + } + + @Override + public void setInitialCapacity(int numRecords) { + for (final ValueVector v : (Iterable) this) { + v.setInitialCapacity(numRecords); + } + } + + @Override + public int getBufferSize() { + if (valueCount == 0 || size() == 0) { + return 0; + } + long buffer = 0; + for (final ValueVector v : (Iterable)this) { + buffer += v.getBufferSize(); + } + + return (int) buffer; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + long bufferSize = 0; + for (final ValueVector v : (Iterable) this) { + bufferSize += v.getBufferSizeFor(valueCount); + } + + return (int) bufferSize; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + int expectedSize = getBufferSize(); + int actualSize = super.getBufferSize(); + + 
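+    // Editor's note: expectedSize comes from this class's getBufferSize() (a sum over the children's
+    // own getBufferSize()), while actualSize re-sums the children's buffer writer indexes via the
+    // superclass; the check below asserts that both accounting paths agree before handing buffers out.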
Preconditions.checkArgument(expectedSize == actualSize); + return super.getBuffers(clear); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new MapTransferPair(this, getField().getPath(), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new MapTransferPair(this, (MapVector) to); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new MapTransferPair(this, ref, allocator); + } + + protected static class MapTransferPair implements TransferPair{ + private final TransferPair[] pairs; + private final MapVector from; + private final MapVector to; + + public MapTransferPair(MapVector from, String path, BufferAllocator allocator) { + this(from, new MapVector(MaterializedField.create(path, TYPE), allocator, from.callBack), false); + } + + public MapTransferPair(MapVector from, MapVector to) { + this(from, to, true); + } + + protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { + this.from = from; + this.to = to; + this.pairs = new TransferPair[from.size()]; + this.to.ephPair = null; + this.to.ephPair2 = null; + + int i = 0; + ValueVector vector; + for (String child:from.getChildFieldNames()) { + int preSize = to.size(); + vector = from.getChild(child); + if (vector == null) { + continue; + } + //DRILL-1872: we add the child fields for the vector, looking up the field by name. For a map vector, + // the child fields may be nested fields of the top level child. For example if the structure + // of a child field is oa.oab.oabc then we add oa, then add oab to oa then oabc to oab. + // But the children member of a Materialized field is a HashSet. If the fields are added in the + // children HashSet, and the hashCode of the Materialized field includes the hash code of the + // children, the hashCode value of oa changes *after* the field has been added to the HashSet. + // (This is similar to what happens in ScanBatch where the children cannot be added till they are + // read). To take care of this, we ensure that the hashCode of the MaterializedField does not + // include the hashCode of the children but is based only on MaterializedField$key. 
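+        // Editor's note: addOrGet below resolves the child by name on the target side, which is why
+        // the name-only hashing described above must stay stable while children are still being added.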
+ final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); + if (allocate && to.size() != preSize) { + newVector.allocateNew(); + } + pairs[i++] = vector.makeTransferPair(newVector); + } + } + + @Override + public void transfer() { + for (final TransferPair p : pairs) { + p.transfer(); + } + to.valueCount = from.valueCount; + from.clear(); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + for (TransferPair p : pairs) { + p.copyValueSafe(from, to); + } + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + for (TransferPair p : pairs) { + p.splitAndTransfer(startIndex, length); + } + to.getMutator().setValueCount(length); + } + } + + @Override + public int getValueCapacity() { + if (size() == 0) { + return 0; + } + + final Ordering natural = new Ordering() { + @Override + public int compare(@Nullable ValueVector left, @Nullable ValueVector right) { + return Ints.compare( + Preconditions.checkNotNull(left).getValueCapacity(), + Preconditions.checkNotNull(right).getValueCapacity() + ); + } + }; + + return natural.min(getChildren()).getValueCapacity(); + } + + @Override + public Accessor getAccessor() { + return accessor; + } + +// @Override +// public void load(SerializedField metadata, DrillBuf buf) { +// final List fields = metadata.getChildList(); +// valueCount = metadata.getValueCount(); +// +// int bufOffset = 0; +// for (final SerializedField child : fields) { +// final MaterializedField fieldDef = SerializedFieldHelper.create(child); +// +// ValueVector vector = getChild(fieldDef.getLastName()); +// if (vector == null) { +// if we arrive here, we didn't have a matching vector. +// vector = BasicTypeHelper.getNewVector(fieldDef, allocator); +// putChild(fieldDef.getLastName(), vector); +// } +// if (child.getValueCount() == 0) { +// vector.clear(); +// } else { +// vector.load(child, buf.slice(bufOffset, child.getBufferLength())); +// } +// bufOffset += child.getBufferLength(); +// } +// +// assert bufOffset == buf.capacity(); +// } +// +// @Override +// public SerializedField getMetadata() { +// SerializedField.Builder b = getField() // +// .getAsBuilder() // +// .setBufferLength(getBufferSize()) // +// .setValueCount(valueCount); +// +// +// for(ValueVector v : getChildren()) { +// b.addChild(v.getMetadata()); +// } +// return b.build(); +// } + + @Override + public Mutator getMutator() { + return mutator; + } + + public class Accessor extends BaseValueVector.BaseAccessor { + + @Override + public Object getObject(int index) { + Map vv = new JsonStringHashMap<>(); + for (String child:getChildFieldNames()) { + ValueVector v = getChild(child); + // TODO(DRILL-4001): Resolve this hack: + // The index/value count check in the following if statement is a hack + // to work around the current fact that RecordBatchLoader.load and + // MapVector.load leave child vectors with a length of zero (as opposed + // to matching the lengths of siblings and the parent map vector) + // because they don't remove (or set the lengths of) vectors from + // previous batches that aren't in the current batch. 
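+          // Editor's note: the value-count guard below therefore skips stale, zero-length children
+          // rather than reading past their end.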
+ if (v != null && index < v.getAccessor().getValueCount()) { + Object value = v.getAccessor().getObject(index); + if (value != null) { + vv.put(child, value); + } + } + } + return vv; + } + + public void get(int index, ComplexHolder holder) { + reader.setPosition(index); + holder.reader = reader; + } + + @Override + public int getValueCount() { + return valueCount; + } + } + + public ValueVector getVectorById(int id) { + return getChildByOrdinal(id); + } + + public class Mutator extends BaseValueVector.BaseMutator { + + @Override + public void setValueCount(int valueCount) { + for (final ValueVector v : getChildren()) { + v.getMutator().setValueCount(valueCount); + } + MapVector.this.valueCount = valueCount; + } + + @Override + public void reset() { } + + @Override + public void generateTestData(int values) { } + } + + @Override + public void clear() { + for (final ValueVector v : getChildren()) { + v.clear(); + } + valueCount = 0; + } + + @Override + public void close() { + final Collection vectors = getChildren(); + for (final ValueVector v : vectors) { + v.close(); + } + vectors.clear(); + valueCount = 0; + + super.close(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java new file mode 100644 index 00000000000..93451181ca9 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/Positionable.java @@ -0,0 +1,22 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +public interface Positionable { + public void setPosition(int index); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java new file mode 100644 index 00000000000..23850bc9034 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedFixedWidthVectorLike.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex;
+
+/**
+ * A {@link org.apache.arrow.vector.ValueVector} mix-in that can be used in conjunction with
+ * {@link RepeatedValueVector} subtypes.
+ */
+public interface RepeatedFixedWidthVectorLike {
+ /**
+ * Allocate a new memory space for this vector. Must be called prior to using the ValueVector.
+ *
+ * @param valueCount Number of separate repeating groupings.
+ * @param innerValueCount Number of supported values in the vector.
+ */
+ void allocateNew(int valueCount, int innerValueCount);
+
+ /**
+ * Load the records in the provided buffer based on the given number of values.
+ * @param valueCount Number of separate repeating groupings.
+ * @param innerValueCount Number of atomic values the buffer contains.
+ * @param buf Incoming buffer.
+ * @return The number of bytes of the buffer that were consumed.
+ */
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java
new file mode 100644
index 00000000000..778fe81b5da
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java
@@ -0,0 +1,428 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.vector.complex; + +import io.netty.buffer.ArrowBuf; + +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; +import org.apache.arrow.vector.complex.impl.NullReader; +import org.apache.arrow.vector.complex.impl.RepeatedListReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.holders.RepeatedListHolder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.TransferPair; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +public class RepeatedListVector extends AbstractContainerVector + implements RepeatedValueVector, RepeatedFixedWidthVectorLike { + + public final static MajorType TYPE = new MajorType(MinorType.LIST, DataMode.REPEATED); + private final RepeatedListReaderImpl reader = new RepeatedListReaderImpl(null, this); + private final DelegateRepeatedVector delegate; + + protected static class DelegateRepeatedVector extends BaseRepeatedValueVector { + + private final RepeatedListAccessor accessor = new RepeatedListAccessor(); + private final RepeatedListMutator mutator = new RepeatedListMutator(); + private final EmptyValuePopulator emptyPopulator; + private transient DelegateTransferPair ephPair; + + public class RepeatedListAccessor extends BaseRepeatedValueVector.BaseRepeatedAccessor { + + @Override + public Object getObject(int index) { + final List list = new JsonStringArrayList<>(); + final int start = offsets.getAccessor().get(index); + final int until = offsets.getAccessor().get(index+1); + for (int i = start; i < until; i++) { + list.add(vector.getAccessor().getObject(i)); + } + return list; + } + + public void get(int index, RepeatedListHolder holder) { + assert index <= getValueCapacity(); + holder.start = getOffsetVector().getAccessor().get(index); + holder.end = getOffsetVector().getAccessor().get(index+1); + } + + public void get(int index, ComplexHolder holder) { + final FieldReader reader = getReader(); + reader.setPosition(index); + holder.reader = reader; + } + + public void get(int index, int arrayIndex, ComplexHolder holder) { + final RepeatedListHolder listHolder = new RepeatedListHolder(); + get(index, listHolder); + int offset = listHolder.start + arrayIndex; + if (offset >= listHolder.end) { + holder.reader = NullReader.INSTANCE; + } else { + FieldReader r = getDataVector().getReader(); + r.setPosition(offset); + holder.reader = r; + } + } + } + + public class RepeatedListMutator extends BaseRepeatedValueVector.BaseRepeatedMutator { + + public int add(int index) { + final int curEnd = getOffsetVector().getAccessor().get(index+1); + getOffsetVector().getMutator().setSafe(index + 1, curEnd + 1); + return curEnd; + } + + @Override + public void startNewValue(int index) { + emptyPopulator.populate(index+1); + super.startNewValue(index); + } + + @Override + public void setValueCount(int valueCount) { + 
emptyPopulator.populate(valueCount); + super.setValueCount(valueCount); + } + } + + + public class DelegateTransferPair implements TransferPair { + private final DelegateRepeatedVector target; + private final TransferPair[] children; + + public DelegateTransferPair(DelegateRepeatedVector target) { + this.target = Preconditions.checkNotNull(target); + if (target.getDataVector() == DEFAULT_DATA_VECTOR) { + target.addOrGetVector(VectorDescriptor.create(getDataVector().getField())); + target.getDataVector().allocateNew(); + } + this.children = new TransferPair[] { + getOffsetVector().makeTransferPair(target.getOffsetVector()), + getDataVector().makeTransferPair(target.getDataVector()) + }; + } + + @Override + public void transfer() { + for (TransferPair child:children) { + child.transfer(); + } + } + + @Override + public ValueVector getTo() { + return target; + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + target.allocateNew(); + for (int i = 0; i < length; i++) { + copyValueSafe(startIndex + i, i); + } + } + + @Override + public void copyValueSafe(int srcIndex, int destIndex) { + final RepeatedListHolder holder = new RepeatedListHolder(); + getAccessor().get(srcIndex, holder); + target.emptyPopulator.populate(destIndex+1); + final TransferPair vectorTransfer = children[1]; + int newIndex = target.getOffsetVector().getAccessor().get(destIndex); + //todo: make this a bulk copy. + for (int i = holder.start; i < holder.end; i++, newIndex++) { + vectorTransfer.copyValueSafe(i, newIndex); + } + target.getOffsetVector().getMutator().setSafe(destIndex + 1, newIndex); + } + } + + public DelegateRepeatedVector(String path, BufferAllocator allocator) { + this(MaterializedField.create(path, TYPE), allocator); + } + + public DelegateRepeatedVector(MaterializedField field, BufferAllocator allocator) { + super(field, allocator); + emptyPopulator = new EmptyValuePopulator(getOffsetVector()); + } + + @Override + public void allocateNew() throws OutOfMemoryException { + if (!allocateNewSafe()) { + throw new OutOfMemoryException(); + } + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return makeTransferPair(new DelegateRepeatedVector(ref, allocator)); + } + + @Override + public TransferPair makeTransferPair(ValueVector target) { + return new DelegateTransferPair(DelegateRepeatedVector.class.cast(target)); + } + + @Override + public RepeatedListAccessor getAccessor() { + return accessor; + } + + @Override + public RepeatedListMutator getMutator() { + return mutator; + } + + @Override + public FieldReader getReader() { + throw new UnsupportedOperationException(); + } + + public void copyFromSafe(int fromIndex, int thisIndex, DelegateRepeatedVector from) { + if(ephPair == null || ephPair.target != from) { + ephPair = DelegateTransferPair.class.cast(from.makeTransferPair(this)); + } + ephPair.copyValueSafe(fromIndex, thisIndex); + } + + } + + protected class RepeatedListTransferPair implements TransferPair { + private final TransferPair delegate; + + public RepeatedListTransferPair(TransferPair delegate) { + this.delegate = delegate; + } + + public void transfer() { + delegate.transfer(); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + delegate.splitAndTransfer(startIndex, length); + } + + @Override + public ValueVector getTo() { + final DelegateRepeatedVector delegateVector = DelegateRepeatedVector.class.cast(delegate.getTo()); + return new RepeatedListVector(getField(), allocator, callBack, 
delegateVector); + } + + @Override + public void copyValueSafe(int from, int to) { + delegate.copyValueSafe(from, to); + } + } + + public RepeatedListVector(String path, BufferAllocator allocator, CallBack callBack) { + this(MaterializedField.create(path, TYPE), allocator, callBack); + } + + public RepeatedListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { + this(field, allocator, callBack, new DelegateRepeatedVector(field, allocator)); + } + + protected RepeatedListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack, DelegateRepeatedVector delegate) { + super(field, allocator, callBack); + this.delegate = Preconditions.checkNotNull(delegate); + + final List children = Lists.newArrayList(field.getChildren()); + final int childSize = children.size(); + assert childSize < 3; + final boolean hasChild = childSize > 0; + if (hasChild) { + // the last field is data field + final MaterializedField child = children.get(childSize-1); + addOrGetVector(VectorDescriptor.create(child)); + } + } + + + @Override + public RepeatedListReaderImpl getReader() { + return reader; + } + + @Override + public DelegateRepeatedVector.RepeatedListAccessor getAccessor() { + return delegate.getAccessor(); + } + + @Override + public DelegateRepeatedVector.RepeatedListMutator getMutator() { + return delegate.getMutator(); + } + + @Override + public UInt4Vector getOffsetVector() { + return delegate.getOffsetVector(); + } + + @Override + public ValueVector getDataVector() { + return delegate.getDataVector(); + } + + @Override + public void allocateNew() throws OutOfMemoryException { + delegate.allocateNew(); + } + + @Override + public boolean allocateNewSafe() { + return delegate.allocateNewSafe(); + } + + @Override + public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { + final AddOrGetResult result = delegate.addOrGetVector(descriptor); + if (result.isCreated() && callBack != null) { + callBack.doWork(); + } + this.field = delegate.getField(); + return result; + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public int getBufferSize() { + return delegate.getBufferSize(); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + return delegate.getBufferSizeFor(valueCount); + } + + @Override + public void close() { + delegate.close(); + } + + @Override + public void clear() { + delegate.clear(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new RepeatedListTransferPair(delegate.getTransferPair(allocator)); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new RepeatedListTransferPair(delegate.getTransferPair(ref, allocator)); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + final RepeatedListVector target = RepeatedListVector.class.cast(to); + return new RepeatedListTransferPair(delegate.makeTransferPair(target.delegate)); + } + + @Override + public int getValueCapacity() { + return delegate.getValueCapacity(); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return delegate.getBuffers(clear); + } + + +// @Override +// public void load(SerializedField metadata, DrillBuf buf) { +// delegate.load(metadata, buf); +// } + +// @Override +// public SerializedField getMetadata() { +// return delegate.getMetadata(); +// } + + @Override + public Iterator iterator() { + return delegate.iterator(); + } + + @Override + public void setInitialCapacity(int 
numRecords) {
+ delegate.setInitialCapacity(numRecords);
+ }
+
+ /**
+ * @deprecated
+ * prefer using {@link #addOrGetVector(org.apache.arrow.vector.VectorDescriptor)} instead.
+ */
+ @Override
+ public <T extends ValueVector> T addOrGet(String name, MajorType type, Class<T> clazz) {
+ final AddOrGetResult<T> result = addOrGetVector(VectorDescriptor.create(type));
+ return result.getVector();
+ }
+
+ @Override
+ public <T extends ValueVector> T getChild(String name, Class<T> clazz) {
+ if (name != null) {
+ return null;
+ }
+ return typeify(delegate.getDataVector(), clazz);
+ }
+
+ @Override
+ public void allocateNew(int valueCount, int innerValueCount) {
+ clear();
+ getOffsetVector().allocateNew(valueCount + 1);
+ getMutator().reset();
+ }
+
+ @Override
+ public VectorWithOrdinal getChildVectorWithOrdinal(String name) {
+ if (name != null) {
+ return null;
+ }
+ return new VectorWithOrdinal(delegate.getDataVector(), 0);
+ }
+
+ public void copyFromSafe(int fromIndex, int thisIndex, RepeatedListVector from) {
+ delegate.copyFromSafe(fromIndex, thisIndex, from.delegate);
+ }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java
new file mode 100644
index 00000000000..e7eacd3c67c
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java
@@ -0,0 +1,584 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.vector.complex; + +import io.netty.buffer.ArrowBuf; + +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.AllocationHelper; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; +import org.apache.arrow.vector.complex.impl.NullReader; +import org.apache.arrow.vector.complex.impl.RepeatedMapReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.holders.RepeatedMapHolder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.JsonStringArrayList; +import org.apache.arrow.vector.util.TransferPair; +import org.apache.commons.lang3.ArrayUtils; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Maps; + +public class RepeatedMapVector extends AbstractMapVector + implements RepeatedValueVector, RepeatedFixedWidthVectorLike { + //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RepeatedMapVector.class); + + public final static MajorType TYPE = new MajorType(MinorType.MAP, DataMode.REPEATED); + + private final UInt4Vector offsets; // offsets to start of each record (considering record indices are 0-indexed) + private final RepeatedMapReaderImpl reader = new RepeatedMapReaderImpl(RepeatedMapVector.this); + private final RepeatedMapAccessor accessor = new RepeatedMapAccessor(); + private final Mutator mutator = new Mutator(); + private final EmptyValuePopulator emptyPopulator; + + public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ + super(field, allocator, callBack); + this.offsets = new UInt4Vector(BaseRepeatedValueVector.OFFSETS_FIELD, allocator); + this.emptyPopulator = new EmptyValuePopulator(offsets); + } + + @Override + public UInt4Vector getOffsetVector() { + return offsets; + } + + @Override + public ValueVector getDataVector() { + throw new UnsupportedOperationException(); + } + + @Override + public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { + throw new UnsupportedOperationException(); + } + + @Override + public void setInitialCapacity(int numRecords) { + offsets.setInitialCapacity(numRecords + 1); + for(final ValueVector v : (Iterable) this) { + v.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); + } + } + + @Override + public RepeatedMapReaderImpl getReader() { + return reader; + } + + @Override + public void allocateNew(int groupCount, int innerValueCount) { + clear(); + try { + offsets.allocateNew(groupCount + 1); + for (ValueVector v : getChildren()) { + AllocationHelper.allocatePrecomputedChildCount(v, groupCount, 50, innerValueCount); + } + } catch (OutOfMemoryException e){ + clear(); + throw e; + } + offsets.zeroVector(); + mutator.reset(); + } + + public Iterator fieldNameIterator() { + return getChildFieldNames().iterator(); + } + + @Override + public List getPrimitiveVectors() { + final List primitiveVectors = super.getPrimitiveVectors(); + 
primitiveVectors.add(offsets); + return primitiveVectors; + } + + @Override + public int getBufferSize() { + if (getAccessor().getValueCount() == 0) { + return 0; + } + long bufferSize = offsets.getBufferSize(); + for (final ValueVector v : (Iterable) this) { + bufferSize += v.getBufferSize(); + } + return (int) bufferSize; + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + + long bufferSize = 0; + for (final ValueVector v : (Iterable) this) { + bufferSize += v.getBufferSizeFor(valueCount); + } + + return (int) bufferSize; + } + + @Override + public void close() { + offsets.close(); + super.close(); + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new RepeatedMapTransferPair(this, getField().getPath(), allocator); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new RepeatedMapTransferPair(this, (RepeatedMapVector)to); + } + + MapSingleCopier makeSingularCopier(MapVector to) { + return new MapSingleCopier(this, to); + } + + protected static class MapSingleCopier { + private final TransferPair[] pairs; + public final RepeatedMapVector from; + + public MapSingleCopier(RepeatedMapVector from, MapVector to) { + this.from = from; + this.pairs = new TransferPair[from.size()]; + + int i = 0; + ValueVector vector; + for (final String child:from.getChildFieldNames()) { + int preSize = to.size(); + vector = from.getChild(child); + if (vector == null) { + continue; + } + final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); + if (to.size() != preSize) { + newVector.allocateNew(); + } + pairs[i++] = vector.makeTransferPair(newVector); + } + } + + public void copySafe(int fromSubIndex, int toIndex) { + for (TransferPair p : pairs) { + p.copyValueSafe(fromSubIndex, toIndex); + } + } + } + + public TransferPair getTransferPairToSingleMap(String reference, BufferAllocator allocator) { + return new SingleMapTransferPair(this, reference, allocator); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new RepeatedMapTransferPair(this, ref, allocator); + } + + @Override + public boolean allocateNewSafe() { + /* boolean to keep track if all the memory allocation were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. 
If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + if (!offsets.allocateNewSafe()) { + return false; + } + success = super.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + offsets.zeroVector(); + return success; + } + + protected static class SingleMapTransferPair implements TransferPair { + private final TransferPair[] pairs; + private final RepeatedMapVector from; + private final MapVector to; + private static final MajorType MAP_TYPE = new MajorType(MinorType.MAP, DataMode.REQUIRED); + + public SingleMapTransferPair(RepeatedMapVector from, String path, BufferAllocator allocator) { + this(from, new MapVector(MaterializedField.create(path, MAP_TYPE), allocator, from.callBack), false); + } + + public SingleMapTransferPair(RepeatedMapVector from, MapVector to) { + this(from, to, true); + } + + public SingleMapTransferPair(RepeatedMapVector from, MapVector to, boolean allocate) { + this.from = from; + this.to = to; + this.pairs = new TransferPair[from.size()]; + int i = 0; + ValueVector vector; + for (final String child : from.getChildFieldNames()) { + int preSize = to.size(); + vector = from.getChild(child); + if (vector == null) { + continue; + } + final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); + if (allocate && to.size() != preSize) { + newVector.allocateNew(); + } + pairs[i++] = vector.makeTransferPair(newVector); + } + } + + + @Override + public void transfer() { + for (TransferPair p : pairs) { + p.transfer(); + } + to.getMutator().setValueCount(from.getAccessor().getValueCount()); + from.clear(); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int from, int to) { + for (TransferPair p : pairs) { + p.copyValueSafe(from, to); + } + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + for (TransferPair p : pairs) { + p.splitAndTransfer(startIndex, length); + } + to.getMutator().setValueCount(length); + } + } + + private static class RepeatedMapTransferPair implements TransferPair{ + + private final TransferPair[] pairs; + private final RepeatedMapVector to; + private final RepeatedMapVector from; + + public RepeatedMapTransferPair(RepeatedMapVector from, String path, BufferAllocator allocator) { + this(from, new RepeatedMapVector(MaterializedField.create(path, TYPE), allocator, from.callBack), false); + } + + public RepeatedMapTransferPair(RepeatedMapVector from, RepeatedMapVector to) { + this(from, to, true); + } + + public RepeatedMapTransferPair(RepeatedMapVector from, RepeatedMapVector to, boolean allocate) { + this.from = from; + this.to = to; + this.pairs = new TransferPair[from.size()]; + this.to.ephPair = null; + + int i = 0; + ValueVector vector; + for (final String child : from.getChildFieldNames()) { + final int preSize = to.size(); + vector = from.getChild(child); + if (vector == null) { + continue; + } + + final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); + if (to.size() != preSize) { + newVector.allocateNew(); + } + + pairs[i++] = vector.makeTransferPair(newVector); + } + } + + @Override + public void transfer() { + from.offsets.transferTo(to.offsets); + for (TransferPair p : pairs) { + p.transfer(); + } + from.clear(); + } + + @Override + public ValueVector getTo() { + return to; + } + + @Override + public void copyValueSafe(int srcIndex, int destIndex) { + RepeatedMapHolder holder = new 
RepeatedMapHolder(); + from.getAccessor().get(srcIndex, holder); + to.emptyPopulator.populate(destIndex + 1); + int newIndex = to.offsets.getAccessor().get(destIndex); + //todo: make these bulk copies + for (int i = holder.start; i < holder.end; i++, newIndex++) { + for (TransferPair p : pairs) { + p.copyValueSafe(i, newIndex); + } + } + to.offsets.getMutator().setSafe(destIndex + 1, newIndex); + } + + @Override + public void splitAndTransfer(final int groupStart, final int groups) { + final UInt4Vector.Accessor a = from.offsets.getAccessor(); + final UInt4Vector.Mutator m = to.offsets.getMutator(); + + final int startPos = a.get(groupStart); + final int endPos = a.get(groupStart + groups); + final int valuesToCopy = endPos - startPos; + + to.offsets.clear(); + to.offsets.allocateNew(groups + 1); + + int normalizedPos; + for (int i = 0; i < groups + 1; i++) { + normalizedPos = a.get(groupStart + i) - startPos; + m.set(i, normalizedPos); + } + + m.setValueCount(groups + 1); + to.emptyPopulator.populate(groups); + + for (final TransferPair p : pairs) { + p.splitAndTransfer(startPos, valuesToCopy); + } + } + } + + + transient private RepeatedMapTransferPair ephPair; + + public void copyFromSafe(int fromIndex, int thisIndex, RepeatedMapVector from) { + if (ephPair == null || ephPair.from != from) { + ephPair = (RepeatedMapTransferPair) from.makeTransferPair(this); + } + ephPair.copyValueSafe(fromIndex, thisIndex); + } + + @Override + public int getValueCapacity() { + return Math.max(offsets.getValueCapacity() - 1, 0); + } + + @Override + public RepeatedMapAccessor getAccessor() { + return accessor; + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + final int expectedBufferSize = getBufferSize(); + final int actualBufferSize = super.getBufferSize(); + + Preconditions.checkArgument(expectedBufferSize == actualBufferSize + offsets.getBufferSize()); + return ArrayUtils.addAll(offsets.getBuffers(clear), super.getBuffers(clear)); + } + + +// @Override +// public void load(SerializedField metadata, DrillBuf buffer) { +// final List children = metadata.getChildList(); +// +// final SerializedField offsetField = children.get(0); +// offsets.load(offsetField, buffer); +// int bufOffset = offsetField.getBufferLength(); +// +// for (int i = 1; i < children.size(); i++) { +// final SerializedField child = children.get(i); +// final MaterializedField fieldDef = SerializedFieldHelper.create(child); +// ValueVector vector = getChild(fieldDef.getLastName()); +// if (vector == null) { + // if we arrive here, we didn't have a matching vector. 
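+// (Editor's note, illustrative: as in MapVector.load above, a null child
+// means the incoming batch introduced a new field, so a fresh vector is
+// created and registered before its slice of the buffer is loaded.)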
+// vector = BasicTypeHelper.getNewVector(fieldDef, allocator); +// putChild(fieldDef.getLastName(), vector); +// } +// final int vectorLength = child.getBufferLength(); +// vector.load(child, buffer.slice(bufOffset, vectorLength)); +// bufOffset += vectorLength; +// } +// +// assert bufOffset == buffer.capacity(); +// } +// +// +// @Override +// public SerializedField getMetadata() { +// SerializedField.Builder builder = getField() // +// .getAsBuilder() // +// .setBufferLength(getBufferSize()) // + // while we don't need to actually read this on load, we need it to make sure we don't skip deserialization of this vector +// .setValueCount(accessor.getValueCount()); +// builder.addChild(offsets.getMetadata()); +// for (final ValueVector child : getChildren()) { +// builder.addChild(child.getMetadata()); +// } +// return builder.build(); +// } + + @Override + public Mutator getMutator() { + return mutator; + } + + public class RepeatedMapAccessor implements RepeatedAccessor { + @Override + public Object getObject(int index) { + final List list = new JsonStringArrayList<>(); + final int end = offsets.getAccessor().get(index+1); + String fieldName; + for (int i = offsets.getAccessor().get(index); i < end; i++) { + final Map vv = Maps.newLinkedHashMap(); + for (final MaterializedField field : getField().getChildren()) { + if (!field.equals(BaseRepeatedValueVector.OFFSETS_FIELD)) { + fieldName = field.getLastName(); + final Object value = getChild(fieldName).getAccessor().getObject(i); + if (value != null) { + vv.put(fieldName, value); + } + } + } + list.add(vv); + } + return list; + } + + @Override + public int getValueCount() { + return Math.max(offsets.getAccessor().getValueCount() - 1, 0); + } + + @Override + public int getInnerValueCount() { + final int valueCount = getValueCount(); + if (valueCount == 0) { + return 0; + } + return offsets.getAccessor().get(valueCount); + } + + @Override + public int getInnerValueCountAt(int index) { + return offsets.getAccessor().get(index+1) - offsets.getAccessor().get(index); + } + + @Override + public boolean isEmpty(int index) { + return false; + } + + @Override + public boolean isNull(int index) { + return false; + } + + public void get(int index, RepeatedMapHolder holder) { + assert index < getValueCapacity() : + String.format("Attempted to access index %d when value capacity is %d", + index, getValueCapacity()); + final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor(); + holder.start = offsetsAccessor.get(index); + holder.end = offsetsAccessor.get(index + 1); + } + + public void get(int index, ComplexHolder holder) { + final FieldReader reader = getReader(); + reader.setPosition(index); + holder.reader = reader; + } + + public void get(int index, int arrayIndex, ComplexHolder holder) { + final RepeatedMapHolder h = new RepeatedMapHolder(); + get(index, h); + final int offset = h.start + arrayIndex; + + if (offset >= h.end) { + holder.reader = NullReader.INSTANCE; + } else { + reader.setSinglePosition(index, arrayIndex); + holder.reader = reader; + } + } + } + + public class Mutator implements RepeatedMutator { + @Override + public void startNewValue(int index) { + emptyPopulator.populate(index + 1); + offsets.getMutator().setSafe(index + 1, offsets.getAccessor().get(index)); + } + + @Override + public void setValueCount(int topLevelValueCount) { + emptyPopulator.populate(topLevelValueCount); + offsets.getMutator().setValueCount(topLevelValueCount == 0 ? 
0 : topLevelValueCount + 1); + int childValueCount = offsets.getAccessor().get(topLevelValueCount); + for (final ValueVector v : getChildren()) { + v.getMutator().setValueCount(childValueCount); + } + } + + @Override + public void reset() {} + + @Override + public void generateTestData(int values) {} + + public int add(int index) { + final int prevEnd = offsets.getAccessor().get(index + 1); + offsets.getMutator().setSafe(index + 1, prevEnd + 1); + return prevEnd; + } + } + + @Override + public void clear() { + getMutator().reset(); + + offsets.clear(); + for(final ValueVector vector : getChildren()) { + vector.clear(); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java new file mode 100644 index 00000000000..99c0a0aeb1e --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex; + +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; + +/** + * An abstraction representing repeated value vectors. + * + * A repeated vector contains values that may either be flat or nested. A value consists of zero or more cells(inner values). + * Current design maintains data and offsets vectors. Each cell is stored in the data vector. Repeated vector + * uses the offset vector to determine the sequence of cells pertaining to an individual value. + * + */ +public interface RepeatedValueVector extends ValueVector, ContainerVectorLike { + + final static int DEFAULT_REPEAT_PER_RECORD = 5; + + /** + * Returns the underlying offset vector or null if none exists. + * + * TODO(DRILL-2995): eliminate exposing low-level interfaces. + */ + UInt4Vector getOffsetVector(); + + /** + * Returns the underlying data vector or null if none exists. + */ + ValueVector getDataVector(); + + @Override + RepeatedAccessor getAccessor(); + + @Override + RepeatedMutator getMutator(); + + interface RepeatedAccessor extends ValueVector.Accessor { + /** + * Returns total number of cells that vector contains. + * + * The result includes empty, null valued cells. + */ + int getInnerValueCount(); + + + /** + * Returns number of cells that the value at the given index contains. + */ + int getInnerValueCountAt(int index); + + /** + * Returns true if the value at the given index is empty, false otherwise. + * + * @param index value index + */ + boolean isEmpty(int index); + } + + interface RepeatedMutator extends ValueVector.Mutator { + /** + * Starts a new value that is a container of cells. 
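+ * <p>Illustrative example (editor's addition, not in the original patch):
+ * writing the list value ["a", "b"] at index i amounts to startNewValue(i)
+ * followed by two cell writes; the offset vector then records that value i
+ * spans those two cells.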
+ *
+ * @param index index of new value to start
+ */
+ void startNewValue(int index);
+
+
+ }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java
new file mode 100644
index 00000000000..93b744e1087
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedVariableWidthVectorLike.java
@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex;
+
+public interface RepeatedVariableWidthVectorLike {
+ /**
+ * Allocate a new memory space for this vector. Must be called prior to using the ValueVector.
+ *
+ * @param totalBytes Desired size of the underlying data buffer.
+ * @param parentValueCount Number of separate repeating groupings.
+ * @param childValueCount Number of supported values in the vector.
+ */
+ void allocateNew(int totalBytes, int parentValueCount, int childValueCount);
+
+ /**
+ * Provide the maximum number of variable-width bytes that can be stored in this vector.
+ * @return the maximum number of bytes this vector can hold
+ */
+ int getByteCapacity();
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java
new file mode 100644
index 00000000000..852c72c5497
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/StateTool.java
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex;
+
+import java.util.Arrays;
+
+public class StateTool {
+ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(StateTool.class);
+
+ public static <T extends Enum<T>> void check(T currentState, T... expectedStates) {
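+ // Editor's illustrative note: callers such as ComplexWriterImpl use this to
+ // guard mode transitions, e.g. check(Mode.INIT, Mode.MAP) throws if the
+ // writer is currently in LIST mode.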
+ for (T s : expectedStates) {
+ if (s == currentState) {
+ return;
+ }
+ }
+ throw new IllegalArgumentException(String.format("Expected to be in one of these states %s but was actually in state %s", Arrays.toString(expectedStates), currentState));
+ }
+
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java
new file mode 100644
index 00000000000..d04fc1c022c
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/VectorWithOrdinal.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex;
+
+import org.apache.arrow.vector.ValueVector;
+
+public class VectorWithOrdinal {
+ public final ValueVector vector;
+ public final int ordinal;
+
+ public VectorWithOrdinal(ValueVector v, int ordinal) {
+ this.vector = v;
+ this.ordinal = ordinal;
+ }
+}
\ No newline at end of file
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java
new file mode 100644
index 00000000000..264e241e739
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.vector.complex.impl; + +import java.util.Iterator; + +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; + + +abstract class AbstractBaseReader implements FieldReader{ + + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseReader.class); + private static final MajorType LATE_BIND_TYPE = new MajorType(MinorType.LATE, DataMode.OPTIONAL); + + private int index; + + public AbstractBaseReader() { + super(); + } + + public void setPosition(int index){ + this.index = index; + } + + int idx(){ + return index; + } + + @Override + public void reset() { + index = 0; + } + + @Override + public Iterator iterator() { + throw new IllegalStateException("The current reader doesn't support reading as a map."); + } + + public MajorType getType(){ + throw new IllegalStateException("The current reader doesn't support getting type information."); + } + + @Override + public MaterializedField getField() { + return MaterializedField.create("unknown", LATE_BIND_TYPE); + } + + @Override + public boolean next() { + throw new IllegalStateException("The current reader doesn't support getting next information."); + } + + @Override + public int size() { + throw new IllegalStateException("The current reader doesn't support getting size information."); + } + + @Override + public void read(UnionHolder holder) { + holder.reader = this; + holder.isSet = this.isSet() ? 1 : 0; + } + + @Override + public void read(int index, UnionHolder holder) { + throw new IllegalStateException("The current reader doesn't support reading union type"); + } + + @Override + public void copyAsValue(UnionWriter writer) { + throw new IllegalStateException("The current reader doesn't support reading union type"); + } + + @Override + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter)writer); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java new file mode 100644 index 00000000000..4e1e103a12e --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.writer.FieldWriter; + + +abstract class AbstractBaseWriter implements FieldWriter { + //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseWriter.class); + + final FieldWriter parent; + private int index; + + public AbstractBaseWriter(FieldWriter parent) { + this.parent = parent; + } + + @Override + public String toString() { + return super.toString() + "[index = " + index + ", parent = " + parent + "]"; + } + + @Override + public FieldWriter getParent() { + return parent; + } + + public boolean isRoot() { + return parent == null; + } + + int idx() { + return index; + } + + @Override + public void setPosition(int index) { + this.index = index; + } + + @Override + public void end() { + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java new file mode 100644 index 00000000000..4e2051fd4ef --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -0,0 +1,193 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.StateTool; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; + +import com.google.common.base.Preconditions; + +public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWriter { +// private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ComplexWriterImpl.class); + + private SingleMapWriter mapRoot; + private SingleListWriter listRoot; + private final MapVector container; + + Mode mode = Mode.INIT; + private final String name; + private final boolean unionEnabled; + + private enum Mode { INIT, MAP, LIST }; + + public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled){ + super(null); + this.name = name; + this.container = container; + this.unionEnabled = unionEnabled; + } + + public ComplexWriterImpl(String name, MapVector container){ + this(name, container, false); + } + + @Override + public MaterializedField getField() { + return container.getField(); + } + + @Override + public int getValueCapacity() { + return container.getValueCapacity(); + } + + private void check(Mode... 
modes){ + StateTool.check(mode, modes); + } + + @Override + public void reset(){ + setPosition(0); + } + + @Override + public void close() throws Exception { + clear(); + mapRoot.close(); + if (listRoot != null) { + listRoot.close(); + } + } + + @Override + public void clear(){ + switch(mode){ + case MAP: + mapRoot.clear(); + break; + case LIST: + listRoot.clear(); + break; + } + } + + @Override + public void setValueCount(int count){ + switch(mode){ + case MAP: + mapRoot.setValueCount(count); + break; + case LIST: + listRoot.setValueCount(count); + break; + } + } + + @Override + public void setPosition(int index){ + super.setPosition(index); + switch(mode){ + case MAP: + mapRoot.setPosition(index); + break; + case LIST: + listRoot.setPosition(index); + break; + } + } + + + public MapWriter directMap(){ + Preconditions.checkArgument(name == null); + + switch(mode){ + + case INIT: + MapVector map = (MapVector) container; + mapRoot = new SingleMapWriter(map, this, unionEnabled); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; + + case MAP: + break; + + default: + check(Mode.INIT, Mode.MAP); + } + + return mapRoot; + } + + @Override + public MapWriter rootAsMap() { + switch(mode){ + + case INIT: + MapVector map = container.addOrGet(name, Types.required(MinorType.MAP), MapVector.class); + mapRoot = new SingleMapWriter(map, this, unionEnabled); + mapRoot.setPosition(idx()); + mode = Mode.MAP; + break; + + case MAP: + break; + + default: + check(Mode.INIT, Mode.MAP); + } + + return mapRoot; + } + + + @Override + public void allocate() { + if(mapRoot != null) { + mapRoot.allocate(); + } else if(listRoot != null) { + listRoot.allocate(); + } + } + + @Override + public ListWriter rootAsList() { + switch(mode){ + + case INIT: + listRoot = new SingleListWriter(name, container, this); + listRoot.setPosition(idx()); + mode = Mode.LIST; + break; + + case LIST: + break; + + default: + check(Mode.INIT, Mode.MAP); + } + + return listRoot; + } + + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java new file mode 100644 index 00000000000..f8a9d4232aa --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/MapOrListWriterImpl.java @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapOrListWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.BitWriter; +import org.apache.arrow.vector.complex.writer.Float4Writer; +import org.apache.arrow.vector.complex.writer.Float8Writer; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.complex.writer.VarBinaryWriter; +import org.apache.arrow.vector.complex.writer.VarCharWriter; + +public class MapOrListWriterImpl implements MapOrListWriter { + + public final BaseWriter.MapWriter map; + public final BaseWriter.ListWriter list; + + public MapOrListWriterImpl(final BaseWriter.MapWriter writer) { + this.map = writer; + this.list = null; + } + + public MapOrListWriterImpl(final BaseWriter.ListWriter writer) { + this.map = null; + this.list = writer; + } + + public void start() { + if (map != null) { + map.start(); + } else { + list.startList(); + } + } + + public void end() { + if (map != null) { + map.end(); + } else { + list.endList(); + } + } + + public MapOrListWriter map(final String name) { + assert map != null; + return new MapOrListWriterImpl(map.map(name)); + } + + public MapOrListWriter listoftmap(final String name) { + assert list != null; + return new MapOrListWriterImpl(list.map()); + } + + public MapOrListWriter list(final String name) { + assert map != null; + return new MapOrListWriterImpl(map.list(name)); + } + + public boolean isMapWriter() { + return map != null; + } + + public boolean isListWriter() { + return list != null; + } + + public VarCharWriter varChar(final String name) { + return (map != null) ? map.varChar(name) : list.varChar(); + } + + public IntWriter integer(final String name) { + return (map != null) ? map.integer(name) : list.integer(); + } + + public BigIntWriter bigInt(final String name) { + return (map != null) ? map.bigInt(name) : list.bigInt(); + } + + public Float4Writer float4(final String name) { + return (map != null) ? map.float4(name) : list.float4(); + } + + public Float8Writer float8(final String name) { + return (map != null) ? map.float8(name) : list.float8(); + } + + public BitWriter bit(final String name) { + return (map != null) ? map.bit(name) : list.bit(); + } + + public VarBinaryWriter binary(final String name) { + return (map != null) ? map.varBinary(name) : list.varBinary(); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java new file mode 100644 index 00000000000..ea62e023608 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -0,0 +1,196 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex.impl; + +import java.lang.reflect.Constructor; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VectorDescriptor; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.complex.AbstractMapVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.util.TransferPair; + +/** + * This FieldWriter implementation delegates all FieldWriter API calls to an inner FieldWriter. This inner field writer + * can start as a specific type, and this class will promote the writer to a UnionWriter if a call is made that the specifically + * typed writer cannot handle. A new UnionVector is created, wrapping the original vector, and replaces the original vector + * in the parent vector, which can be either an AbstractMapVector or a ListVector. + */ +public class PromotableWriter extends AbstractPromotableFieldWriter { + + private final AbstractMapVector parentContainer; + private final ListVector listVector; + private int position; + + private enum State { + UNTYPED, SINGLE, UNION + } + + private MinorType type; + private ValueVector vector; + private UnionVector unionVector; + private State state; + private FieldWriter writer; + + public PromotableWriter(ValueVector v, AbstractMapVector parentContainer) { + super(null); + this.parentContainer = parentContainer; + this.listVector = null; + init(v); + } + + public PromotableWriter(ValueVector v, ListVector listVector) { + super(null); + this.listVector = listVector; + this.parentContainer = null; + init(v); + } + + private void init(ValueVector v) { + if (v instanceof UnionVector) { + state = State.UNION; + unionVector = (UnionVector) v; + writer = new UnionWriter(unionVector); + } else if (v instanceof ZeroVector) { + state = State.UNTYPED; + } else { + setWriter(v); + } + } + + private void setWriter(ValueVector v) { + state = State.SINGLE; + vector = v; + type = v.getField().getType().getMinorType(); + Class writerClass = BasicTypeHelper + .getWriterImpl(v.getField().getType().getMinorType(), v.getField().getDataMode()); + if (writerClass.equals(SingleListWriter.class)) { + writerClass = UnionListWriter.class; + } + Class vectorClass = BasicTypeHelper.getValueVectorClass(v.getField().getType().getMinorType(), v.getField() + .getDataMode()); + try { + Constructor constructor = null; + for (Constructor c : writerClass.getConstructors()) { + if (c.getParameterTypes().length == 3) { + constructor = c; + } + } + if (constructor == null) { + constructor = writerClass.getConstructor(vectorClass, AbstractFieldWriter.class); + writer = (FieldWriter) constructor.newInstance(vector, null); + } else { + writer = (FieldWriter) constructor.newInstance(vector, null, true); + } + } catch (ReflectiveOperationException e) { + throw new 
RuntimeException(e); + } + } + + @Override + public void setPosition(int index) { + super.setPosition(index); + FieldWriter w = getWriter(); + if (w == null) { + position = index; + } else { + w.setPosition(index); + } + } + + protected FieldWriter getWriter(MinorType type) { + if (state == State.UNION) { + return writer; + } + if (state == State.UNTYPED) { + if (type == null) { + return null; + } + ValueVector v = listVector.addOrGetVector(new VectorDescriptor(new MajorType(type, DataMode.OPTIONAL))).getVector(); + v.allocateNew(); + setWriter(v); + writer.setPosition(position); + } + if (type != this.type) { + return promoteToUnion(); + } + return writer; + } + + @Override + public boolean isEmptyMap() { + return writer.isEmptyMap(); + } + + protected FieldWriter getWriter() { + return getWriter(type); + } + + private FieldWriter promoteToUnion() { + String name = vector.getField().getLastName(); + TransferPair tp = vector.getTransferPair(vector.getField().getType().getMinorType().name().toLowerCase(), vector.getAllocator()); + tp.transfer(); + if (parentContainer != null) { + unionVector = parentContainer.addOrGet(name, new MajorType(MinorType.UNION, DataMode.OPTIONAL), UnionVector.class); + } else if (listVector != null) { + unionVector = listVector.promoteToUnion(); + } + unionVector.addVector(tp.getTo()); + writer = new UnionWriter(unionVector); + writer.setPosition(idx()); + for (int i = 0; i < idx(); i++) { + unionVector.getMutator().setType(i, vector.getField().getType().getMinorType()); + } + vector = null; + state = State.UNION; + return writer; + } + + @Override + public void allocate() { + getWriter().allocate(); + } + + @Override + public void clear() { + getWriter().clear(); + } + + @Override + public MaterializedField getField() { + return getWriter().getField(); + } + + @Override + public int getValueCapacity() { + return getWriter().getValueCapacity(); + } + + @Override + public void close() throws Exception { + getWriter().close(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java new file mode 100644 index 00000000000..dd1a152e2f6 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java @@ -0,0 +1,145 @@ +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.RepeatedListVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.holders.RepeatedListHolder; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; + +public class RepeatedListReaderImpl extends AbstractFieldReader{ + private static final int NO_VALUES = Integer.MAX_VALUE - 1; + private static final MajorType TYPE = new MajorType(MinorType.LIST, DataMode.REPEATED); + private final String name; + private final RepeatedListVector container; + private FieldReader reader; + + public RepeatedListReaderImpl(String name, RepeatedListVector container) { + super(); + this.name = name; + this.container = container; + } + + @Override + public MajorType getType() { + return TYPE; + } + + @Override + public void copyAsValue(ListWriter writer) { + if (currentOffset == NO_VALUES) { + return; + } + RepeatedListWriter impl = (RepeatedListWriter) writer; + impl.container.copyFromSafe(idx(), impl.idx(), container); + } + + @Override + public void copyAsField(String name, MapWriter writer) { + if (currentOffset == NO_VALUES) { + return; + } + RepeatedListWriter impl = (RepeatedListWriter) writer.list(name); + impl.container.copyFromSafe(idx(), impl.idx(), container); + } + + private int currentOffset; + private int maxOffset; + + @Override + public void reset() { + super.reset(); + currentOffset = 0; + maxOffset = 0; + if (reader != null) { + reader.reset(); + } + reader = null; + } + + @Override + public int size() { + return maxOffset - currentOffset; + } + + @Override + public void setPosition(int index) { + if (index < 0 || index == NO_VALUES) { + currentOffset = NO_VALUES; + return; + } + + super.setPosition(index); + RepeatedListHolder h = new RepeatedListHolder(); + container.getAccessor().get(index, h); + if (h.start == h.end) { + currentOffset = NO_VALUES; + } else { + currentOffset = h.start-1; + maxOffset = h.end; + if(reader != null) { + reader.setPosition(currentOffset); + } + } + } + + @Override + public boolean next() { + if (currentOffset +1 < maxOffset) { + currentOffset++; + if (reader != null) { + reader.setPosition(currentOffset); + } + return true; + } else { + currentOffset = NO_VALUES; + return false; + } + } + + @Override + public Object readObject() { + return container.getAccessor().getObject(idx()); + } + + @Override + public FieldReader reader() { + if (reader == null) { + ValueVector child = container.getChild(name); + if (child == null) { + reader = NullReader.INSTANCE; + } else { + reader = child.getReader(); + } + reader.setPosition(currentOffset); + } + return reader; + } + + public boolean isSet() { + return true; + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java new file mode 100644 index 00000000000..09a831d8329 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java @@ -0,0 +1,192 @@ +/******************************************************************************* + 
+ * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + +import java.util.Map; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.RepeatedMapVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.holders.RepeatedMapHolder; +import org.apache.arrow.vector.types.Types.MajorType; + +import com.google.common.collect.Maps; + +@SuppressWarnings("unused") +public class RepeatedMapReaderImpl extends AbstractFieldReader{ + private static final int NO_VALUES = Integer.MAX_VALUE - 1; + + private final RepeatedMapVector vector; + private final Map fields = Maps.newHashMap(); + + public RepeatedMapReaderImpl(RepeatedMapVector vector) { + this.vector = vector; + } + + private void setChildrenPosition(int index) { + for (FieldReader r : fields.values()) { + r.setPosition(index); + } + } + + @Override + public FieldReader reader(String name) { + FieldReader reader = fields.get(name); + if (reader == null) { + ValueVector child = vector.getChild(name); + if (child == null) { + reader = NullReader.INSTANCE; + } else { + reader = child.getReader(); + } + fields.put(name, reader); + reader.setPosition(currentOffset); + } + return reader; + } + + @Override + public FieldReader reader() { + if (currentOffset == NO_VALUES) { + return NullReader.INSTANCE; + } + + setChildrenPosition(currentOffset); + return new SingleLikeRepeatedMapReaderImpl(vector, this); + } + + private int currentOffset; + private int maxOffset; + + @Override + public void reset() { + super.reset(); + currentOffset = 0; + maxOffset = 0; + for (FieldReader reader:fields.values()) { + reader.reset(); + } + fields.clear(); + } + + @Override + public int size() { + if (isNull()) { + return 0; + } + return maxOffset - (currentOffset < 0 ? 
0 : currentOffset); + } + + @Override + public void setPosition(int index) { + if (index < 0 || index == NO_VALUES) { + currentOffset = NO_VALUES; + return; + } + + super.setPosition(index); + RepeatedMapHolder h = new RepeatedMapHolder(); + vector.getAccessor().get(index, h); + if (h.start == h.end) { + currentOffset = NO_VALUES; + } else { + currentOffset = h.start-1; + maxOffset = h.end; + setChildrenPosition(currentOffset); + } + } + + public void setSinglePosition(int index, int childIndex) { + super.setPosition(index); + RepeatedMapHolder h = new RepeatedMapHolder(); + vector.getAccessor().get(index, h); + if (h.start == h.end) { + currentOffset = NO_VALUES; + } else { + int singleOffset = h.start + childIndex; + assert singleOffset < h.end; + currentOffset = singleOffset; + maxOffset = singleOffset + 1; + setChildrenPosition(singleOffset); + } + } + + @Override + public boolean next() { + if (currentOffset +1 < maxOffset) { + setChildrenPosition(++currentOffset); + return true; + } else { + currentOffset = NO_VALUES; + return false; + } + } + + public boolean isNull() { + return currentOffset == NO_VALUES; + } + + @Override + public Object readObject() { + return vector.getAccessor().getObject(idx()); + } + + @Override + public MajorType getType() { + return vector.getField().getType(); + } + + @Override + public java.util.Iterator iterator() { + return vector.fieldNameIterator(); + } + + @Override + public boolean isSet() { + return true; + } + + @Override + public void copyAsValue(MapWriter writer) { + if (currentOffset == NO_VALUES) { + return; + } + RepeatedMapWriter impl = (RepeatedMapWriter) writer; + impl.container.copyFromSafe(idx(), impl.idx(), vector); + } + + public void copyAsValueSingle(MapWriter writer) { + if (currentOffset == NO_VALUES) { + return; + } + SingleMapWriter impl = (SingleMapWriter) writer; + impl.container.copyFromSafe(currentOffset, impl.idx(), vector); + } + + @Override + public void copyAsField(String name, MapWriter writer) { + if (currentOffset == NO_VALUES) { + return; + } + RepeatedMapWriter impl = (RepeatedMapWriter) writer.map(name); + impl.container.copyFromSafe(idx(), impl.idx(), vector); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java new file mode 100644 index 00000000000..086d26e1194 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.complex.impl; + +import java.util.Iterator; + +import org.apache.arrow.vector.complex.RepeatedMapVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; + +public class SingleLikeRepeatedMapReaderImpl extends AbstractFieldReader{ + + private RepeatedMapReaderImpl delegate; + + public SingleLikeRepeatedMapReaderImpl(RepeatedMapVector vector, FieldReader delegate) { + this.delegate = (RepeatedMapReaderImpl) delegate; + } + + @Override + public int size() { + throw new UnsupportedOperationException("You can't call size on a single map reader."); + } + + @Override + public boolean next() { + throw new UnsupportedOperationException("You can't call next on a single map reader."); + } + + @Override + public MajorType getType() { + return Types.required(MinorType.MAP); + } + + + @Override + public void copyAsValue(MapWriter writer) { + delegate.copyAsValueSingle(writer); + } + + public void copyAsValueSingle(MapWriter writer){ + delegate.copyAsValueSingle(writer); + } + + @Override + public FieldReader reader(String name) { + return delegate.reader(name); + } + + @Override + public void setPosition(int index) { + delegate.setPosition(index); + } + + @Override + public Object readObject() { + return delegate.readObject(); + } + + @Override + public Iterator iterator() { + return delegate.iterator(); + } + + @Override + public boolean isSet() { + return ! delegate.isNull(); + } + + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java new file mode 100644 index 00000000000..f16f628603d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java @@ -0,0 +1,88 @@ + +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + + +import org.apache.arrow.vector.complex.AbstractContainerVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; + +@SuppressWarnings("unused") +public class SingleListReaderImpl extends AbstractFieldReader{ + + private static final MajorType TYPE = Types.optional(MinorType.LIST); + private final String name; + private final AbstractContainerVector container; + private FieldReader reader; + + public SingleListReaderImpl(String name, AbstractContainerVector container) { + super(); + this.name = name; + this.container = container; + } + + @Override + public MajorType getType() { + return TYPE; + } + + + @Override + public void setPosition(int index) { + super.setPosition(index); + if (reader != null) { + reader.setPosition(index); + } + } + + @Override + public Object readObject() { + return reader.readObject(); + } + + @Override + public FieldReader reader() { + if (reader == null) { + reader = container.getChild(name).getReader(); + setPosition(idx()); + } + return reader; + } + + @Override + public boolean isSet() { + return false; + } + + @Override + public void copyAsValue(ListWriter writer) { + throw new UnsupportedOperationException("Generic list copying not yet supported. Please resolve to typed list."); + } + + @Override + public void copyAsField(String name, MapWriter writer) { + throw new UnsupportedOperationException("Generic list copying not yet supported. Please resolve to typed list."); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java new file mode 100644 index 00000000000..84b99801419 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -0,0 +1,108 @@ + + +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + + +import java.util.Map; + +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.Types.MajorType; + +import com.google.common.collect.Maps; + +@SuppressWarnings("unused") +public class SingleMapReaderImpl extends AbstractFieldReader{ + + private final MapVector vector; + private final Map fields = Maps.newHashMap(); + + public SingleMapReaderImpl(MapVector vector) { + this.vector = vector; + } + + private void setChildrenPosition(int index){ + for(FieldReader r : fields.values()){ + r.setPosition(index); + } + } + + @Override + public FieldReader reader(String name){ + FieldReader reader = fields.get(name); + if(reader == null){ + ValueVector child = vector.getChild(name); + if(child == null){ + reader = NullReader.INSTANCE; + }else{ + reader = child.getReader(); + } + fields.put(name, reader); + reader.setPosition(idx()); + } + return reader; + } + + @Override + public void setPosition(int index){ + super.setPosition(index); + for(FieldReader r : fields.values()){ + r.setPosition(index); + } + } + + @Override + public Object readObject() { + return vector.getAccessor().getObject(idx()); + } + + @Override + public boolean isSet() { + return true; + } + + @Override + public MajorType getType(){ + return vector.getField().getType(); + } + + @Override + public java.util.Iterator iterator(){ + return vector.fieldNameIterator(); + } + + @Override + public void copyAsValue(MapWriter writer){ + SingleMapWriter impl = (SingleMapWriter) writer; + impl.container.copyFromSafe(idx(), impl.idx(), vector); + } + + @Override + public void copyAsField(String name, MapWriter writer){ + SingleMapWriter impl = (SingleMapWriter) writer.map(name); + impl.container.copyFromSafe(idx(), impl.idx(), vector); + } + + +} + diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java new file mode 100644 index 00000000000..9b54d02e571 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -0,0 +1,98 @@ +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.holders.UnionHolder; +import org.apache.arrow.vector.types.Types.DataMode; +import org.apache.arrow.vector.types.Types.MajorType; +import org.apache.arrow.vector.types.Types.MinorType; + +public class UnionListReader extends AbstractFieldReader { + + private ListVector vector; + private ValueVector data; + private UInt4Vector offsets; + + public UnionListReader(ListVector vector) { + this.vector = vector; + this.data = vector.getDataVector(); + this.offsets = vector.getOffsetVector(); + } + + @Override + public boolean isSet() { + return true; + } + + MajorType type = new MajorType(MinorType.LIST, DataMode.OPTIONAL); + + public MajorType getType() { + return type; + } + + private int currentOffset; + private int maxOffset; + + @Override + public void setPosition(int index) { + super.setPosition(index); + currentOffset = offsets.getAccessor().get(index) - 1; + maxOffset = offsets.getAccessor().get(index + 1); + } + + @Override + public FieldReader reader() { + return data.getReader(); + } + + @Override + public Object readObject() { + return vector.getAccessor().getObject(idx()); + } + + @Override + public void read(int index, UnionHolder holder) { + setPosition(idx()); + for (int i = -1; i < index; i++) { + next(); + } + holder.reader = data.getReader(); + holder.isSet = data.getReader().isSet() ? 1 : 0; + } + + @Override + public boolean next() { + if (currentOffset + 1 < maxOffset) { + data.getReader().setPosition(++currentOffset); + return true; + } else { + return false; + } + } + + public void copyAsValue(ListWriter writer) { + ComplexCopier.copy(this, (FieldWriter) writer); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java new file mode 100644 index 00000000000..c4eb3dc739a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/reader/FieldReader.java @@ -0,0 +1,29 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.arrow.vector.complex.reader;
+
+import org.apache.arrow.vector.complex.reader.BaseReader.ListReader;
+import org.apache.arrow.vector.complex.reader.BaseReader.MapReader;
+import org.apache.arrow.vector.complex.reader.BaseReader.RepeatedListReader;
+import org.apache.arrow.vector.complex.reader.BaseReader.RepeatedMapReader;
+import org.apache.arrow.vector.complex.reader.BaseReader.ScalarReader;
+
+
+
+public interface FieldReader extends MapReader, ListReader, ScalarReader, RepeatedMapReader, RepeatedListReader {
+}
\ No newline at end of file
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java
new file mode 100644
index 00000000000..ecffe0bec0e
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/writer/FieldWriter.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.complex.writer;
+
+import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter;
+import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
+import org.apache.arrow.vector.complex.writer.BaseWriter.ScalarWriter;
+
+public interface FieldWriter extends MapWriter, ListWriter, ScalarWriter {
+  void allocate();
+  void clear();
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java
new file mode 100644
index 00000000000..0f9310da55b
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ComplexHolder.java
@@ -0,0 +1,25 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+
+public class ComplexHolder implements ValueHolder {
+  public FieldReader reader;
+  public int isSet;
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java
new file mode 100644
index 00000000000..5a5fe0305d8
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.types.Types;
+
+/*
+ * Holder class for the vector ObjectVector. This holder internally stores a
+ * reference to an object. The ObjectVector maintains an array of these objects.
+ * This holder can be used only as a workspace variable in aggregate functions.
+ * Using this holder should be avoided; stick to the native holder types.
+ */
+@Deprecated
+public class ObjectHolder implements ValueHolder {
+  public static final Types.MajorType TYPE = Types.required(Types.MinorType.GENERIC_OBJECT);
+
+  public Types.MajorType getType() {
+    return TYPE;
+  }
+
+  public Object obj;
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java
new file mode 100644
index 00000000000..83506cdc175
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedListHolder.java
@@ -0,0 +1,23 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.holders;
+
+public final class RepeatedListHolder implements ValueHolder {
+  public int start;
+  public int end;
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java
new file mode 100644
index 00000000000..85d782b3818
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/RepeatedMapHolder.java
@@ -0,0 +1,23 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.holders;
+
+public final class RepeatedMapHolder implements ValueHolder {
+  public int start;
+  public int end;
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java
new file mode 100644
index 00000000000..b868a620f98
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.holders;
+
+import org.apache.arrow.vector.complex.reader.FieldReader;
+import org.apache.arrow.vector.types.Types.DataMode;
+import org.apache.arrow.vector.types.Types.MajorType;
+import org.apache.arrow.vector.types.Types.MinorType;
+
+public class UnionHolder implements ValueHolder {
+  public static final MajorType TYPE = new MajorType(MinorType.UNION, DataMode.OPTIONAL);
+  public FieldReader reader;
+  public int isSet;
+
+  public MajorType getType() {
+    return reader.getType();
+  }
+
+  public boolean isSet() {
+    return isSet == 1;
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java
new file mode 100644
index 00000000000..88cbcd4a8c3
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.holders;
+
+/**
+ * Wrapper object for an individual value in Drill.
+ *
+ * ValueHolders are designed to be mutable wrapper objects for defining clean
+ * APIs that access data in Drill. For performance, object creation is avoided
+ * at all costs throughout execution. For this reason, ValueHolders are
+ * disallowed from implementing any methods; this allows them to be
+ * replaced by their java primitive inner members during optimization of
+ * run-time generated code.
+ */
+public interface ValueHolder {
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java b/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java
new file mode 100644
index 00000000000..c73098b2a85
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.types;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedHashSet;
+import java.util.Objects;
+
+import org.apache.arrow.vector.types.Types.DataMode;
+import org.apache.arrow.vector.types.Types.MajorType;
+import org.apache.arrow.vector.util.BasicTypeHelper;
+
+
+public class MaterializedField {
+  private final String name;
+  private final MajorType type;
+  // use an ordered set, as existing code relies on iteration order (e.g. the parquet writer)
+  private final LinkedHashSet<MaterializedField> children;
+
+  MaterializedField(String name, MajorType type, LinkedHashSet<MaterializedField> children) {
+    this.name = name;
+    this.type = type;
+    this.children = children;
+  }
+
+  public Collection<MaterializedField> getChildren() {
+    return new ArrayList<>(children);
+  }
+
+  public MaterializedField newWithChild(MaterializedField child) {
+    MaterializedField newField = clone();
+    newField.addChild(child);
+    return newField;
+  }
+
+  public void addChild(MaterializedField field) {
+    children.add(field);
+  }
+
+  public MaterializedField clone() {
+    return withPathAndType(name, getType());
+  }
+
+  public MaterializedField withType(MajorType type) {
+    return withPathAndType(name, type);
+  }
+
+  public MaterializedField withPath(String name) {
+    return withPathAndType(name, getType());
+  }
+
+  public MaterializedField withPathAndType(String name, final MajorType type) {
+    final LinkedHashSet<MaterializedField> newChildren = new LinkedHashSet<>(children.size());
+    for (final MaterializedField child : children) {
+      newChildren.add(child.clone());
+    }
+    return new MaterializedField(name, type, newChildren);
+  }
+
+//  public String getLastName(){
+//    PathSegment seg = key.path.getRootSegment();
+//    while (seg.getChild() != null) {
+//      seg = seg.getChild();
+//    }
+//    return seg.getNameSegment().getPath();
+//  }
+
+
+  // TODO: rewrite this as a direct match rather than a conversion followed by a match.
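+
+  // A minimal usage sketch (illustrative only; the field names below are
+  // hypothetical): build a nullable VARCHAR field and nest it under a MAP
+  // field. newWithChild() clones the receiver, so "person" itself is
+  // left unchanged.
+  //
+  //   MaterializedField name = MaterializedField.create("name", Types.optional(Types.MinorType.VARCHAR));
+  //   MaterializedField person = MaterializedField.create("person", Types.required(Types.MinorType.MAP));
+  //   MaterializedField personWithName = person.newWithChild(name);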
+// public boolean matches(SerializedField booleanfield){ +// MaterializedField f = create(field); +// return f.equals(this); +// } + + public static MaterializedField create(String name, MajorType type){ + return new MaterializedField(name, type, new LinkedHashSet()); + } + +// public String getName(){ +// StringBuilder sb = new StringBuilder(); +// boolean first = true; +// for(NamePart np : def.getNameList()){ +// if(np.getType() == Type.ARRAY){ +// sb.append("[]"); +// }else{ +// if(first){ +// first = false; +// }else{ +// sb.append("."); +// } +// sb.append('`'); +// sb.append(np.getName()); +// sb.append('`'); +// +// } +// } +// return sb.toString(); +// } + + public String getPath() { + return getName(); + } + + public String getLastName() { + return getName(); + } + + public String getName() { + return name; + } + +// public int getWidth() { +// return type.getWidth(); +// } + + public MajorType getType() { + return type; + } + + public int getScale() { + return type.getScale(); + } + public int getPrecision() { + return type.getPrecision(); + } + public boolean isNullable() { + return type.getMode() == DataMode.OPTIONAL; + } + + public DataMode getDataMode() { + return type.getMode(); + } + + public MaterializedField getOtherNullableVersion(){ + MajorType mt = type; + DataMode newDataMode = null; + switch (mt.getMode()){ + case OPTIONAL: + newDataMode = DataMode.REQUIRED; + break; + case REQUIRED: + newDataMode = DataMode.OPTIONAL; + break; + default: + throw new UnsupportedOperationException(); + } + return new MaterializedField(name, new MajorType(mt.getMinorType(), newDataMode, mt.getPrecision(), mt.getScale(), mt.getTimezone(), mt.getSubTypes()), children); + } + + public Class getValueClass() { + return BasicTypeHelper.getValueVectorClass(getType().getMinorType(), getDataMode()); + } + + @Override + public int hashCode() { + return Objects.hash(this.name, this.type, this.children); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + MaterializedField other = (MaterializedField) obj; + // DRILL-1872: Compute equals only on key. See also the comment + // in MapVector$MapTransferPair + + return this.name.equalsIgnoreCase(other.name) && + Objects.equals(this.type, other.type); + } + + + @Override + public String toString() { + final int maxLen = 10; + String childStr = children != null && !children.isEmpty() ? toString(children, maxLen) : ""; + return name + "(" + type.getMinorType().name() + ":" + type.getMode().name() + ")" + childStr; + } + + + private String toString(Collection collection, int maxLen) { + StringBuilder builder = new StringBuilder(); + builder.append("["); + int i = 0; + for (Iterator iterator = collection.iterator(); iterator.hasNext() && i < maxLen; i++) { + if (i > 0){ + builder.append(", "); + } + builder.append(iterator.next()); + } + builder.append("]"); + return builder.toString(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java new file mode 100644 index 00000000000..cef892ce880 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.types;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+public class Types {
+  public enum MinorType {
+    LATE, // late binding type
+    MAP, // an empty map column. Useful for conceptual setup. Children listed within here
+
+    TINYINT, // single byte signed integer
+    SMALLINT, // two byte signed integer
+    INT, // four byte signed integer
+    BIGINT, // eight byte signed integer
+    DECIMAL9, // a decimal supporting precision between 1 and 9
+    DECIMAL18, // a decimal supporting precision between 10 and 18
+    DECIMAL28SPARSE, // a decimal supporting precision between 19 and 28
+    DECIMAL38SPARSE, // a decimal supporting precision between 29 and 38
+    MONEY, // signed decimal with two digit precision
+    DATE, // days since 4713bc
+    TIME, // time in micros before or after 2000/1/1
+    TIMETZ, // time in micros before or after 2000/1/1 with timezone
+    TIMESTAMPTZ, // unix epoch time in millis
+    TIMESTAMP, // TBD
+    INTERVAL, // TBD
+    FLOAT4, // 4 byte ieee 754
+    FLOAT8, // 8 byte ieee 754
+    BIT, // single bit value (boolean)
+    FIXEDCHAR, // utf8 fixed length string, padded with spaces
+    FIXED16CHAR,
+    FIXEDBINARY, // fixed length binary, padded with 0 bytes
+    VARCHAR, // utf8 variable length string
+    VAR16CHAR, // utf16 variable length string
+    VARBINARY, // variable length binary
+    UINT1, // unsigned 1 byte integer
+    UINT2, // unsigned 2 byte integer
+    UINT4, // unsigned 4 byte integer
+    UINT8, // unsigned 8 byte integer
+    DECIMAL28DENSE, // dense decimal representation, supporting precision between 19 and 28
+    DECIMAL38DENSE, // dense decimal representation, supporting precision between 28 and 38
+    NULL, // a value of unknown type (e.g. a missing reference).
+    INTERVALYEAR, // Interval type specifying YEAR to MONTH
+    INTERVALDAY, // Interval type specifying DAY to SECONDS
+    LIST,
+    GENERIC_OBJECT,
+    UNION
+  }
+
+  public enum DataMode {
+    REQUIRED,
+    OPTIONAL,
+    REPEATED
+  }
+
+  public static class MajorType {
+    private MinorType minorType;
+    private DataMode mode;
+    private Integer precision;
+    private Integer scale;
+    private Integer timezone;
+    private List<MinorType> subTypes;
+
+    public MajorType(MinorType minorType, DataMode mode) {
+      this(minorType, mode, null, null, null, null);
+    }
+
+    public MajorType(MinorType minorType, DataMode mode, Integer precision, Integer scale) {
+      this(minorType, mode, precision, scale, null, null);
+    }
+
+    public MajorType(MinorType minorType, DataMode mode, Integer precision, Integer scale, Integer timezone, List<MinorType> subTypes) {
+      this.minorType = minorType;
+      this.mode = mode;
+      this.precision = precision;
+      this.scale = scale;
+      this.timezone = timezone;
+      this.subTypes = subTypes;
+    }
+
+    public MinorType getMinorType() {
+      return minorType;
+    }
+
+    public DataMode getMode() {
+      return mode;
+    }
+
+    public Integer getPrecision() {
+      return precision;
+    }
+
+    public Integer getScale() {
+      return scale;
+    }
+
+    public Integer getTimezone() {
+      return timezone;
+    }
+
+    public List<MinorType> getSubTypes() {
+      return subTypes;
+    }
+  }
+
+  public static MajorType required(MinorType minorType) {
+    return new MajorType(minorType, DataMode.REQUIRED);
+  }
+  public static MajorType optional(MinorType minorType) {
+    return new MajorType(minorType, DataMode.OPTIONAL);
+  }
+  public static MajorType repeated(MinorType minorType) {
+    return new MajorType(minorType, DataMode.REPEATED);
+  }
+}
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
new file mode 100644
index 00000000000..2bdfd70b229
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java
@@ -0,0 +1,233 @@
+/*******************************************************************************
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/ +package org.apache.arrow.vector.util; + +import io.netty.buffer.ArrowBuf; +import io.netty.util.internal.PlatformDependent; + +import org.apache.arrow.memory.BoundsChecking; + +import com.google.common.primitives.UnsignedLongs; + +public class ByteFunctionHelpers { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ByteFunctionHelpers.class); + + /** + * Helper function to check for equality of bytes in two DrillBuffers + * + * @param left Left DrillBuf for comparison + * @param lStart start offset in the buffer + * @param lEnd end offset in the buffer + * @param right Right DrillBuf for comparison + * @param rStart start offset in the buffer + * @param rEnd end offset in the buffer + * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise + */ + public static final int equal(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){ + if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { + left.checkBytes(lStart, lEnd); + right.checkBytes(rStart, rEnd); + } + return memEqual(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), rStart, rEnd); + } + + private static final int memEqual(final long laddr, int lStart, int lEnd, final long raddr, int rStart, + final int rEnd) { + + int n = lEnd - lStart; + if (n == rEnd - rStart) { + long lPos = laddr + lStart; + long rPos = raddr + rStart; + + while (n > 7) { + long leftLong = PlatformDependent.getLong(lPos); + long rightLong = PlatformDependent.getLong(rPos); + if (leftLong != rightLong) { + return 0; + } + lPos += 8; + rPos += 8; + n -= 8; + } + while (n-- != 0) { + byte leftByte = PlatformDependent.getByte(lPos); + byte rightByte = PlatformDependent.getByte(rPos); + if (leftByte != rightByte) { + return 0; + } + lPos++; + rPos++; + } + return 1; + } else { + return 0; + } + } + + /** + * Helper function to compare a set of bytes in two DrillBuffers. + * + * Function will check data before completing in the case that + * + * @param left Left DrillBuf to compare + * @param lStart start offset in the buffer + * @param lEnd end offset in the buffer + * @param right Right DrillBuf to compare + * @param rStart start offset in the buffer + * @param rEnd end offset in the buffer + * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise + */ + public static final int compare(final ArrowBuf left, int lStart, int lEnd, final ArrowBuf right, int rStart, int rEnd){ + if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { + left.checkBytes(lStart, lEnd); + right.checkBytes(rStart, rEnd); + } + return memcmp(left.memoryAddress(), lStart, lEnd, right.memoryAddress(), rStart, rEnd); + } + + private static final int memcmp(final long laddr, int lStart, int lEnd, final long raddr, int rStart, final int rEnd) { + int lLen = lEnd - lStart; + int rLen = rEnd - rStart; + int n = Math.min(rLen, lLen); + long lPos = laddr + lStart; + long rPos = raddr + rStart; + + while (n > 7) { + long leftLong = PlatformDependent.getLong(lPos); + long rightLong = PlatformDependent.getLong(rPos); + if (leftLong != rightLong) { + return UnsignedLongs.compare(Long.reverseBytes(leftLong), Long.reverseBytes(rightLong)); + } + lPos += 8; + rPos += 8; + n -= 8; + } + + while (n-- != 0) { + byte leftByte = PlatformDependent.getByte(lPos); + byte rightByte = PlatformDependent.getByte(rPos); + if (leftByte != rightByte) { + return ((leftByte & 0xFF) - (rightByte & 0xFF)) > 0 ? 
1 : -1; + } + lPos++; + rPos++; + } + + if (lLen == rLen) { + return 0; + } + + return lLen > rLen ? 1 : -1; + + } + + /** + * Helper function to compare a set of bytes in DrillBuf to a ByteArray. + * + * @param left Left DrillBuf for comparison purposes + * @param lStart start offset in the buffer + * @param lEnd end offset in the buffer + * @param right second input to be compared + * @param rStart start offset in the byte array + * @param rEnd end offset in the byte array + * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise + */ + public static final int compare(final ArrowBuf left, int lStart, int lEnd, final byte[] right, int rStart, final int rEnd) { + if (BoundsChecking.BOUNDS_CHECKING_ENABLED) { + left.checkBytes(lStart, lEnd); + } + return memcmp(left.memoryAddress(), lStart, lEnd, right, rStart, rEnd); + } + + + private static final int memcmp(final long laddr, int lStart, int lEnd, final byte[] right, int rStart, final int rEnd) { + int lLen = lEnd - lStart; + int rLen = rEnd - rStart; + int n = Math.min(rLen, lLen); + long lPos = laddr + lStart; + int rPos = rStart; + + while (n-- != 0) { + byte leftByte = PlatformDependent.getByte(lPos); + byte rightByte = right[rPos]; + if (leftByte != rightByte) { + return ((leftByte & 0xFF) - (rightByte & 0xFF)) > 0 ? 1 : -1; + } + lPos++; + rPos++; + } + + if (lLen == rLen) { + return 0; + } + + return lLen > rLen ? 1 : -1; + } + + /* + * Following are helper functions to interact with sparse decimal represented in a byte array. + */ + + // Get the integer ignore the sign + public static int getInteger(byte[] b, int index) { + return getInteger(b, index, true); + } + // Get the integer, ignore the sign + public static int getInteger(byte[] b, int index, boolean ignoreSign) { + int startIndex = index * DecimalUtility.INTEGER_SIZE; + + if (index == 0 && ignoreSign == true) { + return (b[startIndex + 3] & 0xFF) | + (b[startIndex + 2] & 0xFF) << 8 | + (b[startIndex + 1] & 0xFF) << 16 | + (b[startIndex] & 0x7F) << 24; + } + + return ((b[startIndex + 3] & 0xFF) | + (b[startIndex + 2] & 0xFF) << 8 | + (b[startIndex + 1] & 0xFF) << 16 | + (b[startIndex] & 0xFF) << 24); + + } + + // Set integer in the byte array + public static void setInteger(byte[] b, int index, int value) { + int startIndex = index * DecimalUtility.INTEGER_SIZE; + b[startIndex] = (byte) ((value >> 24) & 0xFF); + b[startIndex + 1] = (byte) ((value >> 16) & 0xFF); + b[startIndex + 2] = (byte) ((value >> 8) & 0xFF); + b[startIndex + 3] = (byte) ((value) & 0xFF); + } + + // Set the sign in a sparse decimal representation + public static void setSign(byte[] b, boolean sign) { + int value = getInteger(b, 0); + if (sign == true) { + setInteger(b, 0, value | 0x80000000); + } else { + setInteger(b, 0, value & 0x7FFFFFFF); + } + } + + // Get the sign + public static boolean getSign(byte[] b) { + return ((getInteger(b, 0, false) & 0x80000000) != 0); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java new file mode 100644 index 00000000000..249834270b3 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/CallBack.java @@ -0,0 +1,23 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.util; + + +public interface CallBack { + public void doWork(); +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java new file mode 100644 index 00000000000..1eb2c13cd4c --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java @@ -0,0 +1,91 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.util; + +import java.math.BigDecimal; + +import org.apache.arrow.vector.types.Types; + +public class CoreDecimalUtility { + static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CoreDecimalUtility.class); + + public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { + // Truncate or pad to set the input to the correct scale + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); + + return (input.unscaledValue().longValue()); + } + + public static int getMaxPrecision(Types.MinorType decimalType) { + if (decimalType == Types.MinorType.DECIMAL9) { + return 9; + } else if (decimalType == Types.MinorType.DECIMAL18) { + return 18; + } else if (decimalType == Types.MinorType.DECIMAL28SPARSE) { + return 28; + } else if (decimalType == Types.MinorType.DECIMAL38SPARSE) { + return 38; + } + return 0; + } + + /* + * Function returns the Minor decimal type given the precision + */ + public static Types.MinorType getDecimalDataType(int precision) { + if (precision <= 9) { + return Types.MinorType.DECIMAL9; + } else if (precision <= 18) { + return Types.MinorType.DECIMAL18; + } else if (precision <= 28) { + return Types.MinorType.DECIMAL28SPARSE; + } else { + return Types.MinorType.DECIMAL38SPARSE; + } + } + + /* + * Given a precision it provides the max precision of that decimal data type; + * For eg: given the precision 12, we would use DECIMAL18 to store the data + * which has a max precision range of 18 digits + */ + public static int getPrecisionRange(int precision) { + return getMaxPrecision(getDecimalDataType(precision)); + } + public static int getDecimal9FromBigDecimal(BigDecimal input, int scale, int precision) { + // Truncate/ or pad to set the input to the correct scale + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); + + return (input.unscaledValue().intValue()); + } + + /* + * Helper function to detect if the given data type is Decimal + */ + public static boolean isDecimalType(Types.MajorType type) { + return isDecimalType(type.getMinorType()); + } + + public static boolean isDecimalType(Types.MinorType minorType) { + if (minorType == Types.MinorType.DECIMAL9 || minorType == Types.MinorType.DECIMAL18 || + minorType == Types.MinorType.DECIMAL28SPARSE || minorType == Types.MinorType.DECIMAL38SPARSE) { + return true; + } + return false; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java new file mode 100644 index 00000000000..f4fc1736032 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DateUtility.java @@ -0,0 +1,682 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.util; + +import org.joda.time.Period; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.DateTimeFormatterBuilder; +import org.joda.time.format.DateTimeParser; + +import com.carrotsearch.hppc.ObjectIntHashMap; + +// Utility class for Date, DateTime, TimeStamp, Interval data types +public class DateUtility { + + + /* We have a hashmap that stores the timezone as the key and an index as the value + * While storing the timezone in value vectors, holders we only use this index. As we + * reconstruct the timestamp, we use this index to index through the array timezoneList + * and get the corresponding timezone and pass it to joda-time + */ + public static ObjectIntHashMap timezoneMap = new ObjectIntHashMap(); + + public static String[] timezoneList = {"Africa/Abidjan", + "Africa/Accra", + "Africa/Addis_Ababa", + "Africa/Algiers", + "Africa/Asmara", + "Africa/Asmera", + "Africa/Bamako", + "Africa/Bangui", + "Africa/Banjul", + "Africa/Bissau", + "Africa/Blantyre", + "Africa/Brazzaville", + "Africa/Bujumbura", + "Africa/Cairo", + "Africa/Casablanca", + "Africa/Ceuta", + "Africa/Conakry", + "Africa/Dakar", + "Africa/Dar_es_Salaam", + "Africa/Djibouti", + "Africa/Douala", + "Africa/El_Aaiun", + "Africa/Freetown", + "Africa/Gaborone", + "Africa/Harare", + "Africa/Johannesburg", + "Africa/Juba", + "Africa/Kampala", + "Africa/Khartoum", + "Africa/Kigali", + "Africa/Kinshasa", + "Africa/Lagos", + "Africa/Libreville", + "Africa/Lome", + "Africa/Luanda", + "Africa/Lubumbashi", + "Africa/Lusaka", + "Africa/Malabo", + "Africa/Maputo", + "Africa/Maseru", + "Africa/Mbabane", + "Africa/Mogadishu", + "Africa/Monrovia", + "Africa/Nairobi", + "Africa/Ndjamena", + "Africa/Niamey", + "Africa/Nouakchott", + "Africa/Ouagadougou", + "Africa/Porto-Novo", + "Africa/Sao_Tome", + "Africa/Timbuktu", + "Africa/Tripoli", + "Africa/Tunis", + "Africa/Windhoek", + "America/Adak", + "America/Anchorage", + "America/Anguilla", + "America/Antigua", + "America/Araguaina", + "America/Argentina/Buenos_Aires", + "America/Argentina/Catamarca", + "America/Argentina/ComodRivadavia", + "America/Argentina/Cordoba", + "America/Argentina/Jujuy", + "America/Argentina/La_Rioja", + "America/Argentina/Mendoza", + "America/Argentina/Rio_Gallegos", + "America/Argentina/Salta", + "America/Argentina/San_Juan", + "America/Argentina/San_Luis", + "America/Argentina/Tucuman", + "America/Argentina/Ushuaia", + "America/Aruba", + "America/Asuncion", + "America/Atikokan", + "America/Atka", + "America/Bahia", + "America/Bahia_Banderas", + "America/Barbados", + "America/Belem", + "America/Belize", + "America/Blanc-Sablon", + "America/Boa_Vista", + "America/Bogota", + "America/Boise", + "America/Buenos_Aires", + "America/Cambridge_Bay", + "America/Campo_Grande", + "America/Cancun", + "America/Caracas", + "America/Catamarca", + "America/Cayenne", + "America/Cayman", + "America/Chicago", + "America/Chihuahua", + "America/Coral_Harbour", + "America/Cordoba", + "America/Costa_Rica", + "America/Cuiaba", + "America/Curacao", + "America/Danmarkshavn", + "America/Dawson", + "America/Dawson_Creek", + "America/Denver", + "America/Detroit", + "America/Dominica", + "America/Edmonton", + "America/Eirunepe", + "America/El_Salvador", + "America/Ensenada", + "America/Fort_Wayne", + "America/Fortaleza", + "America/Glace_Bay", + "America/Godthab", + "America/Goose_Bay", + "America/Grand_Turk", + "America/Grenada", + "America/Guadeloupe", + "America/Guatemala", + 
"America/Guayaquil", + "America/Guyana", + "America/Halifax", + "America/Havana", + "America/Hermosillo", + "America/Indiana/Indianapolis", + "America/Indiana/Knox", + "America/Indiana/Marengo", + "America/Indiana/Petersburg", + "America/Indiana/Tell_City", + "America/Indiana/Vevay", + "America/Indiana/Vincennes", + "America/Indiana/Winamac", + "America/Indianapolis", + "America/Inuvik", + "America/Iqaluit", + "America/Jamaica", + "America/Jujuy", + "America/Juneau", + "America/Kentucky/Louisville", + "America/Kentucky/Monticello", + "America/Knox_IN", + "America/Kralendijk", + "America/La_Paz", + "America/Lima", + "America/Los_Angeles", + "America/Louisville", + "America/Lower_Princes", + "America/Maceio", + "America/Managua", + "America/Manaus", + "America/Marigot", + "America/Martinique", + "America/Matamoros", + "America/Mazatlan", + "America/Mendoza", + "America/Menominee", + "America/Merida", + "America/Metlakatla", + "America/Mexico_City", + "America/Miquelon", + "America/Moncton", + "America/Monterrey", + "America/Montevideo", + "America/Montreal", + "America/Montserrat", + "America/Nassau", + "America/New_York", + "America/Nipigon", + "America/Nome", + "America/Noronha", + "America/North_Dakota/Beulah", + "America/North_Dakota/Center", + "America/North_Dakota/New_Salem", + "America/Ojinaga", + "America/Panama", + "America/Pangnirtung", + "America/Paramaribo", + "America/Phoenix", + "America/Port-au-Prince", + "America/Port_of_Spain", + "America/Porto_Acre", + "America/Porto_Velho", + "America/Puerto_Rico", + "America/Rainy_River", + "America/Rankin_Inlet", + "America/Recife", + "America/Regina", + "America/Resolute", + "America/Rio_Branco", + "America/Rosario", + "America/Santa_Isabel", + "America/Santarem", + "America/Santiago", + "America/Santo_Domingo", + "America/Sao_Paulo", + "America/Scoresbysund", + "America/Shiprock", + "America/Sitka", + "America/St_Barthelemy", + "America/St_Johns", + "America/St_Kitts", + "America/St_Lucia", + "America/St_Thomas", + "America/St_Vincent", + "America/Swift_Current", + "America/Tegucigalpa", + "America/Thule", + "America/Thunder_Bay", + "America/Tijuana", + "America/Toronto", + "America/Tortola", + "America/Vancouver", + "America/Virgin", + "America/Whitehorse", + "America/Winnipeg", + "America/Yakutat", + "America/Yellowknife", + "Antarctica/Casey", + "Antarctica/Davis", + "Antarctica/DumontDUrville", + "Antarctica/Macquarie", + "Antarctica/Mawson", + "Antarctica/McMurdo", + "Antarctica/Palmer", + "Antarctica/Rothera", + "Antarctica/South_Pole", + "Antarctica/Syowa", + "Antarctica/Vostok", + "Arctic/Longyearbyen", + "Asia/Aden", + "Asia/Almaty", + "Asia/Amman", + "Asia/Anadyr", + "Asia/Aqtau", + "Asia/Aqtobe", + "Asia/Ashgabat", + "Asia/Ashkhabad", + "Asia/Baghdad", + "Asia/Bahrain", + "Asia/Baku", + "Asia/Bangkok", + "Asia/Beirut", + "Asia/Bishkek", + "Asia/Brunei", + "Asia/Calcutta", + "Asia/Choibalsan", + "Asia/Chongqing", + "Asia/Chungking", + "Asia/Colombo", + "Asia/Dacca", + "Asia/Damascus", + "Asia/Dhaka", + "Asia/Dili", + "Asia/Dubai", + "Asia/Dushanbe", + "Asia/Gaza", + "Asia/Harbin", + "Asia/Hebron", + "Asia/Ho_Chi_Minh", + "Asia/Hong_Kong", + "Asia/Hovd", + "Asia/Irkutsk", + "Asia/Istanbul", + "Asia/Jakarta", + "Asia/Jayapura", + "Asia/Jerusalem", + "Asia/Kabul", + "Asia/Kamchatka", + "Asia/Karachi", + "Asia/Kashgar", + "Asia/Kathmandu", + "Asia/Katmandu", + "Asia/Kolkata", + "Asia/Krasnoyarsk", + "Asia/Kuala_Lumpur", + "Asia/Kuching", + "Asia/Kuwait", + "Asia/Macao", + "Asia/Macau", + "Asia/Magadan", + "Asia/Makassar", + 
"Asia/Manila", + "Asia/Muscat", + "Asia/Nicosia", + "Asia/Novokuznetsk", + "Asia/Novosibirsk", + "Asia/Omsk", + "Asia/Oral", + "Asia/Phnom_Penh", + "Asia/Pontianak", + "Asia/Pyongyang", + "Asia/Qatar", + "Asia/Qyzylorda", + "Asia/Rangoon", + "Asia/Riyadh", + "Asia/Saigon", + "Asia/Sakhalin", + "Asia/Samarkand", + "Asia/Seoul", + "Asia/Shanghai", + "Asia/Singapore", + "Asia/Taipei", + "Asia/Tashkent", + "Asia/Tbilisi", + "Asia/Tehran", + "Asia/Tel_Aviv", + "Asia/Thimbu", + "Asia/Thimphu", + "Asia/Tokyo", + "Asia/Ujung_Pandang", + "Asia/Ulaanbaatar", + "Asia/Ulan_Bator", + "Asia/Urumqi", + "Asia/Vientiane", + "Asia/Vladivostok", + "Asia/Yakutsk", + "Asia/Yekaterinburg", + "Asia/Yerevan", + "Atlantic/Azores", + "Atlantic/Bermuda", + "Atlantic/Canary", + "Atlantic/Cape_Verde", + "Atlantic/Faeroe", + "Atlantic/Faroe", + "Atlantic/Jan_Mayen", + "Atlantic/Madeira", + "Atlantic/Reykjavik", + "Atlantic/South_Georgia", + "Atlantic/St_Helena", + "Atlantic/Stanley", + "Australia/ACT", + "Australia/Adelaide", + "Australia/Brisbane", + "Australia/Broken_Hill", + "Australia/Canberra", + "Australia/Currie", + "Australia/Darwin", + "Australia/Eucla", + "Australia/Hobart", + "Australia/LHI", + "Australia/Lindeman", + "Australia/Lord_Howe", + "Australia/Melbourne", + "Australia/NSW", + "Australia/North", + "Australia/Perth", + "Australia/Queensland", + "Australia/South", + "Australia/Sydney", + "Australia/Tasmania", + "Australia/Victoria", + "Australia/West", + "Australia/Yancowinna", + "Brazil/Acre", + "Brazil/DeNoronha", + "Brazil/East", + "Brazil/West", + "CET", + "CST6CDT", + "Canada/Atlantic", + "Canada/Central", + "Canada/East-Saskatchewan", + "Canada/Eastern", + "Canada/Mountain", + "Canada/Newfoundland", + "Canada/Pacific", + "Canada/Saskatchewan", + "Canada/Yukon", + "Chile/Continental", + "Chile/EasterIsland", + "Cuba", + "EET", + "EST", + "EST5EDT", + "Egypt", + "Eire", + "Etc/GMT", + "Etc/GMT+0", + "Etc/GMT+1", + "Etc/GMT+10", + "Etc/GMT+11", + "Etc/GMT+12", + "Etc/GMT+2", + "Etc/GMT+3", + "Etc/GMT+4", + "Etc/GMT+5", + "Etc/GMT+6", + "Etc/GMT+7", + "Etc/GMT+8", + "Etc/GMT+9", + "Etc/GMT-0", + "Etc/GMT-1", + "Etc/GMT-10", + "Etc/GMT-11", + "Etc/GMT-12", + "Etc/GMT-13", + "Etc/GMT-14", + "Etc/GMT-2", + "Etc/GMT-3", + "Etc/GMT-4", + "Etc/GMT-5", + "Etc/GMT-6", + "Etc/GMT-7", + "Etc/GMT-8", + "Etc/GMT-9", + "Etc/GMT0", + "Etc/Greenwich", + "Etc/UCT", + "Etc/UTC", + "Etc/Universal", + "Etc/Zulu", + "Europe/Amsterdam", + "Europe/Andorra", + "Europe/Athens", + "Europe/Belfast", + "Europe/Belgrade", + "Europe/Berlin", + "Europe/Bratislava", + "Europe/Brussels", + "Europe/Bucharest", + "Europe/Budapest", + "Europe/Chisinau", + "Europe/Copenhagen", + "Europe/Dublin", + "Europe/Gibraltar", + "Europe/Guernsey", + "Europe/Helsinki", + "Europe/Isle_of_Man", + "Europe/Istanbul", + "Europe/Jersey", + "Europe/Kaliningrad", + "Europe/Kiev", + "Europe/Lisbon", + "Europe/Ljubljana", + "Europe/London", + "Europe/Luxembourg", + "Europe/Madrid", + "Europe/Malta", + "Europe/Mariehamn", + "Europe/Minsk", + "Europe/Monaco", + "Europe/Moscow", + "Europe/Nicosia", + "Europe/Oslo", + "Europe/Paris", + "Europe/Podgorica", + "Europe/Prague", + "Europe/Riga", + "Europe/Rome", + "Europe/Samara", + "Europe/San_Marino", + "Europe/Sarajevo", + "Europe/Simferopol", + "Europe/Skopje", + "Europe/Sofia", + "Europe/Stockholm", + "Europe/Tallinn", + "Europe/Tirane", + "Europe/Tiraspol", + "Europe/Uzhgorod", + "Europe/Vaduz", + "Europe/Vatican", + "Europe/Vienna", + "Europe/Vilnius", + "Europe/Volgograd", + "Europe/Warsaw", + 
"Europe/Zagreb", + "Europe/Zaporozhye", + "Europe/Zurich", + "GB", + "GB-Eire", + "GMT", + "GMT+0", + "GMT-0", + "GMT0", + "Greenwich", + "HST", + "Hongkong", + "Iceland", + "Indian/Antananarivo", + "Indian/Chagos", + "Indian/Christmas", + "Indian/Cocos", + "Indian/Comoro", + "Indian/Kerguelen", + "Indian/Mahe", + "Indian/Maldives", + "Indian/Mauritius", + "Indian/Mayotte", + "Indian/Reunion", + "Iran", + "Israel", + "Jamaica", + "Japan", + "Kwajalein", + "Libya", + "MET", + "MST", + "MST7MDT", + "Mexico/BajaNorte", + "Mexico/BajaSur", + "Mexico/General", + "NZ", + "NZ-CHAT", + "Navajo", + "PRC", + "PST8PDT", + "Pacific/Apia", + "Pacific/Auckland", + "Pacific/Chatham", + "Pacific/Chuuk", + "Pacific/Easter", + "Pacific/Efate", + "Pacific/Enderbury", + "Pacific/Fakaofo", + "Pacific/Fiji", + "Pacific/Funafuti", + "Pacific/Galapagos", + "Pacific/Gambier", + "Pacific/Guadalcanal", + "Pacific/Guam", + "Pacific/Honolulu", + "Pacific/Johnston", + "Pacific/Kiritimati", + "Pacific/Kosrae", + "Pacific/Kwajalein", + "Pacific/Majuro", + "Pacific/Marquesas", + "Pacific/Midway", + "Pacific/Nauru", + "Pacific/Niue", + "Pacific/Norfolk", + "Pacific/Noumea", + "Pacific/Pago_Pago", + "Pacific/Palau", + "Pacific/Pitcairn", + "Pacific/Pohnpei", + "Pacific/Ponape", + "Pacific/Port_Moresby", + "Pacific/Rarotonga", + "Pacific/Saipan", + "Pacific/Samoa", + "Pacific/Tahiti", + "Pacific/Tarawa", + "Pacific/Tongatapu", + "Pacific/Truk", + "Pacific/Wake", + "Pacific/Wallis", + "Pacific/Yap", + "Poland", + "Portugal", + "ROC", + "ROK", + "Singapore", + "Turkey", + "UCT", + "US/Alaska", + "US/Aleutian", + "US/Arizona", + "US/Central", + "US/East-Indiana", + "US/Eastern", + "US/Hawaii", + "US/Indiana-Starke", + "US/Michigan", + "US/Mountain", + "US/Pacific", + "US/Pacific-New", + "US/Samoa", + "UTC", + "Universal", + "W-SU", + "WET", + "Zulu"}; + + static { + for (int i = 0; i < timezoneList.length; i++) { + timezoneMap.put(timezoneList[i], i); + } + } + + public static final DateTimeFormatter formatDate = DateTimeFormat.forPattern("yyyy-MM-dd"); + public static final DateTimeFormatter formatTimeStamp = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS"); + public static final DateTimeFormatter formatTimeStampTZ = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS ZZZ"); + public static final DateTimeFormatter formatTime = DateTimeFormat.forPattern("HH:mm:ss.SSS"); + + public static DateTimeFormatter dateTimeTZFormat = null; + public static DateTimeFormatter timeFormat = null; + + public static final int yearsToMonths = 12; + public static final int hoursToMillis = 60 * 60 * 1000; + public static final int minutesToMillis = 60 * 1000; + public static final int secondsToMillis = 1000; + public static final int monthToStandardDays = 30; + public static final long monthsToMillis = 2592000000L; // 30 * 24 * 60 * 60 * 1000 + public static final int daysToStandardMillis = 24 * 60 * 60 * 1000; + + + public static int getIndex(String timezone) { + return timezoneMap.get(timezone); + } + + public static String getTimeZone(int index) { + return timezoneList[index]; + } + + // Function returns the date time formatter used to parse date strings + public static DateTimeFormatter getDateTimeFormatter() { + + if (dateTimeTZFormat == null) { + DateTimeFormatter dateFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + DateTimeParser optionalTime = DateTimeFormat.forPattern(" HH:mm:ss").getParser(); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + DateTimeParser optionalZone = DateTimeFormat.forPattern(" 
ZZZ").getParser(); + + dateTimeTZFormat = new DateTimeFormatterBuilder().append(dateFormatter).appendOptional(optionalTime).appendOptional(optionalSec).appendOptional(optionalZone).toFormatter(); + } + + return dateTimeTZFormat; + } + + // Function returns time formatter used to parse time strings + public static DateTimeFormatter getTimeFormatter() { + if (timeFormat == null) { + DateTimeFormatter timeFormatter = DateTimeFormat.forPattern("HH:mm:ss"); + DateTimeParser optionalSec = DateTimeFormat.forPattern(".SSS").getParser(); + timeFormat = new DateTimeFormatterBuilder().append(timeFormatter).appendOptional(optionalSec).toFormatter(); + } + return timeFormat; + } + + public static int monthsFromPeriod(Period period){ + return (period.getYears() * yearsToMonths) + period.getMonths(); + } + + public static int millisFromPeriod(final Period period){ + return (period.getHours() * hoursToMillis) + + (period.getMinutes() * minutesToMillis) + + (period.getSeconds() * secondsToMillis) + + (period.getMillis()); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java new file mode 100644 index 00000000000..576a5b6351a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -0,0 +1,737 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
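
A short sketch of the period helpers and the lazily built parser above (illustrative, not part of the patch):

    Period p = new Period().withYears(1).withMonths(2).withHours(3).withMillis(4);
    int months = DateUtility.monthsFromPeriod(p); // 1 * 12 + 2 = 14
    int millis = DateUtility.millisFromPeriod(p); // 3 * 3,600,000 + 4 = 10,800,004
    DateTime d = DateUtility.getDateTimeFormatter()
        .parseDateTime("2016-02-05 12:08:35.123"); // time, millis and zone are optional
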
+ */ +package org.apache.arrow.vector.util; + +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.ByteBuf; +import io.netty.buffer.UnpooledByteBufAllocator; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.Arrays; + +import org.apache.arrow.vector.holders.Decimal38SparseHolder; + +public class DecimalUtility extends CoreDecimalUtility{ + + public final static int MAX_DIGITS = 9; + public final static int DIGITS_BASE = 1000000000; + public final static int DIGITS_MAX = 999999999; + public final static int INTEGER_SIZE = (Integer.SIZE/8); + + public final static String[] decimalToString = {"", + "0", + "00", + "000", + "0000", + "00000", + "000000", + "0000000", + "00000000", + "000000000"}; + + public final static long[] scale_long_constants = { + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000l, + 100000000000l, + 1000000000000l, + 10000000000000l, + 100000000000000l, + 1000000000000000l, + 10000000000000000l, + 100000000000000000l, + 1000000000000000000l}; + + /* + * Simple function that returns the static precomputed + * power of ten, instead of using Math.pow + */ + public static long getPowerOfTen(int power) { + assert power >= 0 && power < scale_long_constants.length; + return scale_long_constants[(power)]; + } + + /* + * Math.pow returns a double and while multiplying with large digits + * in the decimal data type we encounter noise. So instead of multiplying + * with Math.pow we use the static constants to perform the multiplication + */ + public static long adjustScaleMultiply(long input, int factor) { + int index = Math.abs(factor); + assert index >= 0 && index < scale_long_constants.length; + if (factor >= 0) { + return input * scale_long_constants[index]; + } else { + return input / scale_long_constants[index]; + } + } + + public static long adjustScaleDivide(long input, int factor) { + int index = Math.abs(factor); + assert index >= 0 && index < scale_long_constants.length; + if (factor >= 0) { + return input / scale_long_constants[index]; + } else { + return input * scale_long_constants[index]; + } + } + + /* Given the number of actual digits this function returns the + * number of indexes it will occupy in the array of integers + * which are stored in base 1 billion + */ + public static int roundUp(int ndigits) { + return (ndigits + MAX_DIGITS - 1)/MAX_DIGITS; + } + + /* Returns a string representation of the given integer + * If the length of the given integer is less than the + * passed length, this function will prepend zeroes to the string + */ + public static StringBuilder toStringWithZeroes(int number, int desiredLength) { + String value = ((Integer) number).toString(); + int length = value.length(); + + StringBuilder str = new StringBuilder(); + str.append(decimalToString[desiredLength - length]); + str.append(value); + + return str; + } + + public static StringBuilder toStringWithZeroes(long number, int desiredLength) { + String value = ((Long) number).toString(); + int length = value.length(); + + StringBuilder str = new StringBuilder(); + + // Desired length can be > MAX_DIGITS + int zeroesLength = desiredLength - length; + while (zeroesLength > MAX_DIGITS) { + str.append(decimalToString[MAX_DIGITS]); + zeroesLength -= MAX_DIGITS; + } + str.append(decimalToString[zeroesLength]); + str.append(value); + + return str; + } + + public static BigDecimal getBigDecimalFromIntermediate(ByteBuf data, int startIndex, int nDecimalDigits, int scale) { + + 
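
Because Math.pow goes through double and picks up rounding noise at large scales, the table above supplies exact powers of ten up to 10^18. Sample calls (a sketch, not from the patch):

    long up = DecimalUtility.adjustScaleMultiply(12345L, 3);  // 12,345,000
    long down = DecimalUtility.adjustScaleDivide(12345L, 2);  // 123
    int slots = DecimalUtility.roundUp(20);  // 20 digits need 3 base-1e9 integers
    String pad = DecimalUtility.toStringWithZeroes(42, 9).toString(); // "000000042"
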
// In the intermediate representation we don't pad the scale with zeroes, so set truncate = false + return getBigDecimalFromDrillBuf(data, startIndex, nDecimalDigits, scale, false); + } + + public static BigDecimal getBigDecimalFromSparse(ArrowBuf data, int startIndex, int nDecimalDigits, int scale) { + + // In the sparse representation we pad the scale with zeroes for ease of arithmetic, need to truncate + return getBigDecimalFromDrillBuf(data, startIndex, nDecimalDigits, scale, true); + } + + public static BigDecimal getBigDecimalFromDrillBuf(ArrowBuf bytebuf, int start, int length, int scale) { + byte[] value = new byte[length]; + bytebuf.getBytes(start, value, 0, length); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } + + public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int start, int length, int scale) { + byte[] value = new byte[length]; + bytebuf.get(value); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } + + /* Create a BigDecimal object using the data in the DrillBuf. + * This function assumes that data is provided in a non-dense format + * It works on both sparse and intermediate representations. + */ + public static BigDecimal getBigDecimalFromDrillBuf(ByteBuf data, int startIndex, int nDecimalDigits, int scale, + boolean truncateScale) { + + // For sparse decimal type we have padded zeroes at the end, strip them while converting to BigDecimal. + int actualDigits; + + // Initialize the BigDecimal, first digit in the DrillBuf has the sign so mask it out + BigInteger decimalDigits = BigInteger.valueOf((data.getInt(startIndex)) & 0x7FFFFFFF); + + BigInteger base = BigInteger.valueOf(DIGITS_BASE); + + for (int i = 1; i < nDecimalDigits; i++) { + + BigInteger temp = BigInteger.valueOf(data.getInt(startIndex + (i * INTEGER_SIZE))); + decimalDigits = decimalDigits.multiply(base); + decimalDigits = decimalDigits.add(temp); + } + + // Truncate any additional padding we might have added + if (truncateScale == true && scale > 0 && (actualDigits = scale % MAX_DIGITS) != 0) { + BigInteger truncate = BigInteger.valueOf((int)Math.pow(10, (MAX_DIGITS - actualDigits))); + decimalDigits = decimalDigits.divide(truncate); + } + + // set the sign + if ((data.getInt(startIndex) & 0x80000000) != 0) { + decimalDigits = decimalDigits.negate(); + } + + BigDecimal decimal = new BigDecimal(decimalDigits, scale); + + return decimal; + } + + /* This function returns a BigDecimal object from the dense decimal representation. + * First step is to convert the dense representation into an intermediate representation + * and then invoke getBigDecimalFromDrillBuf() to get the BigDecimal object + */ + public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, int nDecimalDigits, int scale, int maxPrecision, int width) { + + /* This method converts the dense representation to + * an intermediate representation. The intermediate + * representation has one more integer than the dense + * representation. 
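
The two's-complement overloads above are the simple path: the raw bytes become a BigInteger and the scale is applied. Round-tripping through the byte-array variant defined later in this file (a sketch, not from the patch):

    byte[] raw = new BigDecimal("-12.34").unscaledValue().toByteArray(); // 0xFB 0x2E
    BigDecimal back = DecimalUtility.getBigDecimalFromByteArray(raw, 0, raw.length, 2);
    // back.equals(new BigDecimal("-12.34")) -> true
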
+ */ + byte[] intermediateBytes = new byte[((nDecimalDigits + 1) * INTEGER_SIZE)]; + + // Start storing from the least significant byte of the first integer + int intermediateIndex = 3; + + int[] mask = {0x03, 0x0F, 0x3F, 0xFF}; + int[] reverseMask = {0xFC, 0xF0, 0xC0, 0x00}; + + int maskIndex; + int shiftOrder; + byte shiftBits; + + // TODO: Some of the logic here is common with casting from Dense to Sparse types, factor out common code + if (maxPrecision == 38) { + maskIndex = 0; + shiftOrder = 6; + shiftBits = 0x00; + intermediateBytes[intermediateIndex++] = (byte) (data.getByte(startIndex) & 0x7F); + } else if (maxPrecision == 28) { + maskIndex = 1; + shiftOrder = 4; + shiftBits = (byte) ((data.getByte(startIndex) & 0x03) << shiftOrder); + intermediateBytes[intermediateIndex++] = (byte) (((data.getByte(startIndex) & 0x3C) & 0xFF) >>> 2); + } else { + throw new UnsupportedOperationException("Dense types with max precision 38 and 28 are only supported"); + } + + int inputIndex = 1; + boolean sign = false; + + if ((data.getByte(startIndex) & 0x80) != 0) { + sign = true; + } + + while (inputIndex < width) { + + intermediateBytes[intermediateIndex] = (byte) ((shiftBits) | (((data.getByte(startIndex + inputIndex) & reverseMask[maskIndex]) & 0xFF) >>> (8 - shiftOrder))); + + shiftBits = (byte) ((data.getByte(startIndex + inputIndex) & mask[maskIndex]) << shiftOrder); + + inputIndex++; + intermediateIndex++; + + if (((inputIndex - 1) % INTEGER_SIZE) == 0) { + shiftBits = (byte) ((shiftBits & 0xFF) >>> 2); + maskIndex++; + shiftOrder -= 2; + } + + } + /* copy the last byte */ + intermediateBytes[intermediateIndex] = shiftBits; + + if (sign == true) { + intermediateBytes[0] = (byte) (intermediateBytes[0] | 0x80); + } + + final ByteBuf intermediate = UnpooledByteBufAllocator.DEFAULT.buffer(intermediateBytes.length); + try { + intermediate.setBytes(0, intermediateBytes); + + BigDecimal ret = getBigDecimalFromIntermediate(intermediate, 0, nDecimalDigits + 1, scale); + return ret; + } finally { + intermediate.release(); + } + + } + + /* + * Function converts the BigDecimal and stores it in out internal sparse representation + */ + public static void getSparseFromBigDecimal(BigDecimal input, ByteBuf data, int startIndex, int scale, int precision, + int nDecimalDigits) { + + // Initialize the buffer + for (int i = 0; i < nDecimalDigits; i++) { + data.setInt(startIndex + (i * INTEGER_SIZE), 0); + } + + boolean sign = false; + + if (input.signum() == -1) { + // negative input + sign = true; + input = input.abs(); + } + + // Truncate the input as per the scale provided + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); + + // Separate out the integer part + BigDecimal integerPart = input.setScale(0, BigDecimal.ROUND_DOWN); + + int destIndex = nDecimalDigits - roundUp(scale) - 1; + + // we use base 1 billion integer digits for out integernal representation + BigDecimal base = new BigDecimal(DIGITS_BASE); + + while (integerPart.compareTo(BigDecimal.ZERO) == 1) { + // store the modulo as the integer value + data.setInt(startIndex + (destIndex * INTEGER_SIZE), (integerPart.remainder(base)).intValue()); + destIndex--; + // Divide by base 1 billion + integerPart = (integerPart.divide(base)).setScale(0, BigDecimal.ROUND_DOWN); + } + + /* Sparse representation contains padding of additional zeroes + * so each digit contains MAX_DIGITS for ease of arithmetic + */ + int actualDigits; + if ((actualDigits = (scale % MAX_DIGITS)) != 0) { + // Pad additional zeroes + scale = scale + (MAX_DIGITS - 
actualDigits); + input = input.setScale(scale, BigDecimal.ROUND_DOWN); + } + + //separate out the fractional part + BigDecimal fractionalPart = input.remainder(BigDecimal.ONE).movePointRight(scale); + + destIndex = nDecimalDigits - 1; + + while (scale > 0) { + // Get next set of MAX_DIGITS (9) store it in the DrillBuf + fractionalPart = fractionalPart.movePointLeft(MAX_DIGITS); + BigDecimal temp = fractionalPart.remainder(BigDecimal.ONE); + + data.setInt(startIndex + (destIndex * INTEGER_SIZE), (temp.unscaledValue().intValue())); + destIndex--; + + fractionalPart = fractionalPart.setScale(0, BigDecimal.ROUND_DOWN); + scale -= MAX_DIGITS; + } + + // Set the negative sign + if (sign == true) { + data.setInt(startIndex, data.getInt(startIndex) | 0x80000000); + } + + } + + + public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { + // Truncate or pad to set the input to the correct scale + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); + + return (input.unscaledValue().longValue()); + } + + public static BigDecimal getBigDecimalFromPrimitiveTypes(int input, int scale, int precision) { + return BigDecimal.valueOf(input, scale); + } + + public static BigDecimal getBigDecimalFromPrimitiveTypes(long input, int scale, int precision) { + return BigDecimal.valueOf(input, scale); + } + + + public static int compareDenseBytes(ArrowBuf left, int leftStart, boolean leftSign, ArrowBuf right, int rightStart, boolean rightSign, int width) { + + int invert = 1; + + /* If signs are different then simply look at the + * sign of the two inputs and determine which is greater + */ + if (leftSign != rightSign) { + + return((leftSign == true) ? -1 : 1); + } else if(leftSign == true) { + /* Both inputs are negative, at the end we will + * have to invert the comparison + */ + invert = -1; + } + + int cmp = 0; + + for (int i = 0; i < width; i++) { + byte leftByte = left.getByte(leftStart + i); + byte rightByte = right.getByte(rightStart + i); + // Unsigned byte comparison + if ((leftByte & 0xFF) > (rightByte & 0xFF)) { + cmp = 1; + break; + } else if ((leftByte & 0xFF) < (rightByte & 0xFF)) { + cmp = -1; + break; + } + } + cmp *= invert; // invert the comparison if both were negative values + + return cmp; + } + + public static int getIntegerFromSparseBuffer(ArrowBuf buffer, int start, int index) { + int value = buffer.getInt(start + (index * 4)); + + if (index == 0) { + /* the first byte contains sign bit, return value without it */ + value = (value & 0x7FFFFFFF); + } + return value; + } + + public static void setInteger(ArrowBuf buffer, int start, int index, int value) { + buffer.setInt(start + (index * 4), value); + } + + public static int compareSparseBytes(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits, boolean absCompare) { + + int invert = 1; + + if (absCompare == false) { + if (leftSign != rightSign) { + return (leftSign == true) ? 
-1 : 1; + } + + // Both values are negative invert the outcome of the comparison + if (leftSign == true) { + invert = -1; + } + } + + int cmp = compareSparseBytesInner(left, leftStart, leftSign, leftScale, leftPrecision, right, rightStart, rightSign, rightPrecision, rightScale, width, nDecimalDigits); + return cmp * invert; + } + public static int compareSparseBytesInner(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits) { + /* compute the number of integer digits in each decimal */ + int leftInt = leftPrecision - leftScale; + int rightInt = rightPrecision - rightScale; + + /* compute the number of indexes required for storing integer digits */ + int leftIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftInt); + int rightIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightInt); + + /* compute number of indexes required for storing scale */ + int leftScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftScale); + int rightScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightScale); + + /* compute index of the most significant integer digits */ + int leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; + int rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; + + int leftStopIndex = nDecimalDigits - leftScaleRoundedUp; + int rightStopIndex = nDecimalDigits - rightScaleRoundedUp; + + /* Discard the zeroes in the integer part */ + while (leftIndex1 < leftStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { + break; + } + + /* Digit in this location is zero, decrement the actual number + * of integer digits + */ + leftIntRoundedUp--; + leftIndex1++; + } + + /* If we reached the stop index then the number of integers is zero */ + if (leftIndex1 == leftStopIndex) { + leftIntRoundedUp = 0; + } + + while (rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { + break; + } + + /* Digit in this location is zero, decrement the actual number + * of integer digits + */ + rightIntRoundedUp--; + rightIndex1++; + } + + if (rightIndex1 == rightStopIndex) { + rightIntRoundedUp = 0; + } + + /* We have the accurate number of non-zero integer digits, + * if the number of integer digits are different then we can determine + * which decimal is larger and needn't go down to comparing individual values + */ + if (leftIntRoundedUp > rightIntRoundedUp) { + return 1; + } + else if (rightIntRoundedUp > leftIntRoundedUp) { + return -1; + } + + /* The number of integer digits are the same, set the each index + * to the first non-zero integer and compare each digit + */ + leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; + rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; + + while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { + return 1; + } + else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { + return -1; + } + + leftIndex1++; + rightIndex1++; + } + + /* The integer part of both the decimal's are equal, now compare + * each individual fractional part. 
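+ * (Illustration: for two decimal38 values at scale 0, 1,000,000,001 occupies
+ * two base-1e9 integer slots while 999,999,999 occupies one once its leading
+ * zero slot is stripped, so the slot-count check above already decides the
+ * comparison without inspecting individual digit values.)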
Set the index to be at the + * beginning of the fractional part + */ + leftIndex1 = leftStopIndex; + rightIndex1 = rightStopIndex; + + /* Stop indexes will be the end of the array */ + leftStopIndex = nDecimalDigits; + rightStopIndex = nDecimalDigits; + + /* compare the two fractional parts of the decimal */ + while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { + return 1; + } + else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { + return -1; + } + + leftIndex1++; + rightIndex1++; + } + + /* Till now the fractional part of the decimals are equal, check + * if one of the decimal has fractional part that is remaining + * and is non-zero + */ + while (leftIndex1 < leftStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { + return 1; + } + leftIndex1++; + } + + while(rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { + return -1; + } + rightIndex1++; + } + + /* Both decimal values are equal */ + return 0; + } + + public static BigDecimal getBigDecimalFromByteArray(byte[] bytes, int start, int length, int scale) { + byte[] value = Arrays.copyOfRange(bytes, start, start + length); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } + + public static void roundDecimal(ArrowBuf result, int start, int nDecimalDigits, int desiredScale, int currentScale) { + int newScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(desiredScale); + int origScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(currentScale); + + if (desiredScale < currentScale) { + + boolean roundUp = false; + + //Extract the first digit to be truncated to check if we need to round up + int truncatedScaleIndex = desiredScale + 1; + if (truncatedScaleIndex <= currentScale) { + int extractDigitIndex = nDecimalDigits - origScaleRoundedUp -1; + extractDigitIndex += org.apache.arrow.vector.util.DecimalUtility.roundUp(truncatedScaleIndex); + int extractDigit = getIntegerFromSparseBuffer(result, start, extractDigitIndex); + int temp = org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS - (truncatedScaleIndex % org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS); + if (temp != 0) { + extractDigit = extractDigit / (int) (Math.pow(10, temp)); + } + if ((extractDigit % 10) > 4) { + roundUp = true; + } + } + + // Get the source index beyond which we will truncate + int srcIntIndex = nDecimalDigits - origScaleRoundedUp - 1; + int srcIndex = srcIntIndex + newScaleRoundedUp; + + // Truncate the remaining fractional part, move the integer part + int destIndex = nDecimalDigits - 1; + if (srcIndex != destIndex) { + while (srcIndex >= 0) { + setInteger(result, start, destIndex--, getIntegerFromSparseBuffer(result, start, srcIndex--)); + } + + // Set the remaining portion of the decimal to be zeroes + while (destIndex >= 0) { + setInteger(result, start, destIndex--, 0); + } + srcIndex = nDecimalDigits - 1; + } + + // We truncated the decimal digit. 
Now we need to truncate within the base 1 billion fractional digit + int truncateFactor = org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS - (desiredScale % org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS); + if (truncateFactor != org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS) { + truncateFactor = (int) Math.pow(10, truncateFactor); + int fractionalDigits = getIntegerFromSparseBuffer(result, start, nDecimalDigits - 1); + fractionalDigits /= truncateFactor; + setInteger(result, start, nDecimalDigits - 1, fractionalDigits * truncateFactor); + } + + // Finally round up the digit if needed + if (roundUp == true) { + srcIndex = nDecimalDigits - 1; + int carry; + if (truncateFactor != org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS) { + carry = truncateFactor; + } else { + carry = 1; + } + + while (srcIndex >= 0) { + int value = getIntegerFromSparseBuffer(result, start, srcIndex); + value += carry; + + if (value >= org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE) { + setInteger(result, start, srcIndex--, value % org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE); + carry = value / org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE; + } else { + setInteger(result, start, srcIndex--, value); + carry = 0; + break; + } + } + } + } else if (desiredScale > currentScale) { + // Add fractional digits to the decimal + + // Check if we need to shift the decimal digits to the left + if (newScaleRoundedUp > origScaleRoundedUp) { + int srcIndex = 0; + int destIndex = newScaleRoundedUp - origScaleRoundedUp; + + // Check while extending scale, we are not overwriting integer part + while (srcIndex < destIndex) { + if (getIntegerFromSparseBuffer(result, start, srcIndex++) != 0) { + throw new RuntimeException("Truncate resulting in loss of integer part, reduce scale specified"); + } + } + + srcIndex = 0; + while (destIndex < nDecimalDigits) { + setInteger(result, start, srcIndex++, getIntegerFromSparseBuffer(result, start, destIndex++)); + } + + // Clear the remaining part + while (srcIndex < nDecimalDigits) { + setInteger(result, start, srcIndex++, 0); + } + } + } + } + + public static int getFirstFractionalDigit(int decimal, int scale) { + if (scale == 0) { + return 0; + } + int temp = (int) adjustScaleDivide(decimal, scale - 1); + return Math.abs(temp % 10); + } + + public static int getFirstFractionalDigit(long decimal, int scale) { + if (scale == 0) { + return 0; + } + long temp = adjustScaleDivide(decimal, scale - 1); + return (int) (Math.abs(temp % 10)); + } + + public static int getFirstFractionalDigit(ArrowBuf data, int scale, int start, int nDecimalDigits) { + if (scale == 0) { + return 0; + } + + int index = nDecimalDigits - roundUp(scale); + return (int) (adjustScaleDivide(data.getInt(start + (index * INTEGER_SIZE)), MAX_DIGITS - 1)); + } + + public static int compareSparseSamePrecScale(ArrowBuf left, int lStart, byte[] right, int length) { + // check the sign first + boolean lSign = (left.getInt(lStart) & 0x80000000) != 0; + boolean rSign = ByteFunctionHelpers.getSign(right); + int cmp = 0; + + if (lSign != rSign) { + return (lSign == false) ? 1 : -1; + } + + // invert the comparison if we are comparing negative numbers + int invert = (lSign == true) ? -1 : 1; + + // compare byte by byte + int n = 0; + int lPos = lStart; + int rPos = 0; + while (n < length/4) { + int leftInt = Decimal38SparseHolder.getInteger(n, lStart, left); + int rightInt = ByteFunctionHelpers.getInteger(right, n); + if (leftInt != rightInt) { + cmp = (leftInt - rightInt ) > 0 ? 
1 : -1; + break; + } + n++; + } + return cmp * invert; + } +} + diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java new file mode 100644 index 00000000000..7aeaa12ef9f --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.util; + +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class JsonStringArrayList extends ArrayList { + + private static ObjectMapper mapper; + + static { + mapper = new ObjectMapper(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof List)) { + return false; + } + List other = (List) obj; + return this.size() == other.size() && this.containsAll(other); + } + + @Override + public final String toString() { + try { + return mapper.writeValueAsString(this); + } catch(JsonProcessingException e) { + throw new IllegalStateException("Cannot serialize array list to JSON string", e); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java new file mode 100644 index 00000000000..750dd592aa4 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
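
JsonStringArrayList above and the JsonStringHashMap that follows exist so complex vector contents print as JSON rather than Java's default collection toString(). A sketch (not from the patch; the <String, Object> parameters assume the generic declaration whose angle brackets were lost above):

    JsonStringHashMap<String, Object> struct = new JsonStringHashMap<>();
    struct.put("id", 7);
    struct.put("name", "arrow");
    JsonStringArrayList tags = new JsonStringArrayList(); // raw, as declared above
    tags.add("fast");
    struct.put("tags", tags);
    System.out.println(struct); // {"id":7,"name":"arrow","tags":["fast"]}
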
+ */ +package org.apache.arrow.vector.util; + +import java.util.LinkedHashMap; +import java.util.Map; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +/* + * Simple class that extends the regular java.util.HashMap but overrides the + * toString() method of the HashMap class to produce a JSON string instead + */ +public class JsonStringHashMap extends LinkedHashMap { + + private static ObjectMapper mapper; + + static { + mapper = new ObjectMapper(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof Map)) { + return false; + } + Map other = (Map) obj; + if (this.size() != other.size()) { + return false; + } + for (K key : this.keySet()) { + if (this.get(key) == null ) { + if (other.get(key) == null) { + continue; + } else { + return false; + } + } + if ( ! this.get(key).equals(other.get(key))) { + return false; + } + } + return true; + } + + @Override + public final String toString() { + try { + return mapper.writeValueAsString(this); + } catch(JsonProcessingException e) { + throw new IllegalStateException("Cannot serialize hash map to JSON string", e); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java new file mode 100644 index 00000000000..dea433e99e8 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -0,0 +1,248 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.util; + +import java.util.AbstractMap; +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +import com.google.common.base.Function; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import io.netty.util.collection.IntObjectHashMap; +import io.netty.util.collection.IntObjectMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * An implementation of map that supports constant time look-up by a generic key or an ordinal. + * + * This class extends the functionality a regular {@link Map} with ordinal lookup support. + * Upon insertion an unused ordinal is assigned to the inserted (key, value) tuple. + * Upon update the same ordinal id is re-used while value is replaced. + * Upon deletion of an existing item, its corresponding ordinal is recycled and could be used by another item. + * + * For any instance with N items, this implementation guarantees that ordinals are in the range of [0, N). 
However, + * the ordinal assignment is dynamic and may change after an insertion or deletion. Consumers of this class are + * responsible for explicitly checking the ordinal corresponding to a key via + * {@link org.apache.arrow.vector.util.MapWithOrdinal#getOrdinal(Object)} before attempting to execute a lookup + * with an ordinal. + * + * @param key type + * @param value type + */ + +public class MapWithOrdinal implements Map { + private final static Logger logger = LoggerFactory.getLogger(MapWithOrdinal.class); + + private final Map> primary = Maps.newLinkedHashMap(); + private final IntObjectHashMap secondary = new IntObjectHashMap<>(); + + private final Map delegate = new Map() { + @Override + public boolean isEmpty() { + return size() == 0; + } + + @Override + public int size() { + return primary.size(); + } + + @Override + public boolean containsKey(Object key) { + return primary.containsKey(key); + } + + @Override + public boolean containsValue(Object value) { + return primary.containsValue(value); + } + + @Override + public V get(Object key) { + Entry pair = primary.get(key); + if (pair != null) { + return pair.getValue(); + } + return null; + } + + @Override + public V put(K key, V value) { + final Entry oldPair = primary.get(key); + // if key exists try replacing otherwise, assign a new ordinal identifier + final int ordinal = oldPair == null ? primary.size():oldPair.getKey(); + primary.put(key, new AbstractMap.SimpleImmutableEntry<>(ordinal, value)); + secondary.put(ordinal, value); + return oldPair==null ? null:oldPair.getValue(); + } + + @Override + public V remove(Object key) { + final Entry oldPair = primary.remove(key); + if (oldPair!=null) { + final int lastOrdinal = secondary.size(); + final V last = secondary.get(lastOrdinal); + // normalize mappings so that all numbers until primary.size() is assigned + // swap the last element with the deleted one + secondary.put(oldPair.getKey(), last); + primary.put((K) key, new AbstractMap.SimpleImmutableEntry<>(oldPair.getKey(), last)); + } + return oldPair==null ? null:oldPair.getValue(); + } + + @Override + public void putAll(Map m) { + throw new UnsupportedOperationException(); + } + + @Override + public void clear() { + primary.clear(); + secondary.clear(); + } + + @Override + public Set keySet() { + return primary.keySet(); + } + + @Override + public Collection values() { + return Lists.newArrayList(Iterables.transform(secondary.entries(), new Function, V>() { + @Override + public V apply(IntObjectMap.Entry entry) { + return Preconditions.checkNotNull(entry).value(); + } + })); + } + + @Override + public Set> entrySet() { + return Sets.newHashSet(Iterables.transform(primary.entrySet(), new Function>, Entry>() { + @Override + public Entry apply(Entry> entry) { + return new AbstractMap.SimpleImmutableEntry<>(entry.getKey(), entry.getValue().getValue()); + } + })); + } + }; + + /** + * Returns the value corresponding to the given ordinal + * + * @param id ordinal value for lookup + * @return an instance of V + */ + public V getByOrdinal(int id) { + return secondary.get(id); + } + + /** + * Returns the ordinal corresponding to the given key. 
+ * + * @param key key for ordinal lookup + * @return ordinal value corresponding to key if it exists or -1 + */ + public int getOrdinal(K key) { + Entry pair = primary.get(key); + if (pair != null) { + return pair.getKey(); + } + return -1; + } + + @Override + public int size() { + return delegate.size(); + } + + @Override + public boolean isEmpty() { + return delegate.isEmpty(); + } + + @Override + public V get(Object key) { + return delegate.get(key); + } + + /** + * Inserts the tuple (key, value) into the map extending the semantics of {@link Map#put} with automatic ordinal + * assignment. A new ordinal is assigned if key does not exists. Otherwise the same ordinal is re-used but the value + * is replaced. + * + * {@see java.util.Map#put} + */ + @Override + public V put(K key, V value) { + return delegate.put(key, value); + } + + @Override + public Collection values() { + return delegate.values(); + } + + @Override + public boolean containsKey(Object key) { + return delegate.containsKey(key); + } + + @Override + public boolean containsValue(Object value) { + return delegate.containsValue(value); + } + + /** + * Removes the element corresponding to the key if exists extending the semantics of {@link Map#remove} with ordinal + * re-cycling. The ordinal corresponding to the given key may be re-assigned to another tuple. It is important that + * consumer checks the ordinal value via {@link #getOrdinal(Object)} before attempting to look-up by ordinal. + * + * {@see java.util.Map#remove} + */ + @Override + public V remove(Object key) { + return delegate.remove(key); + } + + @Override + public void putAll(Map m) { + delegate.putAll(m); + } + + @Override + public void clear() { + delegate.clear(); + } + + @Override + public Set keySet() { + return delegate.keySet(); + } + + @Override + public Set> entrySet() { + return delegate.entrySet(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java new file mode 100644 index 00000000000..ec628b22c2d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/OversizedAllocationException.java @@ -0,0 +1,49 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.util; + + +/** + * An exception that is used to signal that allocation request in bytes is greater than the maximum allowed by + * {@link org.apache.arrow.memory.BufferAllocator#buffer(int) allocator}. + * + *
+ * <p>Operators should handle this exception to split the batch and later resume the execution on the next
+ * {@link RecordBatch#next() iteration}.</p>
+ * + */ +public class OversizedAllocationException extends RuntimeException { + public OversizedAllocationException() { + super(); + } + + public OversizedAllocationException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public OversizedAllocationException(String message, Throwable cause) { + super(message, cause); + } + + public OversizedAllocationException(String message) { + super(message); + } + + public OversizedAllocationException(Throwable cause) { + super(cause); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java new file mode 100644 index 00000000000..c2815614307 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/SchemaChangeRuntimeException.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.util; + + +public class SchemaChangeRuntimeException extends RuntimeException { + public SchemaChangeRuntimeException() { + super(); + } + + public SchemaChangeRuntimeException(String message, Throwable cause, boolean enableSuppression, boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public SchemaChangeRuntimeException(String message, Throwable cause) { + super(message, cause); + } + + public SchemaChangeRuntimeException(String message) { + super(message); + } + + public SchemaChangeRuntimeException(Throwable cause) { + super(cause); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java new file mode 100644 index 00000000000..3919f0606cb --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java @@ -0,0 +1,621 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
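
Per the javadoc above, an oversized request signals a recoverable condition. One way a consumer might react (a sketch; the vector handle and the halving policy are hypothetical, not from the patch):

    try {
      vector.allocateNew(rowCount);     // hypothetical allocation entry point
    } catch (OversizedAllocationException e) {
      vector.allocateNew(rowCount / 2); // split the batch, resume on the next iteration
    }
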
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
new file mode 100644
index 00000000000..3919f0606cb
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/Text.java
@@ -0,0 +1,621 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.util;
+
+import java.io.DataInput;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CodingErrorAction;
+import java.nio.charset.MalformedInputException;
+import java.text.CharacterIterator;
+import java.text.StringCharacterIterator;
+import java.util.Arrays;
+
+import com.fasterxml.jackson.core.JsonGenerationException;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+import com.fasterxml.jackson.databind.ser.std.StdSerializer;
+
+/**
+ * A simplified byte wrapper similar to Hadoop's Text class without all the dependencies. Lifted from Hadoop 2.7.1.
+ */
+@JsonSerialize(using = Text.TextSerializer.class)
+public class Text {
+
+  private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
+      new ThreadLocal<CharsetEncoder>() {
+        @Override
+        protected CharsetEncoder initialValue() {
+          return Charset.forName("UTF-8").newEncoder().
+              onMalformedInput(CodingErrorAction.REPORT).
+              onUnmappableCharacter(CodingErrorAction.REPORT);
+        }
+      };
+
+  private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
+      new ThreadLocal<CharsetDecoder>() {
+        @Override
+        protected CharsetDecoder initialValue() {
+          return Charset.forName("UTF-8").newDecoder().
+              onMalformedInput(CodingErrorAction.REPORT).
+              onUnmappableCharacter(CodingErrorAction.REPORT);
+        }
+      };
+
+  private static final byte[] EMPTY_BYTES = new byte[0];
+
+  private byte[] bytes;
+  private int length;
+
+  public Text() {
+    bytes = EMPTY_BYTES;
+  }
+
+  /**
+   * Construct from a string.
+   */
+  public Text(String string) {
+    set(string);
+  }
+
+  /** Construct from another text. */
+  public Text(Text utf8) {
+    set(utf8);
+  }
+
+  /**
+   * Construct from a byte array.
+   */
+  public Text(byte[] utf8) {
+    set(utf8);
+  }
+
+  /**
+   * Get a copy of the bytes that is exactly the length of the data. See {@link #getBytes()} for faster access to the
+   * underlying array.
+   */
+  public byte[] copyBytes() {
+    byte[] result = new byte[length];
+    System.arraycopy(bytes, 0, result, 0, length);
+    return result;
+  }
+
+  /**
+   * Returns the raw bytes; however, only data up to {@link #getLength()} is valid. Please use {@link #copyBytes()} if
+   * you need the returned array to be precisely the length of the data.
+   */
+  public byte[] getBytes() {
+    return bytes;
+  }
+
+  /** Returns the number of bytes in the byte array. */
+  public int getLength() {
+    return length;
+  }
+
+  /**
+   * Returns the Unicode Scalar Value (32-bit integer value) for the character at position. Note that this
+   * method avoids using the converter or doing String instantiation.
+   *
+   * @return the Unicode scalar value at position or -1 if the position is invalid or points to a trailing byte
+   */
+  public int charAt(int position) {
+    if (position > this.length) {
+      return -1; // too long
+    }
+    if (position < 0) {
+      return -1; // duh.
+    }
+
+    ByteBuffer bb = (ByteBuffer) ByteBuffer.wrap(bytes).position(position);
+    return bytesToCodePoint(bb.slice());
+  }
+
+  public int find(String what) {
+    return find(what, 0);
+  }
+
+  /**
+   * Finds any occurrence of what in the backing buffer, starting at position start. The
+   * starting position is measured in bytes and the return value is in terms of byte position in the buffer. The
+   * backing buffer is not converted to a string for this operation.
+   *
+   * @return byte position of the first occurrence of the search string in the UTF-8 buffer or -1 if not found
+   */
+  public int find(String what, int start) {
+    try {
+      ByteBuffer src = ByteBuffer.wrap(this.bytes, 0, this.length);
+      ByteBuffer tgt = encode(what);
+      byte b = tgt.get();
+      src.position(start);
+
+      while (src.hasRemaining()) {
+        if (b == src.get()) { // matching first byte
+          src.mark(); // save position in loop
+          tgt.mark(); // save position in target
+          boolean found = true;
+          int pos = src.position() - 1;
+          while (tgt.hasRemaining()) {
+            if (!src.hasRemaining()) { // src expired first
+              tgt.reset();
+              src.reset();
+              found = false;
+              break;
+            }
+            if (!(tgt.get() == src.get())) {
+              tgt.reset();
+              src.reset();
+              found = false;
+              break; // no match
+            }
+          }
+          if (found) {
+            return pos;
+          }
+        }
+      }
+      return -1; // not found
+    } catch (CharacterCodingException e) {
+      // can't get here
+      e.printStackTrace();
+      return -1;
+    }
+  }
+
+  /**
+   * Set to contain the contents of a string.
+   */
+  public void set(String string) {
+    try {
+      ByteBuffer bb = encode(string, true);
+      bytes = bb.array();
+      length = bb.limit();
+    } catch (CharacterCodingException e) {
+      throw new RuntimeException("Should not have happened", e);
+    }
+  }
+
+  /**
+   * Set to a utf8 byte array.
+   */
+  public void set(byte[] utf8) {
+    set(utf8, 0, utf8.length);
+  }
+
+  /** Copy a text. */
+  public void set(Text other) {
+    set(other.getBytes(), 0, other.getLength());
+  }
+
+  /**
+   * Set the Text to a range of bytes.
+   *
+   * @param utf8
+   *          the data to copy from
+   * @param start
+   *          the first position of the new string
+   * @param len
+   *          the number of bytes of the new string
+   */
+  public void set(byte[] utf8, int start, int len) {
+    setCapacity(len, false);
+    System.arraycopy(utf8, start, bytes, 0, len);
+    this.length = len;
+  }
+
+  /**
+   * Append a range of bytes to the end of the given text.
+   *
+   * @param utf8
+   *          the data to copy from
+   * @param start
+   *          the first position to append from utf8
+   * @param len
+   *          the number of bytes to append
+   */
+  public void append(byte[] utf8, int start, int len) {
+    setCapacity(length + len, true);
+    System.arraycopy(utf8, start, bytes, length, len);
+    length += len;
+  }
+
+  /**
+   * Clear the string to empty.
+   *
+   * Note: For performance reasons, this call does not clear the underlying byte array that is retrievable via
+   * {@link #getBytes()}. In order to free the byte-array memory, call {@link #set(byte[])} with an empty byte array
+   * (for example, new byte[0]).
+   */
+  public void clear() {
+    length = 0;
+  }
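+
+  // Editorial worked example (not part of the original patch). Starting from
+  //   Text t = new Text("star");          // bytes = {0x73, 0x74, 0x61, 0x72}, length = 4
+  //   t.append(new byte[] {0x73}, 0, 1);  // now "stars"; setCapacity() grows the buffer
+  //   t.find("ar")                        // == 2, the byte offset where "ar" begins
+  //   t.charAt(2)                         // == 0x61, the code point for 'a'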
+
+  /*
+   * Sets the capacity of this Text object to at least len bytes. If the current buffer is longer,
+   * then the capacity and existing content of the buffer are unchanged. If len is larger than the current
+   * capacity, the Text object's capacity is increased to match.
+   *
+   * @param len the number of bytes we need
+   *
+   * @param keepData should the old data be kept
+   */
+  private void setCapacity(int len, boolean keepData) {
+    if (bytes == null || bytes.length < len) {
+      if (bytes != null && keepData) {
+        bytes = Arrays.copyOf(bytes, Math.max(len, length << 1));
+      } else {
+        bytes = new byte[len];
+      }
+    }
+  }
+
+  /**
+   * Convert text back to string.
+   *
+   * @see java.lang.Object#toString()
+   */
+  @Override
+  public String toString() {
+    try {
+      return decode(bytes, 0, length);
+    } catch (CharacterCodingException e) {
+      throw new RuntimeException("Should not have happened", e);
+    }
+  }
+
+  /**
+   * Read a Text object whose length is already known. This allows creating Text from a stream which uses a different
+   * serialization format.
+   */
+  public void readWithKnownLength(DataInput in, int len) throws IOException {
+    setCapacity(len, false);
+    in.readFully(bytes, 0, len);
+    length = len;
+  }
+
+  /** Returns true iff o is a Text with the same contents. */
+  @Override
+  public boolean equals(Object o) {
+    if (!(o instanceof Text)) {
+      return false;
+    }
+
+    final Text that = (Text) o;
+    if (this.getLength() != that.getLength()) {
+      return false;
+    }
+
+    // compare only the valid prefix of each backing array
+    for (int i = 0; i < length; i++) {
+      if (this.bytes[i] != that.bytes[i]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Override
+  public int hashCode() {
+    // must be consistent with equals(): hash only the first 'length' valid bytes
+    int hash = 1;
+    for (int i = 0; i < length; i++) {
+      hash = (31 * hash) + (int) bytes[i];
+    }
+    return hash;
+  }
+
+  // STATIC UTILITIES FROM HERE DOWN
+
+  /**
+   * Converts the provided byte array to a String using the UTF-8 encoding. Malformed input is replaced with a
+   * default value.
+   */
+  public static String decode(byte[] utf8) throws CharacterCodingException {
+    return decode(ByteBuffer.wrap(utf8), true);
+  }
+
+  public static String decode(byte[] utf8, int start, int length)
+      throws CharacterCodingException {
+    return decode(ByteBuffer.wrap(utf8, start, length), true);
+  }
+
+  /**
+   * Converts the provided byte array to a String using the UTF-8 encoding. If replace is true, then
+   * malformed input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a
+   * MalformedInputException.
+   */
+  public static String decode(byte[] utf8, int start, int length, boolean replace)
+      throws CharacterCodingException {
+    return decode(ByteBuffer.wrap(utf8, start, length), replace);
+  }
+
+  private static String decode(ByteBuffer utf8, boolean replace)
+      throws CharacterCodingException {
+    CharsetDecoder decoder = DECODER_FACTORY.get();
+    if (replace) {
+      decoder.onMalformedInput(CodingErrorAction.REPLACE);
+      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    }
+    String str = decoder.decode(utf8).toString();
+    // set decoder back to its default value: REPORT
+    if (replace) {
+      decoder.onMalformedInput(CodingErrorAction.REPORT);
+      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+    }
+    return str;
+  }
+
+  /**
+   * Converts the provided String to bytes using the UTF-8 encoding. If the input is malformed, invalid chars are
+   * replaced with a default value.
+   *
+   * @return ByteBuffer: the bytes are stored at ByteBuffer.array() and the valid length is ByteBuffer.limit()
+   */
+  public static ByteBuffer encode(String string)
+      throws CharacterCodingException {
+    return encode(string, true);
+  }
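+
+  // Editorial note (not part of the original patch): encode()/decode() round-trip.
+  //   ByteBuffer bb = Text.encode("\u20AC");    // euro sign -> 0xE2 0x82 0xAC
+  //   bb.limit()                                // == 3 bytes of valid data
+  //   Text.decode(bb.array(), 0, bb.limit())    // == "\u20AC" again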
+
+  /**
+   * Converts the provided String to bytes using the UTF-8 encoding. If replace is true, then malformed
+   * input is replaced with the substitution character, which is U+FFFD. Otherwise the method throws a
+   * MalformedInputException.
+   *
+   * @return ByteBuffer: the bytes are stored at ByteBuffer.array() and the valid length is ByteBuffer.limit()
+   */
+  public static ByteBuffer encode(String string, boolean replace)
+      throws CharacterCodingException {
+    CharsetEncoder encoder = ENCODER_FACTORY.get();
+    if (replace) {
+      encoder.onMalformedInput(CodingErrorAction.REPLACE);
+      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
+    }
+    ByteBuffer bytes =
+        encoder.encode(CharBuffer.wrap(string.toCharArray()));
+    if (replace) {
+      encoder.onMalformedInput(CodingErrorAction.REPORT);
+      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+    }
+    return bytes;
+  }
+
+  public static final int DEFAULT_MAX_LEN = 1024 * 1024;
+
+  // states for validateUTF8
+
+  private static final int LEAD_BYTE = 0;
+
+  private static final int TRAIL_BYTE_1 = 1;
+
+  private static final int TRAIL_BYTE = 2;
+
+  /**
+   * Check if a byte array contains valid UTF-8.
+   *
+   * @param utf8
+   *          byte array
+   * @throws MalformedInputException
+   *           if the byte array contains invalid UTF-8
+   */
+  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
+    validateUTF8(utf8, 0, utf8.length);
+  }
+
+  /**
+   * Check to see if a byte array is valid UTF-8.
+   *
+   * @param utf8
+   *          the array of bytes
+   * @param start
+   *          the offset of the first byte in the array
+   * @param len
+   *          the length of the byte sequence
+   * @throws MalformedInputException
+   *           if the byte array contains invalid bytes
+   */
+  public static void validateUTF8(byte[] utf8, int start, int len)
+      throws MalformedInputException {
+    int count = start;
+    int leadByte = 0;
+    int length = 0;
+    int state = LEAD_BYTE;
+    while (count < start + len) {
+      int aByte = utf8[count] & 0xFF;
+
+      switch (state) {
+      case LEAD_BYTE:
+        leadByte = aByte;
+        length = bytesFromUTF8[aByte];
+
+        switch (length) {
+        case 0: // check for ASCII
+          if (leadByte > 0x7F) {
+            throw new MalformedInputException(count);
+          }
+          break;
+        case 1:
+          if (leadByte < 0xC2 || leadByte > 0xDF) {
+            throw new MalformedInputException(count);
+          }
+          state = TRAIL_BYTE_1;
+          break;
+        case 2:
+          if (leadByte < 0xE0 || leadByte > 0xEF) {
+            throw new MalformedInputException(count);
+          }
+          state = TRAIL_BYTE_1;
+          break;
+        case 3:
+          if (leadByte < 0xF0 || leadByte > 0xF4) {
+            throw new MalformedInputException(count);
+          }
+          state = TRAIL_BYTE_1;
+          break;
+        default:
+          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
+          // or if < 0 we got a trail byte in the lead byte position
+          throw new MalformedInputException(count);
+        } // switch (length)
+        break;
+
+      case TRAIL_BYTE_1:
+        if (leadByte == 0xF0 && aByte < 0x90) {
+          throw new MalformedInputException(count);
+        }
+        if (leadByte == 0xF4 && aByte > 0x8F) {
+          throw new MalformedInputException(count);
+        }
+        if (leadByte == 0xE0 && aByte < 0xA0) {
+          throw new MalformedInputException(count);
+        }
+        if (leadByte == 0xED && aByte > 0x9F) {
+          throw new MalformedInputException(count);
+        }
+        // falls through to regular trail-byte test!!
+      case TRAIL_BYTE:
+        if (aByte < 0x80 || aByte > 0xBF) {
+          throw new MalformedInputException(count);
+        }
+        if (--length == 0) {
+          state = LEAD_BYTE;
+        } else {
+          state = TRAIL_BYTE;
+        }
+        break;
+      default:
+        break;
+      } // switch (state)
+      count++;
+    }
+  }
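+
+  // Editorial worked example (not part of the original patch): validating the
+  // 3-byte sequence 0xE2 0x82 0xAC (U+20AC, the euro sign) with the state machine above:
+  //   LEAD_BYTE:    0xE2 -> bytesFromUTF8[0xE2] == 2, 0xE0 <= 0xE2 <= 0xEF -> TRAIL_BYTE_1
+  //   TRAIL_BYTE_1: 0x82 is in [0x80, 0xBF], --length == 1               -> TRAIL_BYTE
+  //   TRAIL_BYTE:   0xAC is in [0x80, 0xBF], --length == 0               -> LEAD_BYTE (done)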
+
+  /**
+   * Magic numbers for UTF-8. These are the number of bytes that follow a given lead byte. Trailing bytes have
+   * the value -1. The values 4 and 5 are presented in this table, even though valid UTF-8 cannot include the five and
+   * six byte sequences.
+   */
+  static final int[] bytesFromUTF8 = {
+      // 0x00 - 0x7F: ASCII lead bytes, no continuation bytes follow
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      // 0x80 - 0xBF: trail bytes, invalid in the lead position
+      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+      // 0xC0 - 0xDF: two-byte sequences, one trail byte
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      // 0xE0 - 0xEF: three-byte sequences, two trail bytes
+      2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+      // 0xF0 - 0xFF: four-byte sequences, plus the invalid five/six-byte lead bytes
+      3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
+
+  /**
+   * Returns the next code point at the current position in the buffer. The buffer's position will be incremented. Any
+   * mark set on this buffer will be changed by this method!
+   */
+  public static int bytesToCodePoint(ByteBuffer bytes) {
+    bytes.mark();
+    byte b = bytes.get();
+    bytes.reset();
+    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
+    if (extraBytesToRead < 0) {
+      return -1; // trailing byte!
+    }
+    int ch = 0;
+
+    // each case falls through to accumulate the remaining bytes
+    switch (extraBytesToRead) {
+    case 5:
+      ch += (bytes.get() & 0xFF);
+      ch <<= 6; /* remember, illegal UTF-8 */
+    case 4:
+      ch += (bytes.get() & 0xFF);
+      ch <<= 6; /* remember, illegal UTF-8 */
+    case 3:
+      ch += (bytes.get() & 0xFF);
+      ch <<= 6;
+    case 2:
+      ch += (bytes.get() & 0xFF);
+      ch <<= 6;
+    case 1:
+      ch += (bytes.get() & 0xFF);
+      ch <<= 6;
+    case 0:
+      ch += (bytes.get() & 0xFF);
+    }
+    ch -= offsetsFromUTF8[extraBytesToRead];
+
+    return ch;
+  }
+
+  static final int[] offsetsFromUTF8 =
+      { 0x00000000, 0x00003080,
+        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
+
+  /**
+   * For the given string, returns the number of UTF-8 bytes required to encode the string.
+   *
+   * @param string
+   *          text to encode
+   * @return number of UTF-8 bytes required to encode
+   */
+  public static int utf8Length(String string) {
+    CharacterIterator iter = new StringCharacterIterator(string);
+    char ch = iter.first();
+    int size = 0;
+    while (ch != CharacterIterator.DONE) {
+      if ((ch >= 0xD800) && (ch < 0xDC00)) {
+        // surrogate pair?
+        char trail = iter.next();
+        if ((trail > 0xDBFF) && (trail < 0xE000)) {
+          // valid pair
+          size += 4;
+        } else {
+          // invalid pair
+          size += 3;
+          iter.previous(); // rewind one
+        }
+      } else if (ch < 0x80) {
+        size++;
+      } else if (ch < 0x800) {
+        size += 2;
+      } else {
+        // ch < 0x10000, that is, the largest char value
+        size += 3;
+      }
+      ch = iter.next();
+    }
+    return size;
+  }
+
+  public static class TextSerializer extends StdSerializer<Text> {
+
+    public TextSerializer() {
+      super(Text.class);
+    }
+
+    @Override
+    public void serialize(Text text, JsonGenerator jsonGenerator, SerializerProvider serializerProvider)
+        throws IOException, JsonGenerationException {
+      jsonGenerator.writeString(text.toString());
+    }
+  }
+}
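Putting the class together — an editorial usage sketch (not part of the patch), exercising only methods defined above:

```java
import org.apache.arrow.vector.util.Text;

public class TextDemo {
  public static void main(String[] args) throws Exception {
    Text t = new Text("arrow");
    System.out.println(t.getLength());        // 5 UTF-8 bytes
    System.out.println(t.find("row"));        // 2: byte offset of the match
    t.append(Text.encode("!").array(), 0, 1); // in-place append of one byte
    System.out.println(t);                    // "arrow!"
    Text.validateUTF8(t.getBytes(), 0, t.getLength()); // throws if malformed
  }
}
```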
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java
new file mode 100644
index 00000000000..6e68d552262
--- /dev/null
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/TransferPair.java
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.arrow.vector.util;
+
+import org.apache.arrow.vector.ValueVector;
+
+public interface TransferPair {
+  public void transfer();
+  public void splitAndTransfer(int startIndex, int length);
+  public ValueVector getTo();
+  public void copyValueSafe(int from, int to);
+}
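TransferPair expresses moving a vector's buffers to a target vector without copying the data. The patch defines only the interface; to make the contract concrete, here is a self-contained toy analogue of the pattern (plain arrays standing in for buffers; none of these names come from the patch):

```java
import java.util.Arrays;

// Toy analogue of TransferPair over int[] "buffers": transfer() moves
// ownership without copying; splitAndTransfer() takes only a sub-range.
class ToyTransferPair {
  private int[] fromBuf;
  private int[] toBuf;

  ToyTransferPair(int[] fromBuf) {
    this.fromBuf = fromBuf;
  }

  void transfer() {
    toBuf = fromBuf;  // hand the reference over: a zero-copy move
    fromBuf = null;   // the source no longer owns the buffer
  }

  void splitAndTransfer(int startIndex, int length) {
    toBuf = Arrays.copyOfRange(fromBuf, startIndex, startIndex + length);
  }

  int[] getTo() {
    return toBuf;
  }
}
```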
From 16e44e3d456219c48595142d0a6814c9c950d30c Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Tue, 16 Feb 2016 16:02:46 -0800
Subject: [PATCH 004/210] ARROW-3: This patch includes a WIP draft
 specification document for the physical Arrow memory layout produced over a
 series of discussions amongst the to-be Arrow committers during late 2015.
 There are also a few small PNG diagrams that illustrate some of the Arrow
 layout concepts.
---
 format/Layout.md                           | 253 +++++++++++++++++++++
 format/README.md                           |   5 +
 format/diagrams/layout-dense-union.png     | Bin 0 -> 47999 bytes
 format/diagrams/layout-list-of-list.png    | Bin 0 -> 40105 bytes
 format/diagrams/layout-list-of-struct.png  | Bin 0 -> 60600 bytes
 format/diagrams/layout-list.png            | Bin 0 -> 15906 bytes
 format/diagrams/layout-primitive-array.png | Bin 0 -> 10907 bytes
 format/diagrams/layout-sparse-union.png    | Bin 0 -> 43020 bytes
 8 files changed, 258 insertions(+)
 create mode 100644 format/Layout.md
 create mode 100644 format/README.md
 create mode 100644 format/diagrams/layout-dense-union.png
 create mode 100644 format/diagrams/layout-list-of-list.png
 create mode 100644 format/diagrams/layout-list-of-struct.png
 create mode 100644 format/diagrams/layout-list.png
 create mode 100644 format/diagrams/layout-primitive-array.png
 create mode 100644 format/diagrams/layout-sparse-union.png

diff --git a/format/Layout.md b/format/Layout.md
new file mode 100644
index 00000000000..c393163bf89
--- /dev/null
+++ b/format/Layout.md
@@ -0,0 +1,253 @@
+# Arrow: Physical memory layout
+
+## Definitions / Terminology
+
+Since different projects have used different words to describe various
+concepts, here is a small glossary to help disambiguate.
+
+* Array: a sequence of values with known length, all having the same type.
+* Slot or array slot: a single logical value in an array of some particular
+  data type.
+* Contiguous memory region: a sequential virtual address space with a given
+  length. Any byte can be reached via a single pointer offset less than the
+  region’s length.
+* Primitive type: a data type that occupies a fixed-size memory slot specified
+  in bit width or byte width.
+* Nested or parametric type: a data type whose full structure depends on one or
+  more other child relative types. Two fully-specified nested types are equal
+  if and only if their child types are equal. For example, `List<U>` is
+  distinct from `List<V>` iff U and V are different relative types.
+* Relative type or simply type (unqualified): either a specific primitive type
+  or a fully-specified nested type. When we say slot we mean a relative type
+  value, not necessarily any physical storage region.
+* Logical type: a data type that is implemented using some relative (physical)
+  type. For example, a Decimal value stored in 16 bytes could be stored in a
+  primitive array with slot size 16 bytes. Similarly, strings can be stored as
+  `List<1-byte>`.
+* Parent and child arrays: names to express relationships between physical
+  value arrays in a nested type structure. For example, a `List`-type parent
+  array has a T-type array as its child (see more on lists below).
+* Leaf node or leaf: a primitive value array that may or may not be a child
+  array of some array with a nested type.
+
+## Requirements, goals, and non-goals
+
+Base requirements
+
+* A physical memory layout enabling zero-deserialization data interchange
+  amongst a variety of systems handling flat and nested columnar data,
+  including such systems as Spark, Drill, Impala, Kudu, Ibis, ODBC protocols,
+  and proprietary systems that utilize the open source components.
+* All array slots are accessible in constant time, with complexity growing
+  linearly in the nesting level.
+* Capable of representing fully-materialized and decoded / decompressed
+  Parquet data.
+* All leaf nodes (primitive value arrays) use contiguous memory regions.
+* Each relative type can be nullable or non-nullable.
+* Arrays are immutable once created. Implementations can provide APIs to
+  mutate an array, but applying mutations will require a new array data
+  structure to be built.
+* Arrays are relocatable (e.g. for RPC/transient storage) without pointer
+  swizzling. Another way of putting this is that contiguous memory regions can
+  be migrated to a different address space (e.g. via a memcpy-type of
+  operation) without altering their contents.
+
+## Goals (for this document)
+
+* To describe relative types (physical value types and a preliminary set of
+  nested types) sufficient for an unambiguous implementation.
+* Memory layout and random access patterns for each relative type.
+* Null representation for nullable types.
+
+## Non-goals (for this document)
+
+* To enumerate or specify logical types that can be implemented as primitive
+  (fixed-width) value types. For example: signed and unsigned integers,
+  floating point numbers, boolean, exact decimals, date and time types,
+  CHAR(K), VARCHAR(K), etc.
+* To specify standardized metadata or a data layout for RPC or transient file
+  storage.
+* To define a selection or masking vector construct.
+* Implementation-specific details.
+* Details of a user or developer C/C++/Java API.
+* Any “table” structure composed of named arrays each having their own type or
+  any other structure that composes arrays.
+* Any memory management or reference counting subsystem.
+* To enumerate or specify types of encodings or compression support.
+
+## Array lengths
+
+Any array has a known and fixed length, stored as a 32-bit signed integer, so a
+maximum of 2^31 - 1 elements. We choose a signed int32 for a couple of reasons:
+
+* Enhance compatibility with Java and client languages which may have varying
+  quality of support for unsigned integers.
+* To encourage developers to compose smaller arrays (each of which contains
+  contiguous memory in its leaf nodes) to create larger array structures
+  possibly exceeding 2^31 - 1 elements, as opposed to allocating very large
+  contiguous memory blocks.
+
+## Nullable and non-nullable arrays
+
+Any relative type can be nullable or non-nullable.
+
+Nullable arrays have a contiguous memory buffer, known as the null bitmask,
+whose length is large enough to have 1 bit for each array slot. Whether any
+array slot is null is encoded in the respective bits of this bitmask, i.e.:
+
+```
+is_null[j] -> bitmask[j / 8] & (1 << (j % 8))
+```
+
+Physically, non-nullable (NN) arrays do not have a null bitmask.
+
+For nested types, if the top-level nested type is nullable, it has its own
+bitmask regardless of whether the child types are nullable.
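+
+As an editorial illustration (not part of the original spec text), the bitmask
+test from the formula above, written in Java:
+
+```java
+// true iff slot j is null; in this draft a set bit marks a null slot
+static boolean isNull(byte[] bitmask, int j) {
+  return (bitmask[j / 8] & (1 << (j % 8))) != 0;
+}
+```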
+
+## Primitive value arrays
+
+A primitive value array represents a fixed-length array of values each having
+the same physical slot width typically measured in bytes, though the spec also
+provides for bit-packed types (e.g. boolean values encoded in bits).
+
+Internally, the array contains a contiguous memory buffer whose total size is
+equal to the slot width multiplied by the array length. For bit-packed types,
+the size is rounded up to the nearest byte.
+
+The associated null bitmask (for nullable types) is contiguously allocated (as
+described above) but does not need to be adjacent in memory to the values
+buffer.
+
+(diagram not to scale)
+
+<img src="diagrams/layout-primitive-array.png"/>
+
+## List type
+
+List is a nested type in which each array slot contains a variable-size
+sequence of values all having the same relative type (heterogeneity can be
+achieved through unions, described later).
+
+A list type is specified like `List<T>`, where `T` is any relative type
+(primitive or nested).
+
+A list-array is represented by the combination of the following:
+
+* A values array, a child array of type T. T may also be a nested type.
+* An offsets array containing 32-bit signed integers with length equal to the
+  length of the top-level array plus one. Note that this limits the size of
+  the values array to 2^31 - 1.
+
+The offsets array encodes a start position in the values array, and the length
+of the value in each slot is computed using the first difference with the next
+element in the offsets array. For example, the position and length of slot j
+are computed as:
+
+```
+slot_position = offsets[j]
+slot_length = offsets[j + 1] - offsets[j]  // (for 0 <= j < length)
+```
+
+The first value in the offsets array is 0, and the last element is the length
+of the values array.
+
+Let’s consider an example, the type `List<Char>`, where Char is a 1-byte
+logical type.
+
+For an array of length 3 with respective values:
+
+[[‘j’, ‘o’, ‘e’], null, [‘m’, ‘a’, ‘r’, ‘k’]]
+
+we have the following offsets and values arrays:
+
+<img src="diagrams/layout-list.png"/>
+
+Let’s consider an array of a nested type, `List<List<Char>>`:
+
+<img src="diagrams/layout-list-of-list.png"/>
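+
+As an editorial sketch (not part of the original spec text), random access
+into the `List<Char>` example above; the offsets shown are one valid encoding,
+with the null slot taking zero space:
+
+```java
+// offsets for [['j','o','e'], null, ['m','a','r','k']] -- length + 1 entries
+int[] offsets = {0, 3, 3, 7};
+byte[] values = {'j', 'o', 'e', 'm', 'a', 'r', 'k'};  // the child Char array
+
+int j = 2;                                      // slot 2 -> "mark"
+int slotPosition = offsets[j];                  // 3
+int slotLength = offsets[j + 1] - offsets[j];   // 4
+```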
+
+## Struct type
+
+A struct is a nested type parameterized by an ordered sequence of relative
+types (which can all be distinct), called its fields.
+
+Typically the fields have names, but the names and their types are part of the
+type metadata, not the physical memory layout.
+
+A struct does not have any additional allocated physical storage.
+
+Physically, a struct type has one child array for each field.
+
+For example, the struct (field names shown here as strings for illustration
+purposes)
+
+```
+Struct [nullable] <
+  name: String (= List<char>) [nullable],
+  age: Int32 [not-nullable]
+>
+```
+
+has two child arrays, one List<char> array (layout as above) and one
+non-nullable 4-byte physical value array having Int32 (not-null) logical
+type. Here is a diagram showing the full physical layout of this struct:
+
+<img src="diagrams/layout-list-of-struct.png"/>
+
+While a struct does not have physical storage for each of its semantic slots
+(i.e. each scalar C-like struct), an entire struct slot can be set to null via
+the bitmask. Whether each of the child field arrays can have null values
+depends on whether or not the respective relative type is nullable.
+
+## Dense union type
+
+A dense union is semantically similar to a struct, and contains an ordered
+sequence of relative types. While a struct contains multiple arrays, a union
+is semantically a single array in which each slot can have a different type.
+
+The union types may be named, but like structs this will be a matter of the
+metadata and will not affect the physical memory layout.
+
+We define two distinct union types that are optimized for different use
+cases. The first, the dense union, represents a mixed-type array with 6 bytes
+of overhead for each value. Its physical layout is as follows:
+
+* One child array for each relative type.
+* Types array: an array of unsigned integers, enumerated from 0 corresponding
+  to each type, with the smallest byte width capable of representing the
+  number of types in the union.
+* Offsets array: an array of signed int32 values indicating the relative
+  offset into the respective child array for the type in a given slot. The
+  respective offsets for each child value array must be in order / increasing.
+
+Alternate proposal (TBD): the types and offset values may be packed into an
+int48 with 2 bytes for the type and 4 bytes for the offset.
+
+Critically, the dense union allows for minimal overhead in the ubiquitous
+union-of-structs-with-non-overlapping-fields use case (`Union<s1, s2, s3>`).
+
+Here is a diagram of an example dense union:
+
+<img src="diagrams/layout-dense-union.png"/>
+
+## Sparse union type
+
+A sparse union has the same structure as a dense union, with the omission of
+the offsets array. In this case, the child arrays are each equal in length to
+the length of the union. This is analogous to a large struct in which all
+fields are nullable.
+
+While a sparse union may use significantly more space compared with a dense
+union, it has some advantages that may be desirable in certain use cases:
+
+* More amenable to vectorized expression evaluation in some use cases.
+* Equal-length arrays can be interpreted as a union by only defining the types
+  array.
+
+<img src="diagrams/layout-sparse-union.png"/>
+
+Note that nested types in a sparse union must be internally consistent
+(e.g. see the List in the diagram), i.e. random access at any index j yields
+the correct value.
+
+## References
+
+Drill docs https://drill.apache.org/docs/value-vectors/
diff --git a/format/README.md b/format/README.md
new file mode 100644
index 00000000000..1120e6282a5
--- /dev/null
+++ b/format/README.md
@@ -0,0 +1,5 @@
+## Arrow specification documents
+
+> **Work-in-progress specification documents**. These are discussion documents
+> created by the Arrow developers during late 2015 and in no way represent a
+> finalized specification.
diff --git a/format/diagrams/layout-dense-union.png b/format/diagrams/layout-dense-union.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f1f3811bf0056defe19abf494afcaba1bedbb77
GIT binary patch
literal 47999
[base85-encoded PNG data omitted]

literal 0
HcmV?d00001

diff --git a/format/diagrams/layout-list-of-list.png b/format/diagrams/layout-list-of-list.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bc00784641ab3cb3c3275ce1e52e7ccd3b2ec37
GIT binary patch
literal 40105
[base85-encoded PNG data omitted; the section is truncated mid-blob]

$Tz|>{&V05 zHCYFBqf=qlzjyub$N0ZF8#toNooGV*XM-F^O?%}+57&93yg3r$!mwgFFA-GfA`LCD zDPn@#^?mL+ZRcMNRLrZ5`YrWYt&;H=rF{knb zVp2*?5Cc1$e`n%AVSn)Tm-pEa_u1x<)o6}L#_xhw$c7O9j?mXbZHV>xkS19;l8`fQ z&f`KCGQ5$fFky0vTwoLVzZPEk4p^u9kQC3~`~N*hR-7#r!G-cO?As3)lRM*ua>eO2JtuXO@`S1i^$omew=Rw4JwbifkukR+G z^@Q{crg2(zg<_0d&}Qp`kqjKV|GXDFLg_>{LISVflF#G9Jz?ARYJ_U1z-lNZ z<6OF3>(1#i=P*SG3XufW_x{T{yQ6nC?b=ploQB_FHM(H%^u4NB()B1_Upszo^15CL zDN!rIEj78A)M(qU9(cGrQ`GMekHFR1l!zpZ4ZKQYsA#U1!FlYJ?u~jbeVRCQce>!V zbTCnbWn=nQj0a*dWTG(I^v_!Huqk0yV9_m(^+WbW*}d15yvbgpzpb+-I67r%yZfnw z90slEb~xhro+BUe4C!dG<<_D1dA;+Fa`i#OTGT?53*TO8jZ&==laVC&C6Ss&M07BPnca$DL^A6C;ARi~@4;D$;)}Ll_3* znMkn=?5Z>wX5_#@{nFbSGkNcDBAePVuz)7dqAOZ|7Xu@_yxpIvF;80|b)7a$V#10< zYQi1($Q)8xKO2TctIgf#D~oRzr?IvwhNa#h_d_h?_F~U`QoUFeG24BydZ+WVX>J0J zzFo^jOzKa(H^mSyh8^1e4z5^ZGF;dTL5!jy$7tniOE<2L0psfPSy`4AlM1Ki!}n_+ zRtDklY8k&;{i@>y=5MJlpzD1(V{TPi1y3;An)Bd5kg;4n%)6Ho3vO| z?ZPax1M^no2#?pCO+7i$8?UWQH#18bO~w1)7h2*JB-H%{$!`eRD?ae($Le^{4NZe?!fTFHr)!8Qh8;cx-X_@=Z*+|#G|?`Zad+5nPxmW3Q3IZ$xkm6YyL6&(*V?Bo{uaX>W@uOqXF%7o ze_)91j#Y)YoKFbXZp5mpLKEc(T^Uf{LaSm>4ybS&GS*Tguo0LN=}*_9h3fJT>gJf$ zvTW)XaC2e5KI_$DdD}cqt5{habYEI#bMr2z1Q%;oM=YrXtZw60R-RckdG+1aQQPih z$?fh~0c3epv7s$>>BF06BcDUNuIBHIi#&avK%Y2{C2eH)rWiT3*Eov9?NE21?P5x& zkUsRTYUgvh_AYI<*Xn?Th!f9R#MaLynf6~g*-3EHK{eH5)yEb2Nn{d`O{q!-85+Gc zjTsaAs_wJEICUV`egb+2PSWUIP^d{wbbK}|gY-CTQO2wk{7YubYSXxv_RH-m3OwDh zGiK>FJyFkf3KcTt`st(p7*eY5%Rf_1?+f+KZrZ{ag>Q#y8ZCKU#{+SbQ5X3=HW!i3 zy9uHr*9Hu)JTI;H{n@_jRhvlHsnNCl88Z#US5YkW z`zo^KDL(Iqn=RGnj?%+#WQsH3B742NcipTcIyPBt*w~Q18Ei?>q%fV~>&`_I^TcsH zlk>V+2_a?eE7PfG&N?Jc8njYT_+t{pz$Ai!+I>y$$NMY98kZN570LYH4zTpmbD100F>#}98&=@? zMCe9-tNjTgj+6k5aNE)+vceG0)SUDXja!;v2jaU9EqiqL_%0orY}g?&40vdV=^ zF3ftr%RHj5#GI?az35KKpbl$4@eO#bk0b(VYDCzgYrWAzzqj)+bW69mm%M6jT5ASG z@Ak0tU#a=3RAA_vTq`5Y7OMbXiv<%~7G?f3E z9ZyO3M{@%hlt+ETSdN;{P;K>q?}rnfZ*N#ogtO63{DZclfk6tAH8Ere_g=@@)o=q5K zO;S`;ehkX@&!4Qd9Pf!F;(J3glJ<8E;-tVDW17o5ifUiSyv| zzd3`fqd<=KZRfw2P+`U_$8~9`WJ6*Z#2d*0C`&gU_N(%r17u+UX;3*#O_)P09^wa8 zfSvX`uX0-db3lp$>;mZw&I_%7?%JOKw%Y#(kKsozy~Ql-u79&TCUGlB-re(C$P++WU7N_<>i z0&a8{pot<7C^LCopyt5jgUf6(U8ba$ekahWK%z{r+zCJ_6#yAVK0a817#@Q9{AaQn zh=sy0VbymPfx#xh!lMwX0`RhY)T~?}gQw>_2I4wK$XlVSd8ZDN;GXl{NrqEY%E9}~ ztEF%p2KJ(6o4LB&4>f&5U?&_ecjR6$zHM$WAIZ|O`Kp$-`t7Kd&q&w8K|I_SK}-tZ zxR+=Y)M3Iw7!6m8o^PNu&n)1sA;B_I-}B(>)sj#4(jyA@e0#}x>+NAl~pMi*2yl#hgyCoIn)kOUOsK%fH5vQ0vJ=uZ&7eCm1C=qGCG6bgFUvi z3-UCj&>kGrh!YnuA+&=J0wzvDgPwaSp%wceqn>rFvUETCZzoDb}(u_Z&vmH40 zd?a9g3pSviC^|?!5sHC7bfopItOoPdpH3cBL4iw2;&m*7N5_E|Lh&fP(Vtir_inP- zRA&y%s+ssSU!slH1|Bps4;7Rkx90T9nI!jDCSs{gM0 ztx*^6hu_Y>?YDD7(k2loVP;2jUjwgBDKPa;u-e^)q z3@)#)c1waK)%_sJjp6hYEfp$)MAmv&-vaX&ku0s(-@VJf9n__Bcfu1{8=R1OU*~dX z9S7mPwUCDFR=8`tHDrCRZzgRYOTw-3?}E)D_$Re&r+UQKf0{)*ZB`e-4S->owLxET+bv~{r zq=i=5bnFWE!S|p~;JE$C$=Y|>1?8Epy`jzT;PF0{giASdFw@!kf$fU5gEA8oRR)5d zg;pgY`5>+-7j%_k@0?*R;o-=oXnc6>3PlU_yW=Dj6qUy@Nm2omK6GUVi7mY_q@kmT zuGh==S4)cP5=1r=AXw`cGegXi7I^f&*kVzPzD(bEQFkBDG7&9_m70eUp~o(ngnG&& z7i&7fn{_O0M{%z*NHl*T^up*QwSnhkQYl0BZY0b5_fDT?^L6Z?(x*yviFL38BSRAH z+U_swKvsLRnNGQn9N}OQAl$D_@|*sp&gRlaP5>$)t3f-ij&-buj&X04z{P~hkCBgL zCnjoKXX*(c4!6q+`$r@Pf2UCDfX%@6Cwgh8;qEa9|B0cz0cOOKYK5PvWm&mIy-dfh z1uSoCDZ$DYraJ!t;9exqJT}&XNu5ZETn=Wx9s#5Zz>U4-XV$15J-3C8@2{FTw-~^7 zs9$hpUWA)(N{k?9^}0UJ>C|86cc!h~8%h=Yhzdd0#%;Vahpo4^A6f&txZM57F>Ie4Pb1IX0S8Ut^aD8;ed~VP-TUrDQUfP=kgWUX z#^-d~)9-EsUeI#Mr~q*eI)?SeR_*9{c{tO7<_+m^e9uN*PWU&e)Sq-2aar%}jYLSux6gJAnbs+a^L)Yo^MH)Rrz z)9B@I;J&ehy*m6Szpl)WHc~LtX+d+j*Tmt>n%A6$>DoKk7_xRX5fP>6zu%$8Ew;wO zrD55bq}A-%2kH3dr%ji$RuxUV^cuMkhCrMQUsUiP$ymf3nXYScDevO&%jJ_vV&9t4ggG@(0IzPK0NX4G5b-5Q7`vZd%% 
z?vnf*jyD_0x9r=>0C0?gg~Lv(+{N=_M6k!qZQ;`jZohs~1b5|-&^s{yd#~cCEc64n z>yF8DyW;OW=`dwdy1X>WvHh7K*T{;jAgbZyei)g3u6~r?3G_y_C232ir z5JH$2QQFmULdbQXT##ha4DL-a*A(y^58s^??R8F84NMmafZv#j`GeEt z0EQdhR#k%iiDB>Qk&#-)&0TKCQ`$?k_FdrpxxFFGFR(F^bZmDb$?y6ZDm3M;4i_@t z^(Ge*Gjq?|zb5+ZX6z*R#O*Va!iCXS@()cm(;&)L#LJvn%9;WP@-I%OhAAOimQ(h! zm3nbGZ5&JFuG4iqwF92gNBbEGH=&@;f80=FPLqtF8XRuhKr zupy7rS5fWAXf$3?(9-4Oa3=!I{U<#lwCZe8n z@#wt2Rl19vUh#R^OBYQ1eZD|8`Gw@Qo1QhY1kp-a68Ri;FR$aiudGoUo6jM8TsuH9 z1H6QVRF1_MQERqs_pZ*Wj6WS<#;#A+X2RhFre;f-ntL2fxQi12xMnoVpnc{zVN{vG9gOAiFt_}8 z?*phWPnEo9PtopAxOP$U434>Wt>Zn0Jl9%Q02WkvLfjnN%E}&owGfTHEsz}=5Tomz z&u!;}uA367UZ$P8#3TEibm1cbhb{)5WId3 zr+^$SygHP@z@?lw>~0>=utdSye@$VuJ5`26KzuxR1I0gK4VL|Ix{A_PWB$doclXb@ z8dEbI#5ZQEU17x9VkBbE)a{j2I4ys6;e>IuD^O=|a4NXx*@%7(6Xrqx(sk5K+*8CB z4$?^jwjk4p?g+d*{AC2AiYb#sz16L-gaC5d67meuk7WLFzR7HbnB~=s1fzSEyM9P~ zxRQii)+y4l>zf@?a2zJYy}8cN*m2o&PV3Jnz6T*`%^7uv5Zv5sW&dbASn}z6x!6r< z7i-bE{Y8^i^gLf%{a%wPyv}=)28J4vtN&1}f`0)+fzAR$+&^8RFZ4T5=^G$odP|&k zBN4i?e3QbBgv57qK%}$8sFeS;Uz#+btc_JNh8+g{KZTz<2ZTd_g1+oD_K^3LSA?Me zfa(cwSUkci7?4D*FAa)D5})WfNP-f&TpR)0Qe7?z2U0$D9eFP1CZs>fA3p4HTnHbj z_V?TqwXA$%9Np zQ!$#C_FqPMnVO6Uj|BySupt8j(pet7V`-seCf46v6&m9hWS*-9x&P`Pp~9sg^9;K< z|KbFBlRuxgAk%DqiTzKm3Su26c%iB+5*LMkM*S}&^6w>8G=P_w8qXUB`vHUk@Nq{3 z9smMX$Y7{t4KDX*knot1QSBny9{o?kF~Fd-&>q8)Tg_iID2~x!07tD;ZG!klKJDd< zMPB4Xta>h91TIq=kJIP9?;x#tTc-UL5dFzM+;i47^lC=umJ@lB(IA6ba+!o<2F$Eg zMaz^PJ$M{+Cm6Q(rCk{bO|&pLq6Rmo8^rcbAa?5P0Z;pGuFfXtoT_%L9&8iw0vD<7Ayr!$I!9mE{>_dk-S5Gf6<)QxbPywS1sD#B-*}b1$3&A6PW4V0U(vy%*pX zG$8)x)qKFhtO`l)8qQ$2pN~l@@@ypNd*1}bR)B==m#)(g2(7OGxMgNgE5=qN22*mW zz<02IFbBN*0R2zEVTTd(IGO|@F)y~#L6E;L@DDN&{S2%SU{h$mzO-As1JpGL=@Daq zeW=yYbqI2~z4)cCX!JMITbGp8iK<&omCj&D5_iG^jdr^ppY6ttfayV_;mF@E|p`w^u zVbnW1siB`adcny}3~>RXh!#aJup=nM$xdM3dYy>JYg<#{-{E>lS-dX?FpyBsJky1B z9I-Cy2KBO;=BqVOzMejJMvt2ZPx-b)or&EC&&pIW?)m7TjKsDN)oZ>GMw=ytKY){67mUBfhsqhJ@s3<*&-T?> z0sQxuCnvq6SffC>7N&z}$uc!XRiW)d2nO-D-9Vzczxb81jXMjCTy}Tc=JQ30{sUi)u=!UneRw-)j84nUqm&6zN3i5i*q zc6&Z)SUniceZr<$nVHYn!XFWGwlpgZpl{!Q zR>fS5!hgmZK|oN&Sj#ei1jOSh@hYI%QQ-{Fr5cSfjN^YYz9D06jnXB4@d zt)AOZQ;_TgJu^GLeG#+JW|M-?nsNXfkB{$oEaPG#ikZqsFw2-Awn(p*7L8EEK*1oe z?VrpTMKEnMWUT^YVEbiR2&d(9f1zi>x~#0{4HBif8ONpx0}&(LVg{l#`B?C#VGzFC zLFF?lZ2*+*XlWi}a^U4PNS2TySql09&yt$?EZ{wmu@*+z`Jc5v$6h|olhgw0SSd;J zZt(^DAkY=cOS#nyf$Lg_P_FZhDkH}7IpE5vHb754s&cVFZ#;5+(>tOcb_(>+F>9W&wekr{JtL{ae_6Nga1E|+J3On!ALnh z7Qx)K4fZ)oPx9MAfN-sq8sZ)Doc#Mdk6J=%Uf~^~R8pOTL{IDEz(@ho+I@HW5|OL4 z6jNQ{Y>GYphhGMs{7=89Bq`!9 zJf<7Z+W<#tZ@?~i@mizQv6ypQR0`+SMXxCmT;(+npetIaV>Jemx68jBI;P%)yvv4NReg%9jso_^yVDRg%PG9g;~j@+^Y*hE|{aqF)K6Ag<4eWMy$;oI)nye>Khf6mtQr9CimLoaK|R&m=YTa^9~+ za)SZ^ukp`7ykC{Dzpj-T>`bIXPrzb8{DLNK>%c?Rxl zxWbvbz)`a~;)xP2n$sDeN!u{yqAr3l%KucgMYv@uHZYZcK zB}TTr*y5>KzP*Mn7DEJ5gt=n15@lr&Lb~d1t;d*hw zae#?qANKSa180KiBgYhfBoVjkC%ItG^&}Lv@c`g{LB6Bn)fYqFOY9_Pd+{X!Oh6rE zL&(bAK%n&UBZ4kiN}yj;`dX)L-a&Uj7@#DO9DLd+ngx-(Kgc&#_E)){j*sP}h<209p6k+wD-XBG4V|z3Kp`^`&XIxl zND%*n%aD5!fd7X>lBtu_ z67m0xOAOrD4~&fP|6~N%*&ty?wo*(kCej< z;&C@evfC`iJ^?t&4Juk3)_NWvuG)ZP@mz%Hjq1sadG?sVyHBz}JM1g#2N=b1b1*dl zustpZR5|mzVLnBcxp$XbD!VvtCpxAJlq~dOIM8WpUHW~VfK~DhFSnj?wXZo zGI@TwG2nB1YX{N+Eu2BxGXPJ0+M%E3d+fP0xT#|3HlI8L%PX}Ox5@U23gZht3`AA~ zp3c$}2u9W*V*x0*641zxAbkLooCl~y3f+>uUyoNT*Qi(&yxmt})B9LEiNZC#3~c*o zyl@0ir;tD?sqN-&O1ysuWSf{^Q7Zu1&_=K=_ZuA6c!8oDDp3X`65dE4I1H;5tBUAz z8uuVSMAmYu(ZCcyC^Nc|@}4AJ7?7D7X+}`R_XC_=L3QsunDL{QtGCv=+w*`FQZ>|L6z=4FH}OKa3`d`OK>16r z+&jyOFBrU8kjjguuOQ=UmC^%NABZ{Xx~;?*j_nst6(jSmO7NiNY7w`c3U^Xg+*Q@a zTzDBwOm!mJzB@qR-8mC+BG3Cl4u32J9^s)^lpI3}7TO%G3Z!c`fS3&1cA9ZLjTU-n 
z03@Z&_5S(dyOOeg+#q;nm^T=s%5PMu)S?mY0@?vmegF^v!{w(mZIHJKi&izP&Qo>+ z_c=gZ`Cuz}vKe(D$o5WC+w@GbBD8EI>5TSQiU-~I>~P@mxZGWzOz*)Gflz1#vR?m_ zT(&;@``fE6eI*vnN~lEmIAWrYI$z%b(ZTbp?E-JDDjQ{$?afYMwgpDOJOCdrvOmv2 zb<1-)Z;wU`{npGTvhh6G zRP_$rB?qIB(y+aBZdR~U<#+>V)^J#f<$65au^2To#w^oB3B^a_7SuyFUnG3>O07NXX*r; zx#Z8Mq+Y&H`2mE}cbQZxC&Sf9(P@;0gYe+Bs@QkZASE)AfubG)snu|+#qZvS6_DZO z@4Q%=xh%8uAo*}U_A)ghyGy}f@!KIwBm(9vcP(CX;*7R9{cX4+-$5-1c>c)(t+Uet zB-*fVM($_cT2!xSc8kA%VkS$ZtynDO%E`e8(qvA{_Uf5Sm+uWJvTF8;Cm@%i`N>p` z#N#k3i50vDK#|b-=<5?xJnc^EWL0H_?2a3GcZ5JQ|g;l|!gc-wnQEQU_TRev!QLNlSq_=(7R{+L6vr>avU>dS9SK0;G z-HY-^vYm>@QA`RPPB?qhf?PO3720p)fsfIxU`zLF;3NZ5f!ZjNWihvp%2#fgA#3wz zEn}a6I!B%4wN=%f!j9>)k1rFiIc9 zZCpFWGDI6h`f9BEOhCqyQ1E%IVY%K76kCwb0d^h2aqBqb8!}9nXt4bH;%0RPv}cH= ze4_$OgAV&}rR(I@2#G>$RK8^&JqkKY{jMYl+nT047{zT{E~Wsqs5T31`?}ho#em8^ z)mKVB$Qbo%HPj$4#h2J^-l5g~EXOW3lv5q?6#6OEjicjE;*Eb1GdvR#g+C&-7)AB6 zFq-uDb)0j88s6Jmp*!=9HHz;}%w$PF>m~5cTQ6b+;fe(Jyg{LrBX89ODHu&F znX0rGVs}B`9U`b+^$9K(fqf?H8?E8xHwupnKP_j{x)V%cfy(Yxf#-@GI~8@NH6BXV z0Fezu5?|++-7uWZROje+G{dV{Wjy{~Jb7D|B8~ab#Qsg95kTZ`*Qbte6DLPfWf!TE zevHPVzos_|7k^x)vn0=^CAHLM@TP z8T%4K8RgJJaA0-T%Z?Zgf%rOS5G-e;WLEhTFAZzlY_;nB$!DPA#%toBJN1!=yyC;R z3r6mI5tE_Zi?b4nXdQTSR$QS=58Z{A;I*IsdfLb%0zZu9t>80BQ4kBM@YTW)umhna zg~5K=P>rlX$xi}A33h%eta>)%i7=&BkgZZ85cFyo zVI5Cyv}T#m-MNiEZ% z@(EBZq@KzXhin~C|9(`cpcuV#PE{JC$LY$^mxn!jW?qSds4%sN@2HvMySl0x0m$N& zgX7D|@dAV=i`B{qxGrGl)>03NU0|t+j7Gm!>IW|rHykdHk(P?EML{vpTpJokyrO_h zA~3sw7T}60WDol)j{EMIAhdz^19VqTETv>M`3x+E*OqZi#3TlTI-p={SZc0*-V}CV z%!AaaMH=qex3XaXyY(!vq+DLLRpl?Y;AvCMNqsSzjzD2a|2gE78OEqms9^fJx=j8? zyd5+UM4*K-i<1$;*s?^=ecLO;>#eZ7{C+&zt3`h^MzO58prf?%61GyeM~`TGPyjIv ze~M1@j88|KzdOhbA@V$og?K&y4l_aoP{+~*oxH$99obSJDy21}%(DHz*Z~B}ugxW3m7{c}w^#Lf8b>e&5gqZiy5GYWySu zOYQFt83mh<9-v%Emyd%*-ibP?%gnMc&r#6zgR%2B4|&pMS$Q7% z*=BJgpEX>0|CvwW4*s&oF;BfV9iuQ0cXV}jelh6P$-#s&2Ee0yr$`Y{avk;+OOk7# zQ$&R%2vG9kVg{42^HnX@rN{K^GiCjl@EZ^$JE}c8I=#d31%=mn8$FG-Q|cqf_52o7 zR89OSP6_f}m<|{An#t=Vlmh|SWv?*WGy~Z+<((4N&4Ys zniE%-AeiyZI?}Rsz^)o@-On}!GmOB55WKCVGfuo^VO`!BNxXn}7iqe}Eukf52O=e& zU2tFLbapWBa(%FrqvU$NCE*#;)>yW6Gt!V*w@zTLp@V45D@zETUM=g|!ww?>qNWNeBPc=1tG^%OH6#4?u@% z!W&(O^11g*W=ed;NBuS!sx^;ho!#>)7t*xfS1b!!CEH&dYcq#(<+td!)Kv31HwWQyer& zA%oSZt!kQ@Y`r%V2I0(_E8WbAWQk_YKx+KOmh1xp`s?I?XuUlg&sZ9T45^Q2U-qos z^sJ>-Om)Uw~}uJuGR{qQJp;&DPq>4ULP>&tBjhfojnCr(3kA5S3} zzJhN7>7pY3+v5*y0PyqVEeu-q@zAn}XA~NG`w^+gp*xn-*1?^ejC?|#B4VsQwv|v? zL%SU-HE<=^*^Sze?w&uNzF3g@|D_H7(i*}ifNP)=bovUE z+`pVf3?;br|Bq;n|5qo5k;@NgH4x#g+g9!1FW-Xg3+4z$SS^u_lGS0g3owthwhO-# zHQt74{hE0GXCnRf5dcFImF!3xD-5bib}m5`DJt(aSs06ShRENlVo0GqY7j)g{h2Qf z9P$^HLR3oU)d<{Z8P`@$R^SZcXp}5L(Id5hFp2`~gODE})gj$lAkDuAObp{YpuD#2 z7MCb>hQ|IOkA#)MO{M$gin7Phg}pqM{hp%hgIZzPk~gceFHO`UASYo5foK0@piap~ z>3hv?F-q2z4C*%fn`lm`%~bLaAw-t&EAV71y4X6Xut`ih!Ph?v6-mQbHdq%3^T01H zA+?%>XJ6i|D=dR20dh*`bd|9ljyWhF&U9PcVaEAuNTF!Z2lT-DzR-5lR2p@MlfMpr zX)`+^%><@{2q2pLB%_hQS%^LReMD6K@v4wLTzCil8qh+ZOt}Fg? zAP5(5NJWZ4%vJ_spvoC)k#wBUVz{-8K$gqmv`X5D>x)ZEQU)i8kwH2YyKH49a$zup zO$hkiS^2`uMzU<=rA+A1->Lm+f(!!MM}fVynv*WV9nh_4szJGJ?{BJDYG$WA??01} z1Y2A3Le`)^OBiM-ojaS$Ce1~cLKcQrNe`mk`#oaB^lbPrV$-X^0P)?#B9sxRIlSvJ zuVq|P2$LZ*od79S*kB#l5R?#00^FM2)}Q$oeJotax)iy+gQMdbz zvp8k`*@iC=!QPcIBIHE)_W*?U{O>dVdp`c|E?^&OGDg@QD64>Seg4v7l)eN~E?+-w z6rpCO63k9}40s#BqFRGCt^J<`vUM*&dnebsi7#NYzgzhp1n9l#yQ8*EVA_BMnJa#l z-~*~T6oDEK7{FREQnveQ7~DJR(|1U)xP%kLvvWYUXL|WIR+@^{Y!Kf&1(Za{gB}3? 
z`$w>SfN*WR3Y3dRkZr?qbn$Q2j0mnBNYFyY1DZ(CC6}$8G$M1!P?mrvXqwEe*@)By z+^-(k;n{$t`OHnH<_^jXih-zFtlFJS;s*$&<9Vl%dqBJmgSz@s*kr95v*&<_2J)kN z4B`ozz_}7@tEiCz$S^wso{qnKR7hpt0Q^RIjIGI{g-C}t@Qr}rdv<`V7zXIk5E?p6 z7zXvyWx6IH)ByJI0}$oInXm-OL1kU|vM5>wXki=mKCOYM!=RtS`DeS|lhF5}vVmRo0tl2(D_6`6wHk&`XDSeY?fL@(=;|sE&g(s-5 zmH}N|PvJEQ>7j(=zM%7my&@lh2ao0L9pLZ21^>P}^4TpazQxANY zcOI7qbRQc^8KCw6RqTnOJ(!D_KjQOvy9AIBptvrEU#)!vR4;dkjL}<@c3)_kucM0I z%gr^Qlc~rc8xZiF9Q62hY#`gZ2TYxE0WQT`|AEIZS&sd50M;!&mdi2(_UX2Hf9Rjg!1o4s%p}GkzGQdb5HMgT@s|H$hRsn>>lv!N?-mmC-J(m+lu77V3A$ z4~Nj0I6gEFn0_zrdWh`0)a~hDrscV2Oh5o!oZ$h^Q>oM47#kJS9njIB%(>sdYu*S# zptlZq#e*%v<4O27jK(Do0iYvPrPN>2T(+FC60Tp>@R!h&OK|92O2^O z=$3$@^Rh^Wo*D)ei3lz+%wvK^Eo1ey?#HJ&tR=se=6kC)H3V=m_5zTk_>H&pj@sXwE-|qtGrr!i* z?ror*ar-A|ztk>PM#?j${tY-dhkiX0BVe|@gjS^(C5;HK68^F=(x2IyHGqJUY-zi} zUf(Hh)G?|&wpP%v8I1Z|CJ$6^(P@v*a3wHS z&O|Xc-axu}2sq5T+8!?>K4%x)--k`v1F^-6-Owr;2lkTrBw0Hpp}`_|&4;ukgd!%q z9W-V)S&UIA-)MS(O67$#hvK`%>mTv7a~nY0-HMF1Nx5fNXb(T*5O6Rs=3B#X{h0U_ zR6F=R1D()sz*a}FifX1er;ASEbUU2y+ly8V+71IfM8@9~SMMS&G2{PMzO9S;a(XiK zNFb6xXY#oXH{)`=TVn`$K9QPSJe^@Jkn?1F1gJ%NRRYrE?=d?*6DS~3LCUlkF{`pe z;W%9jQpq(6WD-|-US8z^rKLuz=L0B{S(_P=dcToh`|9ZUG)h%g9Aw{*0Z?dZAgYGE zXsz^E)tm*kFR~7*hY$^{qu<3IIDz!ufqKT>pB9sO#z>{*y$jwEjBFBPBVgbgy5yzP z1UlXE45VRdUJZutKpo0m&O0gYYZ_nYgA%x*`E3LPJ&~=M{LM%?oVqKk7hhDxL(Q=$ zJ+B=o*g-?c=c1h@NYRteKoZk)?Z!B!cd*Xpcur8AjjIg)!3U)IL<`Ux68x~X>> z4dyvV1Ov2Lw=$2 zx^_F#&5(*@p#4)o#5w#l$q#?4ZQHcVgSi^$-gF*RLov3uTwE`1lUY*acza|APV<`j zsMKyUhMCQGs&*f-|I#g5DtIH#(h?>rtc>cySPSIm&>`SzbqFQJRP>}TXw!Wf32 zGNb!n)+{qU03TZ5&>14U#1NZ_i;D zl>?oax669(mf7f%bsfXF1qD|!I%d%aYeLhpotgSTL08LHjo>IHdl7W!Q|7oAbMlRo zJ5PuNDt#p~r4^MCnZb-5SJeY0jGZ{)DL;LrE9h2#N&QyC+H!M&p#jntcX-n6J~r7n z7uF73Dd}%x5rKgDWye#h73hdI>qB8Y0&{0IK*P03l$&`M z0M_*zXGk749%(j|LdOoIFml+K_d|uFciSg+Hki7nR*A1^IdIwK@1w2WiLaX7AYfwvav4}i8LDRAvcXKxLi}w~5u({H%IM=H+O$p0! zIOP`XlGF_ig(R5VJ!fOi%x&7BEd2SP*nYHr&INjU|6|YnnhzuMhcce9tPv6K@H@lw z)gi3LW&h22-Y}R{I5Rt)KrPSeviHp-MqidrFSv%NX`kS~&nG5gK`Zr<>si9nMDk!T zq?;w^G?G}8uV!7U?g8BnfI?emU)LR-j7bm%-sd?kQ*hKyd#{Lnb1BTU94IDq>QKdt zPh4KLrWw3xDKSa(Gr5<8G<&}8ZRMdwV(+;|VlK%*7+8kI8-M0R$!21wMTK9OVFuTq zE_S`07tJNu$1TOvs#=3b^;+pMZSHJF!-pyvQ^uG7yrJHnZf}i`zkaE&`y7u&8@uKj ziVV1s*X`voUo{^ncK8}9ztk7cYlK#8f`UnHIZIFVMJSSQy4l9d&8=?t%9mT1UKpZT-C(u`=J3}!VJjY@t z2A&FEth@^KjC*6;$eIQBPrOziytC}h z>8yeN9E;6HYa$N5rDze?6&70Dky&$=e_?wcD^*s3P^PV*4=0SO9hVh1YgF36&{utb zGu>`8GvuTnlep~lsd8iq>?+%U)2#fG{XF5PtB&qwV|G)X z8K-rIxEoW4tdZ2KLIfV_1~Fs;X}CS{`9kH2vt29)BJql-!p7SboN@<2uQEq#_KZz3 zHK)ZTpJM(3>}dID74{hk-_bEcqQKm2CT}NFM+KK*xYv6uKcw80KDpsZV@UK%5X~P~ zPVs|e1ayalVQ01jz>Dy*$3sX>? 
zxN%Cw;M>(GBBQP<%}&Si*Xzrx(;fiQ%1GZArpTjqCq9{ zS2d-fI`oaZ(<_$7-w!Z}K8#0?c%kKwsK^wxuVC0wp4!GCN5!gQ>F%edICU5d6%ei| zq(n^B=+ZQ}ORp^lmc3@=&C95ylX%hFgT$b~Usn~lzVJ~#Q~D?$nLzt=bixv zg-enA?Kf3qPJh{-%$$wT=0@8e>>G)qM??jA;`o|YuXO!o&j&LLkkW*?+zez0J`k~$ zgT{ZYN`!+=(Eo@_cvLrNutVIWfUR9DM=_R{gWgjQyS91sv1S!K?acB!-UQr( z!k23IC3&L33@2m;vFo@-5b0;+A|hEs^ptRh{p=lfxFRs(6S5H>#+$8#MHt;HR)~UI zP;JdoJes|y{lzf}I|VoZi@(R8&Ncr+-D52lt1NwkWj=E2cYF@$S{4`5BIYwntd?3A zjQ(UtGeEdN;2pHMXF_W=$~Uzi@L}DpPhr!5pN)TKFfKUz1xdE(S%Oo3%@5A_P$g_; z#G>w-pF@oX7FY#%T0kWwO1O&eM2Syk@YEV2>lDXjZhs0-(-bOWJHk;NQQruxPzn8$ zw9J`#w(<*$7LPcq<<(3|{cF(i&}r=}CD>!HysNwr84|^G4)3`hD;bPNdU(pP3`jhO z7oQNxa6thkmO369S-mZ$_ATf*K6dRUJ;Z*^A%@@B0Z(*{qY%WO`C2^u%d8_Ve#b8; zA%i-F{C5cd2YYWFRn;2Bivohu(uhH~bV*2ugrIavmw-r@AR!@*G)Si)-5^rZ9nv6O zBB-Q*z?=K%Iq$vu?;GQed*8TYoH5Q2*lVw~_S)Y!=WqT(BPG+a;Ks(rSdX!vB5z9f z%8P?dBWH(3q-%U=+Yc5U^Pep5j+u2M7T9Pc-H4wS-(TQ7P;pF)t2@4?q~N7)lpTn~ z>A3!oG~Ft5% z&NXJ-nh#M=xiwTxevEYe1J~uJ+x_%Rn=K5n*tU1=uaPJF|6XFz(TY#I@9tp97urtJ zN@iN#jCAgu-2aAFVC9x4GwR*dlg&eGxvLvVoZ(%ShQf(tHy;h^{?vGV-=gwwnWeS>(j8Y%+Zbjez;foJJdvO67?RzF~=KEMj;$;4-0$S z{X2K;4*jp!(7y(R4reCwEUT@{>2;DJXs%Bn&#TIRvsZh^(b- zGZCeuGd5vw4%x)pFZ=O+R9MsF&~=6CB-yf04X ztWvr^ZO6nZkW)7r*q8Y5Ebc0E**U#9eF2U-C1tu37i-uV2PgZF))eNeyZf$uGV&N~ z5V7v}%-9$DnX2_>_WBjp+A;2>nLu`%+vP%NPl5OKbg}{8Ly$)!q$CERXb8eytO%Y& zns^4Id=w$v{-ZT07!}cSei^DBn#TRq{%BBuLp11*UVgpioqp76qPS}!?g7%8wcv(d zlZV3xQ`g{Qe(Q;{Ik^}0>UF&;(|*p^Y<_;UpLv>J&QK2~4vb8`a@c2HDg0k*pD*wf z3$+G(y8UBg^`bG~R@+{vJo;3Ch>$_B!gKNppswbU(Oz^z4`} z&IwV|=sPjkz6D6F1q%?dQMlU~C9O^jOfv!+ ze$p%{b+a9^Fra&**MLhpV9*Ft3pCPQ?qdrDj@@X6JC8o9f5A&(*$KfjJnmu+T}^MD zd{?^h2A$1mw8!rYGQs+x&j;4~-nyRe7A5S{w)6v-lTzygiB}hRD4Lo$Nv_%0s>bY` z#cBMKIj;A#GoLgRFI314?X5SomE4avR=_2>Zo7+4dGLZ;ihPiSrB~VDdU0(d_b}0@ zutmx&7(wxTboL8%E%Sf196_kpC{P}~jF+E&a66OZV76nakg=@fO479ouV$SAXpNWJ3G+S5-(H)J_>Q0$Nf&;AiKpE8H%Q%4b_HIDQ%&z{O5e z`*@f3mcH;8zmxmv9}ib4*qS=mE*oE+z2sPP39f%vd|_8=x_gG+ryAB_rxYU2@4(5f zR>QQH<9IFK%n$zq$;6~GR%JlkwXE6gbh+vK`(-bOrbik#%9xhaREo6xR3y?A9}8gG zk=ULbW?es)dzz#c@yfgCXF*T`PoT~3Oq(AUZK~=jzJ{vKOnYu=)}Ec250=)S&QvkQ zY!aHORKupOMr?j#$PX(t(EM>5%@mojg;pc_H07uBKq3Ar_IFo8S0ZuA`p9XHXqS#( zAAGvk@TfH2{Fm7tckt!;D|EkHN_n#;yr>86`ewgkOdYf<(m9M!uCqNf`qA>1r8f7Q z&AlPr?qg{O(bVRuxPzM&<8@L(`XY@FjT!yI6{>_I-JhY{`h>B}x8nR}s_o3A zElM~qYUb>iM^t2_1h^m}ngds?-5msl~dg6&iwu zj&6e_*YV)Tnei*^bHi<^j$;{Nt!LH$yt~?>aHuNTul%n_)~N;2n8Xy1##K3Ati^|n z(nNS>PnH>d>bDnqKcm%8}5IiYMsd}Q{eV$opb5^vzUzZ;N5i0X6d_M zS(B_5N9&3L=N20~bKg9nu8jxU!#VGb<)skJYfev-HB>ZEQTgubNq67(IIyfK9mBhp zzd~4#&RQ6(P9QrxBy~K*Y*9H(#*8?U-i&%|$ivo^A^sI?9=55~nVJ^AT1`G2bep-n zo5I}J+?GDp0-@V}TX*`10cu*fEiV%CZq}hDsF@P6|2yi$6GE9$C;Gso9E2Dw{qvs+ zh?}BA!j}J=yBZ@*Ex+YS-2vYEA8=k)8h*me6#_E7zq2reGiLc)0%zI32a%#AhMyF4 zGwT2MgMWusRDWNC=c@)|tQ_iWxBq@H1#eC8e@^EA+?l@*<9~zB|DQiI8S;gw9id`_ zs57pwwWEuSO!=tbr%Pl@)SEiI=kKuMpN#d~En_YU=+fC<4Ho zHn+@E{lkCC41f4WN};PFtd$TmVwk4K!!WvSPkraV7vUpByi^a>>Hb|p9&5qi7PC*Y z8VDzUUIg#tr{K?_9{1mIjSv(I7K8DBeR8Sss9<^cpL8-A^8U^R%=oY45lW>!X!l?K zI9>qAbsorrP;%LW5LD0WkRNjn)CPl~5{4Bu3{Ywg9o&=8M<9}W2xelZ2U`ez0{Hw? 
z!JM!fL=-TuJNc|#SMk&I#^o7@cBNk3*#q7LEMJgv_`buKiewnlc@Rnd1T_1L-|b++ zA|2gRxhLzfxr1PYnL6X%<{!S5?#Lfajtic50LWeE`ZiU*f-rY-TqrW66+SqkO2dZP z2EBFsNA8j`^i>(d>ba0L4J#O9HlLr@0^?xL>T!V-UiW333yhN*CZ7S7;w^AB%8VpQ zE5TevDTSYtg6QOUc=i-T#O~04>7;Vv>j7)O6k&IVuM2!+7)SyUT354Bt%o|eN7aSN zJ@BD@|MySr0~N=bAg5S;nDNf94|K{V*FSKYbgs^s5QmBS-Zyy4QujlrTHfR6+X)aF zW}^@TC6KP@{6p#rA-`nhTr=QByf?VERo)yF zIkPb^Y4j<#FIQ^{^eRX)SfBochD8EOc3!9Jyn$mzTB-JUdU#tk)8@BMUvUm29Bw}$ zthZR66xBAKc)}~<`6T(qYRB_YqvQym7eJeMi$YiR*AH^L+xvZbS4*)NjDn|A_ zRZL2zyYV2$#jz2!xq#DGVkQ)Kg5i>^#rF?5iRBQYc^HY_5uS2g@PY}-n4u;NM}2Pc zTCwO@;O6v+mL|2yf1S z7dD`K38qjZIM7-2js=$uSLcLP~WVA1C&+EXN&P8q=6Vy+s!A3VsmOxw`A7RvX zdj=FEFNT)Ohj0cmx_>PMC7aFXnki7``C*M-)url;qD2X9CMvYd{7?KSsNBR!q5ycyDrNxzKtv;j8#d#O6`Tr z?MF4*t_tR1KiLyHJ8oYctetZRRi1%ydBELRvESb%2OKurnJR9h6Y1Gus4E%f;bA5u z2$s4iG=PNE=KD5SJ~)9;HJ{7u7jPBu#Tos`bG^=vTCLtF*;)EhN)RO9I?_%|`yI;O zWC=pHndFS15;}!=kodH==kbD64uo7ApPVj!pCkqt_i`AwzlJpq=hfB5;9tofU`7|^ z%;Eq8a+!{{S?OGF~tg$-dCdNxJ%Yy~t z{t#t*G7L}N3UZp8A6_HIMb5#i;stub$I^9>;!Ng{XxyD*|0<%7Amqd1?Ahjz!EyzG z&%ajv;s@^1AHBM%x67<)EPwh8c3Rw6Vl64dH&h-%X`Oa@try+*Et*JH5=jqinid%( z(?H_pV<45>ZA^@LX)wlL zf$ni`ujuKk(Oh5>v?Gig^HUr`R6E^56Pd8(=0CU|ck2EEc-wHL2PV9qxWf0J2I_Hv z-8j(DWA1~-U>5ip;qR7uD>TV155~TS*Zi7`S`{h7(S?> zd7=_{P*}x#?Z4}bru}hUJQI&*wO>G!HSN~7otINaLFE_A%c^KUBz(8%u8oyFe8u5A z63o5t_ZUp^wuWVF!UBc6!Htscub6G%4LM$hTIaiMmsKV$f1cM6@R8&@{LGl8CRAAZ$ef zZ`pGAMjD__Y|r&j#{`Nk@4^F~Jgue@jISj;xU!TT6sNOIWi7ZV;Q7&PKceH8#xp&> zZ<=W^ipsaP|9N|6+0&oWqRx%Qe9FQ);l@#e+w+z?$ty)(tZ~8*@;I|3_$WS}etrK% z1=B*d1;evDwjc!Yv8pR|>=(48PJeKHoOA?5LXS;h<^K+5RIx0l+ERuRy`Dz2ZS zev?^FVDcH{^j)~5$PIJotK465=7N&fJk7Pu2<0hv{jjt+E!V_^q%h+R?WnuhsHjg~ zHv;(peNl>SJXz_KNj%hqjBmlyF<~Ew85JC}iJDu}{kp^BK4H$El^>MjU{)FszIr^e zVi+jOCnpTTwf996>gs986TCKy1&t#nq6gq1A9bzvCCEAFY2EgC3j)-sjw;3rR2kr- z3?h(ihs6aqRY*Es#P|M?U~eJ6hy~6j7&liiyLH`h)e`@v5Oswy^tK^q;_%h?4z_C? 
znK*f1KXcF4N;#$OKEGG_FE9}?b(Dm=7*1&OcEV~=50xl zuHLPO`El_}+8INuKu-4~j;UT~`iqoICk(A{IZqDoX0cRnl*I-fq)2Wj5%uH;Wgd2g zPzi^fcqIpo%|DGXNi>>*_Q zs`5tY7Nc$wQfo}DuzRI*;kt0^6-?ax|-@Rk$B*A@`07VO}oF1LdR3v4&ETkE%(WqL> zE^nVvRi<^SMhIw-lv5(}Bui0GeCpLud*#j&7@%+`Ee=zu|KIc4BJ7qIjj=E{#k~?T zAYY@t6u8hM+}K$}e!I$lBIacQI%ivm39WB7sR0f>`5WJBg1uS>Rw4<5SJ*{%_X-#; z%y6EIoFnYwVFG4$hXC3|UDN(yD_DfXHD?Ps}s>p5z@h=2n zq^|Wp=U5{GC?cdmtqQ0}GoK>D2lVd}9?Rl{SAB+sZOHw+>y79emTl7)qnH*H##BGk zFMoG^ZI|@^K`W{zbR@NS)&zu~rWJx>%DWPW}SOdK|Eyy?wk>uDqZ)r8Ul|Nfv z6G}7*Ym~#hhLMrFY{}qB_`xKY<+=q3vv5j@XN<`G%{f^Js3P*^)jc&(jbFgpU77sA zjExtifHqm8-DGANTi#B-FDami%)X)%w#hI#6DUBnTII&rf{@*7*;b<2)XnqlmtGMV zp}f%YoL{~k*cJ6wwB|C0Iik3!FECv%cBe>t*qx6$b7d)o<|rY02~hIG3Bx1KcL4}H ziX!>)XwCqfAQtLne%2A#mNr?{qm8mjdxr6DyUVN)jV6B0c)!Tn7)%6l8L$`s-H$k7gQg)st z(}N7DCF{Dh)@yrx62B8|f-Ljp=_Ef{|1IofeV1cONB#n8EoPpppNJk>O=0>Ezxv!^ z0E(x%rrlv`U{>c4U6XtX{U4v;)jA;9IoPRKUW|tFbQ3#Uo3*J5_xr1y-ZxH6dqiJg zoQOfUhL23)i%F>GdEUJm{b?!cdG`qnR{NeUqqA=(qG|ar|pu(fB%)JV5&o=8IOKB=f^kQK+?@=fFee zIih3vK$}qaU$hN%6t{1d$KXI?LJ_9D|1d(VZP@B37eg-$Y@!?0k8fXR%f}f=(9R z88Zf+5I)8M6FuKVOg4cqDd}ZNvC^vF8xq(Z9Ns7=lX%doUvd@(w&TlomSnXXsR#Xc z`IzF+`%%3s{3j#;fol5tAxP}vW%2I0dHQUQ=r^0tW3hetDHfEuqIjeFF~DCz;O<&~ ztk?&a9j<{wPdLw?^uy3ZxVb+5cvlEvO7;H^6B_+%kMx(|EQH{@+xGGr{tK$I0VL9s zD)cAt1rV^_2*6${-a6p?`xr(eygD5=lo0l4|D0h3g5>`#Pw&dV&hY=)6#c!d41`L9 z=>yZG&)*;XKU|vr=bnZd(+7O(mH-CC4FcIDNz7zLm%sxMOs@H2-5qkeKzv#DzRVvr_>*?A# zC_FUyn6URL3as#a(GePCi9`rf`3oXulmVzk6klrff1}k=aAVPp3Yq@{p4$+@P_3rI zC;z?(5@LY=<6ltt_aITp{P$2%BP2`z zlTJo0xU6JZjTc%hpe6(Eq6v#jk#ctDy7wh8e;NU=+7c=Q142Zw1K5~neFNHfG36zA zXOzQ^xz)#9E%8qp3nI4Yn)f9lJP9nGX2ACW2v$SmnvtO`#pE(COaLSk0Q-31l`w8x zP-9wJM-=iMaxh01J^*v`mb>?vyb);A_C&E{kOvTh^C_ekn!N5!8NM2>a9r<~GtelQ zCg2#Hj^A6O5cY5aY|T%#1~y*mo?l!RFAB&z{WD#cf4wVnC*!xvxF@n`BsF!a~Nu|?Xl_~fFWxV-g33aqZQ=70D(MuzXU*& z7eGS#?rU*s#JG0^KaXrpmM#Edz_J0xEf(P6@AhNypQNB^<4y|4e$I&@fYyjA=OwDl zuJqKKlVD9eaeeYW2WHO*J+b;Bc*al;Xd$4rmM3o<3BOXIGRJ4S?rMD$21dD!%CCEY zH%SkNg$zKk+d%+E0dI%O4gqBROm21G>Wvx~i`n#C!YO#v#HqJZTf{Fe`?zjT4CKMG zJ>>F+M2tNkJTw=`Pam5;_p=uV_V1rL6b2#n#4G_ZzXas+d5MP$061$uI|2-k^pN9H z`{>*b7V$TDvVn^ZbM{@+gwLkRjI?oHfKR6ep0J_@T4=hqUnu7Ts^%X0w;a}5D_O=E zN~kluZ&j<e0=!CeYWk5CPI>0+gO9bsqOU z)3bAgw>3%kkA)O0bFJnIX(b;fM8Lp1gpm;-W{R&4DePL1ztNlYJcq{^AOYQ;EA!ubiI7Vi+B1T8$x>!e9WR= z(@9CX-}XU==UJJO5&DSRvo71^$R|uvV5akM5W^p$F*z{s8WP_ zLb9hyJj5q5^u>HuFLU1{$yyQQF_h}Mhuf32v|OYHcFNaP2SWdu-({eqhS1hwlX0y= zfQBTBon#vm*6!jDT9+O$2!S=i#@$)235g!fQ3~8!7mZx|@se95BCmzyBd7n1-{%{lp1buMc5k=C ze=mo!Kh}U?r9@f5>5?ZOU`4o@8RAM@JZ4d|?g4CS82oR6=Ea06TAejk4$Qmqh&ppv zzGOd}0fbPi?KtFd>(XPATe#1w*}`HH0H5o7LeQNL7OTd@d{l94t)l8(dIS$u)|m7f zJ+%013jYaA(U8kn^y>rrCu0>X_9sNwK`v6XO(=8LjpMdUanE*69Z&;6L_Xw%V9tSm z7=5)NAL64tP1&g+bp^z#phz4sb%@c?T5*FlavmSm(i&Ivw;g7MzZ0xa2nQVHaJKX> zGUGwr+pWzJ<4K*R%?b`lG~Z>CF`s0Y+-mJ->oLP!XVMF^{_IRM6%W6AGf)~G$326540eS6vHXV4u^elA_Je6yGQ6a} zMbq7i8JcPjr4ACWx0yn#l=aV~_+*+?@Zxw^$$t4%&YUFC=K5!#k8Z6kel1Yx5Y-kq zo){2k z72jl$D6|BT)N?+8IMs3N3_S+lhwLF%;6OL1D7W=tLh_1}Ye~UarNvNATq>I9O5Qnr zv0iUme-^9Rijn$zwSwp^9qX!`kk;u{H6?ZKTWd=BN;iT(^1n^#4R)^3Tz2~E|7I$o zvZo+LD~JFUPpn14_gk91cDZo?nd7K6g}Exot(K(Lt1t*5b#5PrbE?n6@(muj40j5JaK!I9Y;OSix{2_bZ}N&+SBIGxSNT(w2uBnq z^(qm1wj}Ad%p3~6sgd%@(rT%S2t)ZFMubK`)(>8T&nQ2NcB5LA#SpoL03mDIj#JXF zwrQ)~O}QdG(;ldW@^2;bu1Li=q|~SP^6{WO2=*h#+Q9A&mZ#w@L6j>@u0Ey^i|@;=OXTYdLJ$ zDHaGaBNKT7;A-oLyO+sG1F}x;(o=3ED;s+z&t&9GfF z$pjp7+;`My+N6-Lb?JO+Ph=8DLPnLRiJEWkNQ$w}3!}y9p3W(vvoDCw`GBhTE>=R- z38EbkHcCk)aUa?{Gpg77{S_5C(}Y}YzvYcKyQEPh*?%=TW#*h24Io(}PlF(H#rk*Dad4+2L;C0d8&x<(mM{$AzRX=+4~dlUJ)vpiK{>jWhmod; z_zvIn{YKA*aKm(0`e<9LtRMZ0^a-hPJd1YLud05z~St1<0@ 
[... base85-encoded PNG data omitted ...]

literal 0
HcmV?d00001

diff --git a/format/diagrams/layout-list-of-list.png b/format/diagrams/layout-list-of-list.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bc00784641ab3cb3c3275ce1e52e7ccd3b2ec37
GIT binary patch
literal 40105
[... base85-encoded PNG data omitted (40105 bytes) ...]

literal 0
HcmV?d00001

diff --git a/format/diagrams/layout-list-of-struct.png b/format/diagrams/layout-list-of-struct.png
new file mode 100644
index 0000000000000000000000000000000000000000..00d6c6fa441769a3c86044a52186d71c0bc23d54
GIT binary patch
literal 60600
[... base85-encoded PNG data omitted (60600 bytes; section ends mid-payload) ...]
literal 0
HcmV?d00001
diff --git a/format/diagrams/layout-primitive-array.png b/format/diagrams/layout-primitive-array.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd212f081151234c01f5814be4a4e4e1e4841835
GIT binary patch
literal 10907

[... base85-encoded PNG data elided ...]
zkK7=9T?4X+uon{*89gqi!Q_|6ceISt>s^(uCQBh%NwYlAE_camlm6-`!_Yp~lbviz z5Z1r(+w)1_)(5NW=2T~A`wc`?LeW_9F^GZwjSMLyvvpE`_FWseF4_(Gmr}?(wU_WY z0yhJmmmWp!)!vtn@xUq9;Ng-2CQtQ38c0k$b|AWy8_9_w`R- zTSuC9%ns8xf{wM(_m-UyJX9xAo`^Pm!kU21uMSZ9wO6{&$H?U?n*N}dbJ&2}S7?6s zC9NG^6r+g(Q|uy;#OmV4J0WL>%j0QqRkQ)&uP_!V!m)-%!CMCkNmRn@fhM#;3#sk2 zK0VQU9iJ^xu`>vA?Jce61OPcl05^?Y%L=Grr6rv@b0Sz={TuXUq2Y>@c@xuf%s#{s z)5w+A61J&?wDWDvQ!4JGYa!m(R`Oh;!xEi>zv3A#7~K&R!d{J%L&YU_ZU4_SLOR=7dy$m~$6f zMz38Y$FUE#RGp6bq2%p%^eBCH;`tj+ghUjW^G%fp%v6%)&JI98(*EKGl3?mB@|%MS zj-AYg&^WxXNPEYK)yv>`BOA`biT!2axRq4<Y!fO5HbFRacG`9B4}kz6e-W)D zo8WZMpsf9WK@w+D#@3#EwBleyI`rex>kTp;b>wVssY4Rs$om;l0PEVE9Q=k=dVKgo zevx$l1MMa^{5Q0#M=RmYT8ou(vZ@_E7g#95T?l}@`*)_(%!ZQ}M(-^SA;W3XM=Q^L zqo*5%`=o`JbA8^GHsf)>tC~}$aeFNHZWWxpGyupe0fU<2=58N{+Y)KSUB?9YUQ=TE z0OTusFcVeiYzHir`ML+G(x8^quRGc@eiLq$iS#l@05zGqe)!^Kx6>@FBC-8*$v<(=E;oAyZ~$wBKTUg@%MKW)Co=zw+3N z+nGX(rafH@g~s*jD_>0_^x6kG&wYCUsg`6wKa(l>=JDqBO9=KL!P$vR$wx!_Uf?zn zH7R>x18N*GHUX1fDqP8{^GQ;CH>}DYHaiX2g>msetUgql#0vU$i$btL3t-qo^ymGa zZMHAnU+x3)2D21Y^&hv-)DfRzFTZ&z6yx{?K`0v6-1LO$QMcS_QLeYT1EtwBAAygK zc7HLU>}H>CU>_2tuhf0?wPA;b`6 zkyv}({0(N_oouc-u#>G8MY8*&McNDmSY}^=Y5F#uGPml;lhwe96wBJ_gIoF3+0j1k ze3il;{%iF`X8rDn=}x6_*|RHF#mN`({U(pKGQrDrnf=Kfkn|ElucshhbHitE%XH0i z_|frVKYXycM5-+igWY<(*by&)d3=(-|IQOF|Y7trq653)JRNHc5i=!?ybNZ#!1)m zUMI0GH8Y=H(MOtMRMk!rucbXCkaG-YTe1CwZ3TcIGV@N?`#P1B_?4uSaLXq8R=i|m zjDV1I@F~QG8@@Tn^e{B+k0oOG%Hijl@$tg7M9sF%=@&88wmyf=L}0&+Y7$0!&&_TW z4ag;ABu1q2hqE;02JpL~T8OaX$=!B`t^D&7j$yK_@?`1lj(+n$&NFEg_&aF0pYCh` zA9|!Aq$bJAOoK&)wa%cMbe*{{;Jb4X60enK2$P;FbBrvKM!rX#bS#>a>M_9yZz&bv zRwB;mS1hGp4R^>)0&%-D#;0x}aza0QwTMr{G1sdNU{LO9Q5OakIzRoC^Ld8z4hpAO zJK*f?zWJ}KT1jq0aUIU<2HEHnnDz&?KSs1 zfXJuC74OV*?)Wr_eS+^9E9+Pt!idHB6BjW>b{>nWry)+Y&*@&betaemL+QuU2yA+I z82}xzz0`uE(U*AD*B>HkMEJde02uH>Yv!Twc6P5wSmaO zw%9bMCxp)=EH1@i?Az#<`Z@JvYp;W%4a2J=g(;A3PA-C!SdJs`dL#GYyy3)EhHXR4o5Z%~GV7wnSJ|y+o zI3r{jU5Yd_-Ix5r%z@x^?eCh#K3KGIM$RulO=g)&l(|iawUolvX>; zHzhC19X)w#A$5q3P1~r9Wup-h$D0&>`qQ#c@+VLI>bTHRzar6>6!mO$D~`rTsY2_w zTJ0lZ9$IC}(o=-PS{NgMiO6-;n+O*%g$NydIIm&#j;N1$eaS z53HS%4;CpD%4)v(JVV~NOxL3@&`oJP1vKUvT2nLCdIVYs0!_A__W;&=>ksChb(wUN zJEm{a1y}bo4rL^S?qNj-(s(Yh=0G3d$Wt1-tKQDZxX?%=wA7JzGoq@Bl47>f;FkAR zU!-kw;o*MhH*?9lMjyN`=sjL84rB&VqvH8RuO8QCp4U`LQ7W`F6VzT5l)?@gkE2_G zDA8+(8EJ?~3W?W1*I<_yh9?C(z%nFx_=q{U?u^~O#APcFp$}fr65kSpx9a5;sg)wD zq~!8Vd2ze9cs^&kCf*?#o75(+KVrilx=kt0(&{Wlhr5(78zIx^`(VQ)_wiTx2$rB^+fJl8`TYgmLpUICB~l0Snk_ahyF^&4h3bGvugcxXj@7 zvlA$rQrxkHsvVvk-*d7;jNDzD);W&+5FufL`lZSV_u8u6)X!K2aM&+MW()Bl><)f{ z!j`hq1ZRT;gnBtYJgmx?oaNaymO*6Zl}*Is+_){lC`1xV6G4<9zTp?rw6oat8ePL` zy{z`p7WaKZx~?fXEX7d_Y(k!fAA$t@8!lG0jG7;~ z^2NUy{Zv-4vNdU;Z=}Ktygp}$t5}Ck$#tr%8H0GJr15Ow(}X+CoQS6oM4rsc9;{MGN*U14>;Z*1?`CeU%< zOarD|)fF3l@Yg=fkE@PEOQQWXR_>^+m8{YJkOh%~nH{r}fnN4kX80-LeJajlxYFFqaOqrCFs(jF_G z!q3l6J!)28j%4;_u5W{i!_o_NRvmr_kmp|;jqY94UXa!}`mkyj38yUaf&%x#=FuAK zyHEo)>e&c&me3C`m=YhubKZ4vWXNLQ)I*8I=(rj~)#*_eM6e)++{F1sdPv&e9-^NB zU$!s|1O%)-iB1XtcO(4RUw>Ek`m1@dW#z7{83BEx1?+WA)$MPctQwfN@5b`-3d(y7 z0EgG{D-`3r@{W~B=wEXwFzt<@6>#cDW;^8Ev^)UGs2xPbHyM;ve&oXY?eb@nTTv0dsC^)!f;3fap~gZVP# z-&%RQC4JwU@aQX~$A5I!I);q%;a&<}QO;jn(JO4P1vItA#_evb3>Il~5v&5v5&|G| zFOk)KDu+P8mxLE}S;#E{XG5%6TamE!(iWk`&#u=Z8p}o5IHH9HRP)P33R6i`2kU7J zigZc4o1P``byXy~S@mQI0u5;$RN1aSH9I*5_!T+`VUPGiZpzc4dlSs~2o#TUyI&VQ z%~`<=iF|%d8jz%YPr8P3$li2a9zc01mTlc;1+}Xjt6VF4rt~N7*COB_|1YQY zZJQ(vF*VXK;aaYl#rVE%yDQ`eU;%hcXHXzGn0G9;2@s`N*ryxpfx4y|dp2vyE!I{{ z3NzwPmSpL)uD9vR6yq-MKRdaBN~^M0Hqc~jx_OAVc9W^laQ8072D=}iTF3MO>?20% zTIR_u^D!VHsUhv^PWL8s-4#$`)lHlsb%`04U{L9UG~EU!2k3ASPgeZ<+6_ro7^M(Z 
zEYFfTyZ*_+9sq$Ao*LNNkwB8j&_3iWHjEUG^a@3a(qd8u)sKUcO%YIGIxg-#MQXM1 z(S7K5q8!baUGW@G1NS`|ZWiSVJzBg99;(P0=wB~@f(Kv{fj9#NW;5HPr3>XC$76$W zcJra|*TAnH4NkY?j46KsUkR55g1N5VZY$u+ zn#lEv3@UL%Ws#9shWy5KM`(;)uvvybCqT2S)yQ76s0UTR0{LQ!UONB7amV}^A=lE@ z2MFxB6eh*H?pER~T5%_(Y&^hS!40kfp+15Ax(3}aT~P@tdrC&a*S_fNii{k15{2x? zwi(~SLC1)rO<7IajRb)+Yg|F|L|TP-6*g9Y|4DXVg4)*UwYF#)~b@Ye+tJ4-T1^AA_p0 zVz}a}4v5BVnGwNe5O=gBVnn7lKy{v$VH(E72dW|KeyySf;pb=X>yQ58G@i9&bTEl~ zM^we;|8s=*(UWSsPDb@o(ixTbme!KWsRs$nHINa)@9jgNS}0&frJP7D+Ranmy3LMq zr#uAwjr$NYT_mklEK%Zxe8L&c->Q}3-@ey?<^oF3Y%uE_vH>2wXb71JVdP&5cVN&~ z3lN$b`e7T7bjRwUMXC&eGZBJmBp;rBAN>k7<(7`a-P;ou=S#d1_aA{qF}8K9BGQyu zE2L_>fS5R&QsRz-gY$Wk)CSHX&~l(XSWJ0miMP*Qm)c3X@+ie}W=r|6kJaUlsmpf+ zgv*Mnj%5Z;d&tqUrk1`=vr=hQ>3W7@O0p}{+-)OMg(thVMfpT(|Mkw(v*(I1?z@!qqm2-KMh1< z+Pgj#!}Ls7ot3Z0Ct9A1qnBKMPn^U{qN61?_@%}@H33^b5FSG_PVPuJLueBz1({hE z((IFd1We~awJ=4SsF|>*9$M=%;xpA~wJE3F zT>aZIJ)r|J52#;A^m0y$$)2#rDd%oFt@IZZ5Y$I#RQ41Xvq=BxS8RzT$_FxY#{*rQ z?3oL-vtOa}2cA4;0iGaMDQO+huNXl1HV^m_Q_i#_f=&k8MobPZBStt|Ho63`DyZ6e zAW*+PnSC?DXx1zi2L849C;hFDDg*G(KY*`^3A!%sym9<{o}wX5WT-!B|KG0?jY z4}{v|Y3LXas4`UkYYc+A-)#~HE=u-}RI#xt0tnd0q90yUdfB9eL=p?cHrHav4{B%c z)7-CNOUIC=5jbdXc7|NT@PUYhM^h~#uq0R>usk2wcFw87^ip(KIj!6IdK-UX@S-4e zFZrdZBBPb7W+!gJLy>_Zo@>s9(1D6gv4I0<4buI_ zUyqdx!+;d}+q9Jkv3XFH)5s`@ODu*M%pIrCUL)>J%tzd&JakT^gKC^)->j=Pkxyb_ zQc;-MfN!z+#%wnc2?Uo$2^Pajk%&P>f?SaaSK=9)X~s=C{6*2(bf8ZlzV}&wy+nj>KJecohCqJ&*LZZ zQ5(3ZlHi>oXrH3m&o2;~h}SWysB$A0P=qFilonLmR|N2vBzE_36r+eF75ECi?}v)l zl_~~A*1CDz1sM%)fU8fbuGKAXkbghXKm&m`K~`yf`C~D5CrB<_fA-Wx;uSG+7=;28 zmEqm;&t-HsRZujB2~2{AZh~St{Z3$w)W72I$)P6218daB>0d)Nl>r6ochVj@PQ8j3 z`l?`y3J60K%MqalqbUbJn@UD)Q2~%7GDt-Ys`>5WcWvKEcf(Nelk9>d{g=fpBELO- zwa(n@!CNJ!VBkq{&mP(n&V5F{4e-JKFr((yg~_I};_ zoH5RS-+$+I42LkC&+NG8bzS!ztEsMtiAIVB0|SGptR$xm0|Rdc0|R%A0tcihif{Y| z{(<$>R+NFM8X?~YenE9pGW3LjA!2y`3#+V6e*$Efw%0ZAGEh?$v2=CjG`DiKu;%o0 zb^~(5z=-*Y06#igdzn-FIXk&{iuj4s{*yxl`1v`Ri8t!^OkH z0p#HD^mp+x_v3K!r2AJP|5J{fwWpo@E#qDHvrrX?`IJ0kErP>;_&Q=XoCc=jV#=dO!NL+cM?* z4<8=a@s18*kV*tzKb(bh5!^ww{{})?~0#4L>!m+ zp8|lgxZz+4yK&w8X#Uf@f-pO(f13HPBv9yEv<(!Y*8fV_(w%cDTZ+2r={$N+f`f!TKM`{c?d8Rk@|1LYQwx1XlyGCGWZWM};;a z!vc#H>wy7WG?kw7)|nrIF6U1|j=s$@=J{GxaD^^wnTy57iH|jI1EDcVeeKg1rqCA z_Z|}8CHEWA?Q|V1%G@!@bH(P1Ni6Rb`Hy`uk6gxhb<_)|ZnbeScIPIlEwkxlw9T+Q= z^^Z`vSq;8BQfB&)KIp&u;i@p2GVW1iK?w%2hN68`;*K5d;r@J7pUvXOpBeOVmzJXj z2Y1r*T`jzC6Q+-6nF-btX`f&FZ@=5lFy?GfpXsPOY=3%irTx}+y%AqmJMH$n{pqpt z9f|Yr$=|{GA&>qaguj0O^@X24H{jRjQIUg+IKx8X)P^1u!2r}3QqX#aKM}t!7hS2` z9}Y~PUVNeD7ol{nxj!2;1*(WeX$5LgfEXg9SL41wY`d{bo+77pRKZ9yWwDGX&&#!1HF(r#xP|0h%R|-)~(8x z&%fVq3+R{6USYI47)l^|++>;7{73cdYBhxFDEareX0bexq1E;~{ouozUYFKWg(%PK zlZVHv_K>S(UtX?N-j^-~v_nhaw~C#>l;6%6yH+wqpUCsw?iF)2%vsb8@t{*ZPz7C# z^Nxyz^Mp!7Xt(W`NcQTg=xfdRQxI}|9&g)&9SoL$6JO%jwYQwJf;s`+WgxUk4x{9ns@Ww z*g%`MQdRsm`175^gfAAIW$>LEmwiHj0VRoDGLNS4i~rW5C9oQJ-hRK$mEI}V1FYn^ znGLm)%diq zu}5ZImMxff*ri3Sw3D!V5A)BqZYGLK+@ExL&%VmUZmE*BEZcd!`#q`$ z>z(n{I4~;^70m?2dV`CslFel&Y!=vT34$wrsmhG~Xvx#jewn<{c*qTwd|nsM4f8hN zruE1iD{6|Eux!e+tqScy<6^tHo�X797#HVGfZ!sz-Zfh5U6d)dv!`Wk70jriy+N zWSTnMZ+;TRHNRS!U*cIN%=(QJtQ94q#$$2sA$+?f5StL@jh5!R^q8h5V0dvz(yJ!8 z+&$~zM{r`+yWp`NNjx%>$W;TZDuILI7IcIKKB4(PEIR_z(S;6ps{?*N5(=#3)2HwH#Rw(6)@|k5?pu1cl@N9NT z%TNXTHgP%E_A?e*@`lYF7mcGA+cTa+f4AMah%<;lO|U=B^xWvZI+f`BZ2t^dk|b80XOb%<|L)qVJti)y-<|SwMeJZDz^e_G3W7O1xa`*(-=6O0FT4)S9=?(XWXNL|$PixKLv`8%8 zb-m?@jS|{S5Pa=&79e%wCC(!|%dXO`gtho0!`t^&UKC~O5pjh3DO|a#{c{EuDHTJT(wqG-Rq(<`{vOn&xsy=;wRebB3R&|w6O1S>{ z?r!y|4HU`}L-nwyDE|~Eu{V%)bC&7bUH-Lg0EdDq160P|yp_r>b-!}erVoC}Bu)}w zTS?DwHh!J6dSB9Cw7HmpYFANzgZ8kF9b!;_K9T`v8^uvm*Lxj3e+1kmRBtR+);b>J 
zwod7|KU$24i~Y335xzgqf0Co=0PTP9T`@;oC-)rcAP+i)zUOF6Kb{#e=v{hWBMm_! zuaKvfP48^Vprxg>gn(VzA1)0{+`5BcE$Y{#^ar68KK(Oq348taMA{#YT*2z23`2(U zCl+(Qt3kz|O$gvy7{Wa4586Na*209<%HUya%8Xo$#ATZ;kkeMfTSIWD#DOay_yPlq zptr!TQ`o5J{?Jx(P(D6$h|or`(u!>sG^MAAV7!P_v;K%5=~{_t!Pj6Z8*Lybj+KEQ zxr|>!BEMM1q-?y&H*@+WmO!%XSEKZyYHzu7xZhbnQ*=)J4-PiHqd&8zw!-o;X?KIh z5!=~>I*6m85+NRj3@+hw3y=5*ub-SJvLD0EPH`jX286X0FF$1;3A68j?2Tx-d3yCw zr3d>NlU!cE=Rb?pc+6O&-dQnqk1wZ@jhad>1?lcC`Xunrz{rFBPt04u7tMVi_oU7T zKS39S%t_7ho(`O&R2_*87&Ss(W#As1ja#Jwo|P>_rWMf0lk)exw9~3T1#xf zi9F#Qg~X5<{OF`Is{1wMmfvlnM<7Z>4Y5-ME_o!%xP zOyb1Q#3o+S=*uq080|T(C~VN7^|&TUP=&z4hoG`ZaTldQU#A(#@5QhaN{+a6x`82) z%PlcocBT+vO~r&pj>@4SZzzVpOR+4k5|s>zs9cL#UD)kQ#GQF1JRi7qBwLe=EoiL) zMCl0h6*DQ!BEmrelc^%ZyzUWRo9Q4a_W?w@r!aqd9{=*#i9!*xMpFVX)w*CP>2^=i z%`#v4e5S#J?+jHJr;7<}iS7%uSs$@c$w36i{LF}(FE|ub(}m3@!G5W*MwcYUV>Rd; zrTtFLc#)#*yAollWmpszaVp;>4dw28FwO{3!Y`6Ivt6wFr_09nNR>R6c{@YF@0R34 zbUas50`)=126i9lV&H#C9yU<0z8*DA509IcLtFx`?2td@rr_+pbXtv@WU@Fo9U8(k z-HR=Bq2sK+^B8d_Hid&~TT6N^_bfHIEt5ygz1x=rB0ihIj_u z6m%;@r2|8vsMA#_Ex^n|q;DR#c=j7{yF*tPKkQ5Ly$FoNv_k;TzBe~Ylh7GOu?WPk zk%uv++N#}1O`6_#{ZJ@7bCc;C0A6&E;0HFvB@0I1%fZ4us)N4H-33?U6@sDr9S<`U z>X~#`0Vl~Z9$9iPQ}zxSy9C_s%_K<)C*f`^xu@%1Ng0oe@nr5duVsA(^FC!5{d(_M z)sL!kRJ$HA&Equjw3t%ob8l2cjr4{hkr0~2#YP|ASg1V5%5{+MIs275YBNYCk1vpu ziW9t#V*6_B95e#KwaDBqq)Hc7z_GM*8#Cqo_*gIxxc3p*UvH7thBvN@dh1man-Mx%p4`yEJf<{_|r|`I7UhZdW;Bb3dIY zLLX&iy$OH9qJOK!bAF?`zNG&0!!ZrW7w@EAvCt*Q5|)DG%&N&Mm?|-lTXJhFYv^IwIgv3}i90-KDhlgErkQk~_k`1e|CR0KB4l2pz;UW< zB>fQ^oY$Kpsy|(Aid2ZfuoG6BdFm`afh$*%)b*pIFHj)5LKF*z`y0iBWhQ3-4JV6( z$`E#xW(lcfXI$b^B@{+zRe8P*q|FbEtPTD=1x|JpO0BshB~Nda@byXN&2XFtl7_^p zf6a8v*jYg+Ym9DT&EsS>CT)3&K?)9EYz!{44ybjgR`vz|S~pgBz(AH_LKX#<@78nHnV4w2H)qjy&6zroQD%?5u0U0fI>jeo?7-1$du)g` z>x&dbvLgsqa8BIEk(qgCG_?ww(c4m*YiS&V(29YH|L`;tT zE@_MaG>lRsZ+R&Q%qOA!-v69%8h$a!MN#MQ zV5ivcIDD?=L>r*ml$sOVDgs{*37UYyMiAv?>*FSXothWKOrrWyahNC5+%j`0Gcv;b zGQaT9%qu1}nuzcmVrmaEwsPG870M+h{>i7s_u=0#PWxe>rDrxHk+p302hWx*PWi-~ zKGvtVLxvUu4+N9F#VPN^ z6#PalxD@Rk94stRJ>mEOm;*5-r zRFXBlN#*uH^@)UqoT$_CiX3y1L25U6s!eq%R6HHnT*jJc&^al zkt;1N5NAy{yXmLNZ!*GOkE6AaYpU)rU#q7h_eoJxt75ER&5n zy$pgSYN)$YRvOfzg{sc&3u18c;Jx($?a;-IU+i?SLWRqkpDDwr4Du~)mTF<>jxCtZ zx|01D*Z0t@@oI@<(-|p74&^Yld!I$;-d-9GJRXeV*zvcOMlAo>93LK!v~lWuD7Dbp z2MEHAtnKyLEj~&>&}(B=V=@*Ez?m7}$~sDKf_pv;G9yAT(-) z$X-nI{jRu?SEVGBBYMyj(HUYs4Rp78=}%kyA%Y$pJS45ii?E>(HKf0Dw%(Dj&)0I! 
zyF8ka-Jj}OvANA^e5sPG zI=y%9*=G34SVQV#+U!+dKNxqRI@*d(jQOy{5RfBxJ%bMp9^D1G>$cynozIf!M%}BL z4RAcaN`HXzd?gW$%HnNgLyrS%q6=aW9kZ);Ttl+MVi*)gL^eJLG5Z@nt(7pz>s(bJ zk<8u}z@1IYSQW`vGJIqRu6G$hBt-vZo?pgii)r}2!nIywm|_~^&}cEncwlu8Z$uxMucx<;Z8V?Rzn6UYA|G#=kz&)Zm^#mF z$f#N=u@ISkx2Rvj(YwwEcb**ScFghmfzX%O977`aa}HrK#qsrF0+MdC_E_HhFsq-9 z>HB88VUsgf^H6kO@o4vmpoVaVK0jEFmvp;P_%nTVj|Bp(G47H>4kbusJRLaE$cm1{ zR#T==IJ^=S!q#O*3!n%e&`kWxcwy?NCFh=Q{3tt&PEQ}_Z?5l~e1ux&t=Q8aE{ao$ z#oY_2dkob%v%}4P@{qv_=Ex&B^u-0(6Ib z4c>=RxklwX7=dJ7^kpHgw@j+s8x~ez()Jw>$HT(epLg-DW~yIg_CZU#-FCg!Vq)wo z&GXt>ZkFNN^(hs$=*xC~Tn-=){`v!U1@`o;7;XY;T8BC1Z4CZ9OJ z?-f;>k+6jcX{hB^Nrr^wvRviI)ZZ+AYnviSrqbGC9oj_SE z_zYK(95-xk@GUrYZjLlz3?@hFV#&KaB33A} zcG??@-j=Aay{hR5`<`}Sm}|y*1O|?!dgIBg&&TuAa($9Sqb8mV58oyH6sdF5IoXFY z2MeGGh9kSLAsC05*jbL|n=lSLrGM~vvS0tlGA@?O67q1?eX_p4ui7p9F|BXY3xqoL7c_+64~*%eX~shs&{1)~3djaHZsi6zCTa+qvE14AV}`|-0X7AfngC^;z+E1 z*o6h$2ieJ9u#rIP9nrOCpZq`Cz-0!94?33P|vqNb8%SJdFmugRY z$fC!2dIX9(EVk0TPhLa5(lT<5b!|wG&fBSd&&Dj2ug%uncIzjW_d&LgIUVeAT1oh% zWMCBy6^!GT{SruK1|wB!T~3OTaMKx!#+HLW-bkVm#=0yK98ZT72X|)~X-_f3a5B|5 zpg?gm&>h_X+LsY*OZE_3VMX^TgfYiP>BISiGL7*nE+8-HC1yu}CGD^t%E~ zB{6ip#m+CAI^xx;NfhG$&SS)&srv%Xlk>Q@noK}#^rP*4H>SH#c)WDLYi`KxdAB7z ziwlF#g*7>8m#NDSZqbfm`QLG$5ELErCB=@5wziV$PZ=`V1Wg!MLX5m~yqcbVW?|Hjl1hADe&&gHWy-OdAP@#k>X>UFdrg9th zo<}e-2@=40(j&Zip(v{4A{fOjgvn6F#X_Xpj-7fQh?OC>RH*vOOPF`}#V<^+Yn6`>kvYJoy{DGSZM{07 zqO4=s`DZ2c6*1?&bF)}Yecy<4p}R+>0-3)y*g^XUpVtQZr93(0dN7_F>Dw4CvPZ4; zORr*O+d=1-PpI#`*uhukZ!3M-`NRS&R0-rrE&3TUhZf3CUyY#XsN#{&e@@71eNU%9 z?4)N$FP!#vNm*yUZ@7u<*zb_%Rpo2>CkI7+ris(J3CL()9B7{q1Zyifw4pCo#&~Wx zef$+EPg0mh$CxFCot@^WzSlbvWflu@#i{PU zvp@;zeYUo(-5sQ_SFq-3JzTsx4)qy?t={~04Y^x1xlehm_a-DV;!vV%g+fav&rFP0 zi)b^r3&A?xZpV3$N#_`qWMEsCk35fr_%pX=DhGk@ebbi^1n^-*q(jU^_F(0u^|P6> z=(dM~Ip)U6k7x0=Z%pM7!4SF5FVL8HvZtlI3yn7QgyCT_|B#6={9=)s zH{#fKB%%OIZc#u)4;(BmLAYv4LsJ(Y7_Gn96_LZ76O!g?yu-2i9iv11+{%)YojQbqqoESTjHDhwo|( z0@)m0PD3H5(xarLPH%Q|T-$O7=ByqoUnGtUtVy2d1zv*r$Yyb-(j>9r7#+ZmYo9w9 z)3@ib5KKpw{ny_d;q5V3UY6Y#GQEO42flK&BGm zldZcSqo_+8IV$I3<;Nd3FN#ohv&ml6)#Ii3s+{s->aU$uLyCQ|~q0zI91BJ?xY_J=y0K z3M&=Z0#c4kRUM<0U?w+!aIYh!K%*0H`n;-a`1AtdQ1V^b9#iNeLqXVFyR+-?n<$tJ ze^7XBTO4UkT*jwbaUFiP30Szxd@QL36x8&@w*oPr9C;8kr1D_ioJ9te&e!I@o1Nc2 zx+#0DQXuv7QAesM8iD@gX`xB0!Ha=96nIIyYBjK!ufDo8-&c*zm{0+#{4w=@Mjoz` zb`8a%T=3m*#7%Lzmsid`+_^Zt*P<-ys|7QB%A^5+x3NVgS z0{C2*1LHGm!@!z6$PT`gaI1-STU zaR|VEGJf!y_}qTSQQJq0&eeJ%Rp+2{53rsaQ~+!Z*qc8NYX=|icO?O-3&&`KnpvUr z2;rw!7O@Ph0NtqoH~Y$$LU0FQ=!HgvHVL*v0M!X)$i4eLQ-0uCwryTBh{ip?G%p~o zkxa$~kN^Nm1QmJvAQipt38uWDN@|I_!#aHvcc_x_EdZBexdZx|@@M(V$J1EuE)3ph zrua`cpSmj!AeG_uS@GWm6cHgnH%XJju7`L>oxbmvJOMI@c=8iTfWwG?e04hSP+mDA za=;$9^DKaQ(}~LDB@ekpcV?RdWH+9`1$GfgJskZGI5zK)-WK52+(gJ)rvlgf5>6H|Y=hF?j&`KlprDP^r&z z2hfV-TP6y}O<-Mc7!FD2pW*E}B7y-aOx^x5lP}JOe4s6GY0a6X$*KhNHz-QiD$b2M{>O(Fg%l4~wqt4^5kii&BiSOu~TPC(I`%cWJG)jxhq*6@{DtaN^@RAQ#d@pKd<{1a$4r9e^$=09XPXZ;bWQ@-=<}DvEbw|D!-5 zQhVw_5xZLQgz%k|S--cD+H_FWaRJl=(xSP(EB>RxyX3l6+}GWEh^$y>L6)@N#0cNB zwVz~5?*w`*0iEb)jiG=MowrI`21G%IBaT(gj9qbtRLk;kfGTUP?e5t4LSxt#5Wpyk z%5iN8ux~E8i;HCwjViPNZ-ctb@r0rwuGc%{kgO%INi|^&t~XI{(JbiHgaLrt70`F_ zYX|>+vs#Yppvi?h=iIn_{aB(x?%s_H|Am+_C`s%LMR^C%xB%p>#9eF8LpasFnN;&P zex7h~)dE|lh1YM6zBOdbsqRp~3@Y6x)5zA!gfYMQ+@b^neXy0hBMFI@R=yFt7?TaT z?xMo62Oz>BZy) zIjjO{*(4k~`%lo~Th`ugub3p%Ks`}%O-73NAq)auwO*=_d()>|eY6h%1S_g{A%)IL zJQq4s)q098wDuO6(vLdmri&sW_s7GBQReF|Y{{Wg7RQn5yiQ83BkqThd2>{q?5m#cA$ntf$p{qAKa>rnddy$j{b zFA=OUKNf0d%JC7ZKchjbxziMnfW2oL^0}ia=c|uT&u7V(zLvF9xUgBiIX3U@zq2VP zObntlD0l}@@1>QPR63YWbZOB+eS73&Y29#*Z0K4uE z8GWC#%~XaTfab#dA}R5Cb!mj$;%7RY>sxYxe0D8Z@z32dW~pHoVL5>VtYTN`B!LIm 
zCOr%5KW{PD4GHr9RP0~{sLE?jr>>3e5F8G7GX_2Zd(yrnvhA+2X(J90R{1694%0*C zYg3T9R922-fA5Wh48Fu7dERViT81g~(xm}}K}Z}+iJ9?0wb5R*)rzS(E? z^FK};H-PHIbCWx{D`tc~O|>(QLF9gb?duS|Spw|~ublfG!FFk*U!5yhWhVTr?__WU zy}7XzKrS_s0M#e_W^zGB0$BA~+-JD}CW)=w7tZMvsUu40xLnnG_CkcxGUGxpfWjgI zldl$gJz3dubhik`OSoaXGap$g0oemwDDfck7BM#H&ax|VY)I%==+A~S8j0^(;-~nr z=YV_+xyBG`QmW*R+6hRul<;2XK1(Ioj*l~jlka7v)3g30x$q6*g!T{YWqVg2AtHeZ z^?zegh@>O7&-bx3Y5Dtqj(!o@9+)yq2O5x$8PzlqKb6e|R6i|6pXj>nsLv`gUUs3% zMz#K!0S=gg0$weH%5;KzGWhr{h||kOHlca~QKZ>2vn7>w1vaa+Re-_gZh8h`d$LZ%9IvpYQ@R{QbZDNsE^C=d@n`j4RK zVYTe3=%s%9Eur)@kGXMq{^LbyII=!PMMs-(W<$-cCnvr@`GzdT&3s=J;&t(S>a%yFongS~Nr7(K2JwGg+T-^0aD{#4EFLnS!Oj$N z3EPdHRFjsB9fvXi&WE}W+X$Gaq0zRe^19)nS!{R%D2R0(YMbMx5E`B5VSOqG7*TLB z7OF3Es+E_<{RQ{iBE#PHe7%974e?0bE85J~1dtHqD-mNX=xU&(bJ?xaNy5C{h141l z>HKB zZeZL3bu|Q+y>z2HxGCR-`fQ-l(uK@(6g$?x(UZR4D^A+jjKAf%ByYh_Tzxq9C3Gc< zA{!sIP|c$kGZU7Kr38t`YrN~p8x@A#TOsIh<1fi*F1Y1-iF084vL}l*W_2YBo1xU@95d>-4vC`1XlaJ-M<{wa^VCb2OhY?eRndEF z0P&iq+)Rd~!eQ`=Z0^h#ki`OeNX79lvPaj{v7-62nglWAh-J12xWf|ZeF7d(i7q?* zH-q3D(-b#A`({_rDd$|F4vc}vS>ZQ8uIe6uBdAZ%vKNE4L1gTNTVm(0UR!?5_O(^U zszQ3>bs=-0lgs#VR5E|sTlfj~)fZ|k z6#aeLa_Zo_0vBy*jM0F{GsJ$29|R&D%f71#B&OG=AJOq@8`eUC>F#Z}rPX(?BL_~9 zlEi}NuxvK#O{A@QI8ge#@sXtFYu$KlDH_by_;c6%oYfadGjsch$BNaWZqF=0h2XZ9 zpYM5dP)lUowNPvl=L^4BoWq!_-x3OR@LHGQ9LKwoW7j6DH!KlU=Up)AY5+)AzAEHU+rGbk8+*5xEipqJ4NJ!9Ntq`t=VG^Q-T zXjLgnV+haQ1acCMwikS{%HA^nWRkJp(~v14@! zw-4D_20_1O5$bSv;3u?#78WHS%F$>uV}0QAUc9$SS}OzVdy=Vqxz&ySJqwXERGh-f z&6jvPSsxj&GIDv%he2dSW{@*VVDs$}frvR<4r}==zCB-J1rM6ithK-cpqFOO;m5G5V(?yp6Xr|XW(-{k89A4@FM zF14iNx(;8zb<9V>*oqjc$&RNppN<=puq}t~GlV#CZWh=Z_!-pr%(+;cP(sr7qE$GX zt-4Uzc2?8!3^1Ytto18>3QE`yt;tlJbOFmkB(C_=$jxzY*bJZpj;d*F>JesBHFUvG zyuxfbFvlHCSBW4$=(G`Rs(CSl1RfBY*42RuZF>~VCtdhuao14(f>Ohox;NcWmU%8t z#Yu^r2UZ!>=RE*^A2rsM5EfKR7-?y%O_-?eB7cv;fxU6aTr=Gi0`@#i%0AykYZT*(LFV?Ce%V~=iP>OY%UTB)dwfHa0Ussj`;q&`DoNJ z;FU!TH^i^6g98?qtD(kHs2R_#T8{{l5goaW+$Mk;g+Bns3d&=%x-n#_8k=|U5ZVL? znQN8N9Ib_{QY+?6(kibvH@pG?lDK-*)91J;CUTkTM$c;4)TCb}_b_-IO3(18aC9l) zpi%%|FXpPKi~;6p7ujGZFeJMzH38ePP9e8<+|iCG*~?2V}KD-g)cb)<$K zBYCa+faBXN=i=Nu06!+%QSdt(A2de;^ZAJNK-he$p9T1tCLYR=ud;BjXn_61b694& z`@#2hUm|i)u4|>iPrzLUTGe_#_qyF8f+#?py8{8jnnL5+5%3pqQV1(wxFDo7scpzCHThoN$>H?mKU9NSX4yDV?I3?MsOl)C!Ku^vC-Sr#`NcRfbb39hS$D_;s`C7N;s$gOI|%Z+?)MR znMhymt>N#pBq?GgAWZO*?l#qzB{S|dl5pLDtRmM276+nwV0{c|+BS>y7eLG`Ok67H zk_PLXzP$dJ2LJkwC+}mJL}q;d%B=f=9@3c>f+sEr8>@1NF{XkwzA*O*~r!Z`K&Vr-m#=*d4@=IEY6%vj=ukkD4LON0bxV zD+qUJyaHgFbLY=bckVszN7uvAJZBf}6Kz4tb#IaSYg#&yVqlr_6r1sd$=<+#MTvUV z=1Eg=>9C~Y@!8_pYf-k5z;nz^1}qhSijyU;JFtb8k}9qJmo2t~0FS4dPVgM#BLKW` zm0(JIKsHuOO}WKZN}>_el*8AUa-8EykLgiG@97FFL@qNK2dug6$vK8{!}&!X0XEOo zT(mg@J?|1y<~hH&LaW{Ya>UUx3;4FdUG{4X}imcC+jl)CAs@g?(qzlKb^JP!{%-tbr4>6F|o!R{`iR_ zvGc`AZ8RKxaNN$dYXk%Ij)*r_fW`|6BYvOznWmg-Z0aTH!n(pLMs40o5YP9_uHWK8 z%U!rJ=lJ{~y9}`GDkzh~K$OSpu36s`BwDrnS|IiM&uV5cDwbYOBA@a(u1Dk!y=!b( zOtQPCfU)#aYt%eZ7Jg$`h&*D-R;y?oK3Fbs$BTw0#1Ug_>(?Afbprpe)F03b^C6Z- zYa^3j1fu#~2g`CyCZ`IAd0lqc9$JnD3M`4LT3n-JN@fG3J5M|$dw)URvjji)lz|>H zR}Y>4+EvCsY%w2xOX7NnCjyzsI1g;L)h?Yb zO!DS6TAWxVl5XzlJeJPct_(kgWzp?h=Ku$F@EgmS&(u8Mab}}!e($dB1Wt=2q1x1? 
z+Eo!x?35rbPR#`_?5HP}cwLrmFJ&G=xIYP=!*nh%mN#*Oy3$EZyR1aD5mSUJLafHX zSG98L^oloC*Bq4Y-snyQ1;M>if3lL#fhXI~*eZ8suotD5UE5~5gCSi>Y)sUqY9#;DTLoQvmxQ zvS6_yG-b_X{8O9SFMXWP-qCj&iJnJ;4rsKmVk%1#B_FSg_>}}?^S%cnS>N~WE|nHSZC=MGg2q3V)DeAvHU}IQ-EPUHk<^wxuw+`F}^}DR{NN7cm8N*5auo| zGAe`4AxF2P+vC6>kXvmO<$cSm6<;IGeZBdTn3#S#yA^(%S~kRT(Ln*9WaW6MZGNwI z9gLSpzm#N`4H4e%iFMR%*KkR@xWoQJnX#k_@PJV(qvu~nj8l7{Q@a2Lx!V!>nU<+G z{lE{=haF9H_KaeBeKW>ki5Te{L~3BcvBf(6ywE$5xLOPE@9 zsLyK9)ydEo@rmq$tskt~%`W(N8t)&*aJ%^yKfTm$9#$LFJqooU&=W~xeczAGhrFpz zPdyiB@cp2FjU298*ly{lx#i3t;Q6~GaXq*XOl!dT#RDLy=bLbg7G!3O+I-7w3S`OH z@iKE=Ve|m%BBW;x|H5&L6am2)UcE7_sofx1WuJZeFaskahe5MyvHLO2xdp3bJ{|XQ z8V#yYWR(2)L#p7VY>M>|lR9|Tr7weP?b)BPE7abA!hGC7!OMPteQs(2(eW0r2N^3= zcbPv-bsuQ>1NQ4&$oCcbmOmMX0Z>91xFGScm|!B`#;QWPh9!rrci@U$DD?*p&w1t+ zg>fXzn73xg5#{hHfF~kr14B{4u;6MguZM_ckvt?_Di-MAsFHErGVb?38BV@GhT19N zf_}s`Q%W7h)B%1LykUI{>D9}!qH%kpVu(3LExeLY*zMe%TEOY9gwc3)K&J@k*<|C7 zskl|8*}(yGD|52z)EEj3Yq^hSJa3A zm&H;a#`D)#IWz*;^o;*o(EpCnzt+|@S&qT~!~%GW@V{&7|Jrs5*TIn?35fa$_)33m zJsZgToJg1=C5dkk^k@1j04@hJnk_SojUpgnQc1MtAGj_XtC)=yNLLPzEdc=&IPp#p z$K|hyI6{x{+z@J7?%&j1q?@uzN51T|3CP39DI}iAh%`Uv=SN0WD)@*{FU@Y>#nSNi zAh06_q8#|7@SFZUKQy(0gMh(V2%7OXpu!O?@NnR)-C_HCqP%zkJa9i^md^b>KMGy} zjM1X}Od!Mm#88k@23U`b`gH>1zcCi-!Y3p`JW z4bR{Gt)Br902oX}={Wsu?Y=EtF zk`s759zT+=L$yg|uCx0x5P9GiLeyQF z)B#Gw@rqTj{wwck07^G)^^C$G3G11&+TlpyX6pjne~He-{!HoAJz};P)2;*Y3wR%| zH;GBAeq1gjNj~~@)ka1<=ahYqG5I@s+hN-&{s5Sn38t~a(=0B4a;*Cu444#Zr=LD0 zA<%~Es0&*I!16ObB>{Nm{{qYfK7fOb9s}YD#18_!fOqf4G5A>1_L8Q3rAKI_!psJ;cm%-?TK1zh1OOqX=gO;5PBLDymMpFfYY6F0{2Vp3Pto@IIO`@f=R33Bf0kz6Z>9N;y*3)>Fv8M9xn0*R>VD_OhUZmFmtF3l4JNHbb z*pTf4PAAWeMkjOt`4pIP{EWZ8m&sRVqw>q~R-pJ_V1h`_}0ip+8vI89r zaqs)nUVuk`V5SAQ#2eapb{QFV7Ec@7W>qC`U|>!&nS`SBRNtxd^wUqO@(!n}aNYp? z9aZ@^lR&OIge-rI%(`I#V?d3Z=RITC#gNicL~RqPJT^RZ2DlZ6nz3(xW?liJXn^hw zOSQpadHYwg#`CXPVNt!_N_n8s8d0cl0dT;K=FKbLR<{) z+LHNUMRiDOY+g6En`N1Znx0|gz~cUrJ~>#j{vDe@ZXU399Z~DGq$`BxqDTy(Xa!=0 zGLZ$VkC%<+8vg|J02)(n|AP~WOx8;D5KcDdjP=5GOXco_y3s2C#(K#U3#_x77% z*D;pxT>zi!OQtcxiY7m)HShT~u1GZuy{c8Bt|xfpk4msiUhu(PQwlP;#aF-q7jF@P z?C&2TnwP>2at6`S99t`CTsYT0+Q{VQxSDXMEwjyj7J>*e~twav-fA}*C zC?1x9bN|Es=5suhvdZ}gKtSOG%91X9mOI1~*+ZmvFlKA}y^-NS@Cb@E{B%Vi0=iym z3VxKg2LL37*ofiIm}Lfc97CFNC&uhkIX53CWuXZ%flT^S88@E~acIE-GJ^y(+64oI z-aXQ~A0_}J&<+u;fohUFS)~53!g2rLJ=3upLzg%Q;E$VY76R(rFX`>#IU4$Ik*{oW zoO@(u0l`Y*{&Le5&+Y1S(2ico#}uJwH3F8<8jSAutc-6prTM`knR8ec3t8VbEVMYd z^dSe?A4LJt0t=W;yNnbS@CBYP@`-r@*6QFoQ4KW(A!;{`?al$4!cSf#sG!%ktXoC6 ztclths%W0gF7gN9)Kt?*EEMrTc(H)(ylo;H@PFES>!>WZXm3++cuG!n7FosX6amKiz{6pb}qrFOEgo)uGE+XwTenmFRRcxH(hx6`jIx|lbHp|8OF6XeV1>QdQ#S8VyF$YX-967Zp4jv~CfS@#gmSpuvM!33 zf$7iccFcaEwZUkDj_tD zDZa42``3Xh_nT`ZF>zw|1Maa3ajMl(7=*|Ttma13d122oMZA|ZDJl1re33-lX}s?G z6!{DOzK;pXwR6;?S5qrOsU-$igl&;Ek}po^}a8F3GXTpS`^qXS<{rWFxO0 zm8Oaf@%Gw$BMk#8gr&7VcHr6syFzCqcB;2fFn-lk`%~7RAD?(gVv%18`6)K(c#+Kb z>vK?J{fxCOzQ?ZzcH-1*dtY`adW|EcE?g3daWsii4c*|5m|+mVIj)mh>-IRu)|W!& z;>x74Ip2yw*Km?R-xD0kdS4*qIMqfE7Rgk?sfuh|NyJi9)%kPjR`!OY;%msChH=@6 zcWlO?Pf#6Rp*v3!QRRlV6j}-1T{Oq!v2r>4E|RqA%uT|~V3x7-K|?QKl{uKrw2-9q zsN>wOW@MJelOd4ZQZSUb<36*VXEbnKm~#heo8X+vx4f8-S7+&_#- z4^dL|+Whb&?{z7ROt%d8?ROQvvI1M_z(r`UUcH#?=a_NbwC@c~1yn6-k5?kWp=1Or zb`VmAsS-q1GW@K)zX*fkTUN~irXqhr|pg!-NR(*|Cm5@g#9W zd|6=>!c&p8)Xh9@jYf~f8yoSpGPng@O%~ekQ-nS}kJ#T7`AY;($E1VuG9d5vCZ!eC z&f@E0s@c#BA|$r7lvq7E;VV4lVJ|*uyhw;7-@OpWM8f`-P-}LOJm2a9jyj_$ir(z@ z<3P{rcZ+2)&lQF=F5)=^!V$@ODmR6j_ks<%mZ!7#S^u|QLwroL02VXuU5?+?943f% zet*NOnkss*?%tFclwgrMdCT>E|G|k;p#xRRxN0}<<*~LRPxT!C@9pBUV?763Pzeiv 
zldATrqEWS^->Dxxr1wFTrec2IC3UY}jnTJ3^hZgpMeVyUu%yh?i{*b|Yx0Zb-$7k`-aY-cJt;^RlbA8I6bBARhbKR?D3OOSaBXl^e@&>{Vz zkI3o4_a~w{AvP~0M?4*#O0BQP$)h$!iZXGt)jGlEb45I}chDsalS0A3?h@0w%oA>A z-#q+N8|pRcp~N?2zXygJHS}`!;mA?RUYEQsCK(GydWYGQj{+DtEolmled6OA;bYTe z1T88_-SG!(YfAmmrwMqL4Fck*uko=yi%Co*@s>i1Wngzn=JpQU6!)gSU}qz+>9C_A z@7D7=p#WgjS3?7B@*S>Ugs` z#=o%vdZ|FD3l%vWuqU7?XmmxRf|=BqxP;f%p{F#g{ege_su`zEdYYkDUF2l{#OHaX zsQo7gjkE?Rl2tC0>@VqEWy)c*(>v(oA2cYkmp!w#*5qXEW!K3aqc6`&KDDS#?dMRd zTeX(Yn-g+3l~QiSXLfkmhsocl>q1Gmj#~t{QajzdY=$KLTJ#`0&zW#8o84L#Woq#Y zJ#>J0dvRb|b0u(}xSoIGMYIdbz&abGch8 zU#3ep^4Nd>9G$1imzq`UHdSNkW_)IRB_fWxt|F*=k{iXI4)RRt{4DB~lLV3{-iWf7 z_}Zvqj!!4i)@G!e70`|-z5L@0D>R{{MV}Vb%IA?vQ;z;AQ z=i7W`AT$80Qn3o`!*}ovaPF0bLs`62r<5-y%m)-<2p@Ph85ulY&R2^Pb}b5(0tjeVDWi zjKT!tSO?cKjnV|c0aAD^j!5mZ+9%=hu1pi$=V7QqpX6E(Lfy`v0Njd&nO(Pd?GSk!9*&6R*-*ghTOiW-QKKN1EU@ z#qz#z4V5VvzH%l)?R2rzqs_hGIGT4%Bdms86#BbeDdg*9#fiHpOH?-L^8a{n-IP#bk zQ9vakY)V4@-Qxn23eQPA;yN7jTY1RwUg}$N%_@P>$s$$%K0TauaS9)P;l(&d)GX>F zp=Awj$5nB>m*8DH6H!U~IMIZ2LP^arOjHu1FUEaUI0FB9QdkC#WO$C(C(it1VrLa8 zpUJP#rhfd+DZ=nchWWXfzg?jOhWWoXRLOW}PI{32n8mq|Xesc<`2G&|?eoBSFHPx* zA2$s>;*PNlbx%EhaYlQD&TdO;(m3uTKNQkjNBP8O9P>(CAcQ#IaVA9f0ow&tfK#&jfUk{az5UbBMcGrL)c#is7wa`#Z*M_3)Nl`vPRWpv;yJ`M8e(=fjR}M za|#Z?Gqo*~X~5VbB)jZ2>dbm~6jBTp$P7p>kRUf0hI|i%nF7wj1ph@vJ@yQ;#>Wd7VHb!a+64kL{R>L$q*Z3rXdG zBp={@hZvJ^ZBZjHx|wBx&D>7E1}>tx*8tmh8StY@1xm>tLaGGD2rf^J2hbW{1=z3f zhw&4G`PW)_t-B7C9es=Poy@Di_8^5d$##J4u802u3H>t|sdgwqD+g==QsI?$P-5sd zjs*-H2(KjS?QY*b83;{k#8-R#RJ4Yqj~`D3f*Yae=qPAOW@vWVnjuB05b|lRzXiEA z{`E2$2bNXft!K4+RJk|U=SmKvPt*E7!lcxlE>?^{*bkKrA_fOX{;n;hheMts#n&N= z-&dVIUL&FhSg<}>dIlV*0qR6;FBGG8RYO^=eDxfP^QLS&QbMl47dsxZ6g~kzTmZL-t&msFmfTlrEQ}@v>Bm)>g;mWt{ zTR=;T1fQq(|F{Y2^!Da?!UvXqSF(jMq**`dRcS~La+dKTWVC4DYs2`v-Ea~Tpmc=V zB3vqV?e1>@i-K2<`0@kf$xAa(RP|;S(cuwrmW_rUv(6PGs=H0mIOoMC4MD0o15B_a zK=OSLv0Afa;_GX7E0huu%Kc1r-eju60h}kKE-Vh_CQbXA|hTLsqPI9b5tSSVppB4AiHVu^Zo5dsO#kR!fa>Ypnd7WD%w(^h~s zjquF+NRby_wo$9Y4TlZs=9W!Z18f9QM9R?wwe&N(5F%Pa2RgRUIe(zu=@`Z}cySe` z7R2Ja5Fi0|oR#2pq$=h%Qv0a9u4$0FEK%IdBf8r)a2HDH(nvJNQQfjBijnDi-ZxSB z&r{$3tn$tA`jBPz`-yR%?|-Vxc}{X_W9$K?OS7;cYJ;)(w5?2~uk>@lK=r#KBccX% z>-4_U?_k3!t0E&*x@4MgD}KEktR}cd1;Ga&&j&H$1PB*EHWXLPZG4hd2|y$JO7Y(N z3YAy^YIdKzLC@Tb!zTxX=@QUKq-A#*9k@Qi+Dl{ZnW_b@`80E*&QgNN=$N(i-6_4C zUrdndDbRAw@EbcMwV5#RUJw30bjeHVDVYJ1FC^(u(?FlpK{mDds#qy=z?GA|gk>(7lofY&4OdiedoOL6zW zT`1Zc=~QRd{|*73tI}a$kAi_}4TQK#iXD=AQ2psXe261qReu9vp_tCf!-QDO9p(hDq%-?xXD|xEiBx#dJfX*AX&~ZF( zdRVb^wsbdEOLCt1Ts)%%K0hNe#y%01TQ-k&A+_L)@3hT?OUX@&Sumr`lkwF#zv~+z z?n?14T5>Vgh|A{a=p>=6?nc9|wskB?rFsu>FGxi`3=|QRSUW^lYv_7?V9gQ#P;0rxbWadYE<*`KpwbqWL)68fOB%6la|7 zzhoWrm{)I>)|?bU=dI@=OflrFV4gmsxvlWMDD1K-tF8;!1dfTo8ZMj{k@wi8229>R zyJ4rJKl-BSdYuE*T3uWFh5^fqj+vlzZ(e z7`N@TH}>PG5B46t^}DCVr>)@-8M=8RS2E1uK<&MeL6<(M(;Y`E&!^-F5j6RX`v!7S zik-Tdt?(I6rt_8BNC2mJ8ac+0g6A0tsoju0mV6Qm8%Yc;4Vh{jV9)Qp@Vz0`L_8yt zCx;d5RtZC{`EYWD5YD6miDMbtDpwa20-={)2vpP&~VjMj^cQjqhtD z@ZrDMFvE0z8Nyg}cWV8K&P6jt}8xGqn?zy_clX(^VdKaviu^`5nuP z>6ur6Zl_QJAx~AobPh-9(bJY;mgens2n-PiEf;v}kd0Vy3k)g2v^`??ZX`^YXar_H z3p2)x9X>0P(k~Z$ZAp5!##X&J1Y7cHK@j@d{D{q90;7Xpo7x@wEYm2x9-w+x?k!#J zjs-Mv9oamx-(H8kgCg=eHsM{=|L>pvs_mGIMX6SuYEQ7!{Xow;;|hEcI4md4Za9K(%p}?joOZ5 z->4GqOlV}LIQ!QTIwAodYreH@`Oml1ivjD^EL_Ozw|Rd<{{O3ok@x;` z3%J66>_2fs(={=we|_k&Zs_8Fiwr>pZuTKk#tef4NX#+;~%VSGRPrQ}wNv);VjBkF1+) zVFt$CgF93k<-byD-#Wv;5@*<3}#OzG&h;{WkjS#_BaDToWN>R%~FA>!BK3}*ek6+=@6rc))cyz=jq zlfuNvD9#A~b@XH5`B{~XWJbr!$tN6;83(6%)baBwogtQ2XQ?M53Ae+jP?OdBgD{^ zS581`FkG9^yYfuP5Jkc%5S9$3@LFHH2P}5ZL$Fi|gdn4kgFVFCJj_}4ZF|1NI0)kM 
z5Xh`YKsiuA6+m^%dO@=LC*2temM~vfm#!Rde3yg3QK*I3|2?e*Z(sOI9sM%|P~Aq? zfg@4dI_omsU?cS%Tm>&cov_L0q(TJDbr!%KK|BO1nFOhjZ||bVYn4&uKoyuwEe8N6 zKE!?B3-*aNDT0NdQhk2PX;MriOvMn17;iZw&kbn%Ui>E{TD;>vQVzQDJxD>U7G*Uz z38-}pmoE4KxU9?U5yZ)eh9UQ#by3k^xUtG43k32*2;n>QrW|!b?6&dOXQgbwB|K;< zv4q_mM79AHlm$22Kj?S7x)c%w0rF0`MgWP#1BQMNS1^2rgZwQ|rcUFTcgjJaNP-a5 zGJwQ_(wITkFf}CQS+nl-$Hx)QaJ(RlWhY} zb%+%>3F4&#VYI}GXe=Vbsv(43#Pr0L4dfMU3Gbjkd|6XFLU-?K5bLGOL8*Nte@~lz z_~UN&-Vo>!0{Xq-u*Q4&Q=7uZ3&*DrhWk~1F)_w_T=H{7w2vHVa58dF{HQ{210dUW z*<)M<%K8wlzJUBNQ}Ks)e<##iFAYd<=VWCLouj{I*sfw=sF+~qlaU8aapeneA~^CV zoM~|QKLg(I9&*nJ54AsOgR4C_aW4fMGd8NmNGcbH_|_ zimgBIabp~IVVXKAR4Wf-95pw0qt&L0{d^)kz=cmgTUJq#&auFAy6-(~qeN%H^Sx`U zNK^}oQBE_CjCiq7)03=HBfhaqCEjlTB)1qO@ZT&3hrHH|?k6q#R_Etn5O()1L=)~< zlfCtbV$VpWyBklz0Ges*o;%b~T5l9&c7f&R1#Ht$sJv}WMi&cWcBHaVE!3GGi=AgZ@;`P8V z{p(N=V&RDz6-bdbp-TT5Iia0R3606jf}a?h|NEKhry3s3;b z-H8M@JiED~Kf8t|4m5d&%z^;yPhFKm@!SmgL>Xbb2$9O)W0e}p5<}jY@zxt2ZY^P1 z^=L2?ic?p8*=t0dWbNVHfjmilp>e2gfE4?nKn*jY^=|nu=eh_#D+Y{O2Q!Mi%>>wd zlao57ioE)ddSVLOTwHyrDk$%XYqgNS=AFW#bsbwK8Gd*L{XQ#TJ=jb;hDsDldl<%- zfrHEl!l(kYX$yA!WQER|lXJq$)W``uQ8xSM+ zRQRJ097Wf|i-9mv0ery>_#!M7tSlW@=^}H3l(=xC`ggccq>^)vE!%=bjYujcIjk?v z6M-0-nxP`H%XwG$BfF1&{XmpswtNI2)@$e-0d?is>9YQRN0d8ePgj$N0M9T&f$lCP z;hQ+9XgA1%@X`tm{xruK|q(DVlfkP?|gdN?VOF)@swJ_(g*;I}P(Tr01VC&q23 zpEwR%1IX`$8LG0G25<^8zsl=##`m1W4)Ze_)JCx;*;C}ce_^Z9mb^K(Ox>jY2o5j$ z?6M{9d*FV6)=msWQvRdezlN6CYPxH5`dr-}JT@J!;UT=W#CKQy3Y%-Jy2FcGXgWn+ z`_5#k@&S|5LJ(Le2cDiHM-xsf*fhwUFe0-y;ujFDV>lX##7>pEf#=jAQ|P8U zqt39`NWCHW=zu#7P8A!(+ERTxx@3XZO+15}?>(90u4Y;Q@Yn2GlQACKI>GxFS$RmW z&v$Q|_ou|h*gT0U;Y|{oGU2t#wTDr+MkI@x7bhS#4)1Z*z5r362Ck=-D)bBCKA@T=A%j7oH+ z0hRnNMs*w#*gzC@Ru7l|LR!sp@O&TbJ3W7)(*v+duZLdE{eAvN1Ul6XkeM8JvLzcj z8Sp)?^Z5Ck^kI+6)7!Jr1kp`j^%HBfsNfy7qA?&q_@3zz@t+u;J%hUGD@S~PVh-HE)kQ3sW_rH@_q5M$^ zG@F2XANhd{%pYnNfgAo1?CyKGp$V}*+eZ*+1+ohgT>p&61ziZc0uU*IqbwFQP2FB4 z*Q~@@YzkGs1hzO(%?^(EBs=s=7`Xp*hGF`3pQ?ER7D4CBz>o5%zb%JABbXl^gGvbz z2-I+{+ofyogTYM%9sy39P2m2Ggmm1VJFwOidLKoKKN&N2zjI4eSuQL7yX4nb0MjFW z_9+vd6$OnpA_&KWtKI-D3&G82&zusa%$RL1tLAH>Oi^{Vqc6tz!n{bY1NK+Kq(C5^ z7*mYjYb0y{P^4*F`<~MQhgOgAru$2zYFEDdeJ-v3GOC0r_76GKFEIt>@z0U&pX^anKvRhK*@dP)oAuxSFpOQ+~6r7x19@Cv&N)@8{$_y{d| zMCXN~8K=Z9#+QW*q7kb>ATk8Uk{C-2b^~B@1f-njKV>6`e|9lN*+ljWoLk7fYv4dE zLri3vPm1p%R2l-7PfIW_7M;e})9aaX83^sW?7H{|@cLCx$K~x|5w)2iv5ZHnDKCU1 zkWnb}jUhZmwgG%QL>wgDGoH3j8n&t2v%nF(02anSYMC+25WDxy=_>^l2(mG}eKF|( zyEiIc+w?s0yDeQ1mgYjk69PO9z&KQ`#RBZ}Z~>V^u06$}9z$*dzuMS2a2kI*#5@9P zP=xFkfcoX2?#?2hk~wm$!{Zch>PGcD~@RoP8%* z3aG`oV=1ee^W-x!z0;jRrMia4xZ^ZjlqWyr0MjVA z^3nb|sr7E2?lVa|(;6@Q_x8s4lN>=X3KwTrL3{S-VM-|?(4|53j$LoJPwdCifVj1e zPz4eI4+|rv0t#6<91wqt*&ql^FUH>u$wS*dd3uL;kowaMd2O-{+IP#oIX~yx>en%r-UM;I4ac_8pW%LApRSP9DDtWo`E66M@ zPAlN&*+fzQncjql@5)Gng()?#44!s9`xDUSQimH7RWrTl=7rro^g@4- zD=d|?=a(}qJm$CXMJjA12kfV2o&jm#?$#EIf)h>fYa!m5yHvT#yV4#fquHN(_%` zuw&vysU_==8{lE#nmh{EcP2D4tg59+ZfT7x$5szBARs%-eU`I$ftXXbvba`nUmL&w zVWg;QQyf|)>r1NhGr`Y~JBPcxKM%i>5AOEcoa^}7aZ6@NHg`$K%Ia;q=LZU2lb)l8 z{7jdX%`xg2?~7dX{6(H0T0EI7h=E<_V^h{;vBAm5muV-&#-?gzpB8vuDprNW40HtZlfBHg1)8+_TT6$f$%t(u-ZN{GjgnM*Mqd;Ro{8n33AYuk;ySDOmm_mHhFzAR`yr5c{XeZLycH^Zm9{j|&K0KZuWCl@>+5a@T;0M#weG4Oj;ZM&SU^dJYrd9qldB_h_ z{8c1$o8`}-P=whHiRb@r@jiBetiRVl8rvm{!_NR#TqmoH+}+(PbPB7Cmy$y)ms2E( zbqzz&dgHN7yzpX-T|JO5k0@_5vcRifT3NYJBrCf*h-&e;n3 z`e~x&!VeFKXT>P1?v*|4Iv}nRuL#@jz4{HD!?OU?)P~@3SKi#e=I{T#)$Mgc8jY9> z89DhIpjN^x&zX>fL=yXEEV!&CYyHF~m9_U-$vV--G?!L!3F(O%Mu&@uiHQ^4A>_C* z$Oi{PJLYZfXFV1aUe#HaAA${|m-Hw;oTrjm5JVGgJ%HKyd;#MGk%EFm7+7MBBDsu8 zh`tU8UGAHxvwTgZSf-Y^Jd)4xoN;jZj&?p|rgVQ47k;mL0>cR1{&aoxd}Ro!;!o^1 
zxl$_uMwhI9Vi(>jza9|tZ0qq&yjS4j!a=ZDn*rNb1D9_+5O`J4duk%h*Z0}ilaAe(ixj z;qQW&I@lH>(G1WS9BjRNf>bwcy!=dGUq5^b=MvS0$am(xe)W7G(w;q24c=Ru^l}V& z5_S6H$B$5H&7jkFQDbQr`7d=5G%VGtEK!bb8XXx||hI+nc8 zvP^qaAm!}UFnkX4ALkT9$}os>gQQpZuFw}n`yFd;+lB}AZ@Z-i_)7?o@nf1 zs3Xx{p$s!DVR_v3ndWCyzPrmK%8A{k%?^G1&0dvx8Rys<97giQ-Das~Y+PKb=6V^x z9f8Afy2aVRd1XkTJ7efRbU40Bo13Nh&WVdelLeHeO0a&z@z&?N=cmWv@|Xp&D#KLs z;XZ!Wp;AuS1|;;96pa zHn-Q}v-@|GL-kP>iiR%I*jTVt+7m+OB*k4Os*HFHjl5q$i3HLrf~9fXE+7YY#!Bee zl4+56xPvbz+s?;4`NMaFusYJ_(|GgZiBw->?h)&PNz*IrU?hQIT-ZS^uSMBBtZ>{E z=DGCH(#1l@da*xMEfZ#G@cm-gBoYB19zM^N{YB4R?UZpb8Y2;wRL!`Zsq&j9`=Oa; z$J%nrwtP`%eb{-$ot$4HPOL;lJI-7mGQzqJCynkCQNP?=TVyOb8Sl1km*gcRB#aiy zoI0y1XZ^A}B&1`UH(+UF8A3kjS~Vt=8Q1_9A?-&IT|i0>SETZaExyYM{&`@EJM$$O zX?n$sqFzYN@$D4?x|$9x{j8n`8NG}R-YYu(4|NG#VVZNEj67I&Bz$7%K#AhU^N2TH znqd-$SMA}}ZFbNuTmAC!>h@MR?IMGI?4RyPrvF8;Ft@dJh}^T? zw<_*FN9B^UTu1BP&V(!w5~uOfn|ktx4=#Jx!i8pJ_+cf=QMX3{=46@}(EoV!N58q+ShaJal$iib>Bj}sT7{y#_K?$9( zbcmA#6CL&$y~x$JzV7M9`Z2O@9npF=znw>T)u0b zxjmUYR$YoexJ{tx3!+!;5z28to&{%>S!30D0 z0dXJubG=y{*hHONEY>i|zmMh~Q&>rkLC*Al4(>4>SO))1{({l8Gv)zV10XSh#F)lu zww?}~?AwDIfXBgBNlWOelH=mybUQ5sRs>xADnh-V zSRVC$8Rg~ZlWL$`JgLp8*u&gIu7ZdC4?aN<={%-7(a&Hk!QkVOLy+PBAy|#oymbbk z+Gm_YV4L1HJ3a^R1rZp1P8@yC0;|%p!^_^&a~o{^%a<=jL_}o#HVX<1HylY<`sA*@ z?RxxJ$@vKKAU@7CS9sHp3!}Kf@>dDs^@fHAQPkNjU(Ev2XMze={$`M3Vu`H4F^L zCJbvwvnHm{G1sw6nw$-TNemAUSB3%~Ttu1W?r95a5Ok$CyV7dNSrmWVPj0c$zfMN?B#LvN(7Z&~Z< zrOw%B-EM$hl6^u|v%_cKi!4;daxE`s=&+kW+hTQpNv*dJ#Hv2iv@i2IYbat4>Nl;wCD<;+(|?;XI+h^G{g zgJTjj4$K!&{dpP7R>6$Lg(oWNeGSxlzy$PsMR{d6!x<}fWh#Squt{$JAvu_Kn(YKP4$yok0akts}5z1d#J$UGoqF3hlrkpcjSR{0iJtKbNG-#kOkV~o!rcld4lg9DEII=Rzy!#mI_5l=gKdF_BQ$|(aNh;##r4cm#e zhtDCAeTb$iP-!GkWwxnaBb|L`rKUUu9KNlhP*yhByjc>C_8a-n<$j#s6dp3mVL6wS z5xuzRTVY}G3Ln>rZfqWM0v~nGz_|><806@Hmx92*%g=G}0^PI;jVcp((HaV8e9{93`(lEjD2P0e`Ieo&nWq*OEgejWy&p&7 zSkcrlUv@^&o4Q>+Ys}88PN;vMn@fF%y^~1SG}L^U`#C`)6S#rnIQ;?lL>xzJ^va^> z#Qk>`(>X4v`N`g6AxbeKd@|tz3%Bv2Trkxg8u+3GIMs0%blbQ3lIR0|gE+gJ@x@DI z%%P#1sGY}6_=;~!GmpZP*m!uV-Cnmgt;KaUFlRR{{s@4VqH0$7wj#5Xs#~@+F)@+T zxuAfy0E^-iglkJ!e0Ba|Bg1eiZ0wTIt~6$bAua)dZ2SEzUQ-embgkLFG_>CSpr4P@ zWQnW;Mu#8W@^n*^@MQskv}irs>;nYMg(Srg!3p#2R;Y!T=$TLv`F6?-hi8ya?GIk; z7%o{K3**(JNNEdxhC0(`wSCPqAwalBSE)&%*1)r_l0T;s14}kgfbo(+Mk3&y{&QsA z0<~YX-dc)yk@=t5hIy!NiN3Xu4SpiW4%2e6n|7}75~x^GEio}M6?t@HLiu4c-5)0N zM2LZToSf%bWE2#euiKu7Ge4&v{3@GJkv^cuD=F@d$l7;)+G|BYalGxm~#oFPj`dl!@Y z_t~OxgjUlS`Ec$I<}1wh@YMq{BIqn5QK150W;hJMh*Pu^H1j@|@OfGwhM~^QApHi>87jQ#=!o@>kEa_Mid@?&pVg_Y(`N_rt3EVg zxe%^t_4fX?E0=?IB$e+;JeG`j=OGVMd|%|p+7W4SaOCE2!R~`9f%@Rc(dYVIXDhE( z21hoIg#G5qrWd6&WiZ@@u1MIU6Oh+Jm=Z}I;7w`-YL97?S!9;}oBRdq70SkJl(^jB zW;wACMJHNPVm-~UgC3Z&8O8r$%KinC|GP|?8D2(qHd4(7PVgHLX}j<9v2@P_`wRw} zpGVis?7r91;QROQ3uXN7R8%-RIQVYPNy*6kXgZ~j0j3FCKp~v!?YZU$h%_F894v}{ z8?#2uV>px;=u%=ypeZWB2ouoX*k7L+d}ahz@4_uPD6-`B@{Tr7_AV_n5u*t_5jTH1 z&+|f=uWK7k>F=H}+c#l;05%T6>4{>jWVGB$1v__~Ql$v>PUbQ!3iE3fKc zGuF{R7{@|kf71i5=B)6iQ-o(S@`uh)<39oP(}02pi8MU{Lcyroc;M(TJ}z#NznS$W z*r*g`=<8E?Y#W>z;4HI&z3jqI;qMHATm?i9B6iCd062j|0yTqg%)&K6E%cR+T zA7?PYBXmIDwno-+R^&w2 z@=#5CN=5q~XvHxVMmDzckfUEDu9b^dfr9MQy`DM~%LOH^FCf86K&giAfzON1g5Yij z&W-ON!XKPp+o23tdT^+wa1MMxddH0)Igpyfo?HozE27Go&)&fUZk}f=J^tt>%{*{& zSpt{%19o?JmF-hXlFAR8P!Q@>1!$7QAkQymU|@g=I@7CJ@)P!Ani_OVQg0w zP%TH`H5oZ_grk4cFmL0?d7=mzz=ZC>Ej!g}i8Sx3ng}t_vV4qpV=a1-yPNtFX*uE- zU`WmvN>9FI@pbe_`%XRc4yhmi2(GMnsQ(SX`#`2D&L;@Pu`2pF-dLS9hPN=6BAuQd zNat8uZsN`QfyD8Fb_+ZEk{pH2$;$ZriI~fT`omkSSIenvU|oLNXqyLU->Rf{gPClg zYe~2Se+eSwL2%0QT~6B@SkhMXA3e9ZX5!lz8=3?stn}!tpU}`b1K;&&CLW7J;ASVU z;rAv&r6@f0DX;bE#=#M)uM`k6mZBJ}8zwpW3`?qiBglCL&^Z2VUTTI`6F;SozY!ZJ 
z@y&ma+xh5&uiZzMn{1Yl0k$TM#``V=s0Iec*FMd=MVcvBv--kOpvB@pSFvY^AGtBR8^s@f0Kpg zqKyizqsFhQ?0OOiMDoB$rT^&9G|!UABV>@{Wst40=)a2oLcV}oD&tMLJ0IiazW^92 zy?80#3^?*cwi~8I+aLu?eLOd3(UHz6kA+XQ1-C*@=<}tGY{@T(Tvj3cmQ{JN{3)Z@1G z&9{=z*e*V57gJ}bCfgH{lqlNtKr9uNMcn~hHi zvHvvZ`T8c4>_ZN6chJ)cPEIcwg*^=VapRhk97DF9E7SpPB9-33&aS^yeB*Unax3N%Sw1WY=31;zAN*Az$iCe zbQgRBjOC8S&khpSdB6DlbllPu$aL$3wun3UMy$$00)E` z<_G~_efqO8%xK2qBxqXNn^=!^N2fH9i^y-_yLtWJGojlTDeT`lQepljv`E+FeAH zWB~vvNMh`M4+DBZLzCBN$9dJ{f`~w6xjpO)yerxGPEOqDhZ589LG_l&L`bz?oWcIb0P{zxsh}-nO z&)xNr!NIiWo2suw?>)iokH$&(+5PRQz+(PT)BBu2-wO{P^IpLj2s$b)W!_D%KuRIn z8*MZsHj-jYVqTA*DWs0u)xNK*nBSITdP;~a^Lq<=Pg{XPr2?HT}LOwI2x#4Ot%DGj zDuxmnk(I#uwIM*jyUk=*3B_p!XED7#6}Pk)T4|(Weg|fi@~Q`AavKsgZmcFY@4E*4 zFf6&DA%!Lr6&;-39jZIBwfEr5-F=Cam==601IL7x=?IE>eWU61rLf(IjBR^-n&vv03F?0$4 z2>{c}UT>(ZRNoZV(w{C-KIQBK!BEI}@JOijrh>($TaJ5?=#3QK#1XCD{~{EuyUC?x z)eiio**zDx`G-FN;nvcP{QCK0cV{PwPRa{OorM(k$?A*sT@pL<+bUH405iS$?);(_Q|}cF|s2KtS@&l#pr0c2zLTj}}NZJHKTA{wM_LJU;9L zp<+)&t+9~z8}UAr0mLhc{TuNDKS9;qbNGGt)`M@iw#Qb0^ioSc69h`0q4-5$8)LeI z%oKXAVZ62V4Z_7nmVAuTzi`$|0l;?nnO65V(q#pdxe*7``rjZ2JDFNUEssXZ>$b(q zjThxZa6%?zfmhs}>|FVZSrE-@dGF(SuHevy=vltG1#DhCD%360qSS{5&m z^enx-{4AN_>UgOt>YyACT|fb_I6q z8?ZD4c!%<0n)XMKdzp|FQb`Ymw*@JgEtCRQ7Tv)?N4o98FMfRPMuO&bKH3%7lH5H? zGf6v9u5(%Cy1@L~>&+Dhoxdgb;8J(6qMxdXlU5}0zUn;$A9}kRvKwu+@io)=SnTlE z2Ht#bg6jxjds6oE*k&6|H%C>k&(zV^TEA+?#luT%%RywVa*(_s-sgHTQann<;1}5f zMYH@$p&mi7RuMS(#*w4_ssapl`6C-rUp#{u$4_lU&e2OJF=i5`Dw2aW-lx z2zzzrmC~a#NFcW0B3+7i2bjQddIh(6Y!CPA*Q%B>Gk{31>gqZKv9NiQokpcitIMP& ziONjC>R5^HHO@BGo_3QfS_ZJ_^bs2H-=ID=2jj)l5cS{^8z$VutaS_C;FFM$aKgkz zg0)+)I^P9NBazWcwv@s`K8!**uR+a6NR!*u*$UujhRsjD9*PzhU*km7Gmn8jiu!6J z0L!^JEA;zeXqWnx;Gqs6va=dS-_MbSBk;;4rN7TQG2%2D%C8MuFr2{9m3JVkN!Puw8Uj!9ZYv>;@e9 zq2b|{Ue=)U^#~gUA*?Eko=jwt+?lP1FT{fy_`%Sgg zQlF+sOpIzG+Xu%@Q%eix0K_n_sxJzdMC)bGP}xib$@qsNbeR&zN^~nZOr4LT@Wc^^ z!vD<<>dx!_#J-$=^0&~Xalkvn7v^Z{-^lK7RN!F@kyF!4X2-gx7C5c{#D4$({uk;~ z{I&v^u7MeEv{-C;eQ`Y8r`hg$$qYA{H%A7+Za+~3?+H7LBl4{Lk+ zgE|j3nsg!L(j!AoWyYA_YN-_4xZfa!j#RMD`tITL`U7HVaVb$)OY=A$n5nyepa_Tq zNMM<1NlyQRwUEyl6&LCL{?@J%iryw65I@E|AX$?-_4hY^gu#i7IQh1y4`UNQ{-0m5TCmRg+v7INn0J zAkeG7h4g1k0f-YhDp4oh$S{7Bh6V8SV;h7gk*^y7Ol8MJLV5r8`yr`%Rb{0D3i*{W z6$SC87nZI^hd|0-g*{Dr(4bvgo4_%Uny9a`lv<{Nu!|jDSfNK5rDJjH2T(gZJ3GL9j04Ml?bDP|EzPU>BOR615*#~D?CX~p>p~Q} zosgiiZjQ#?Z)yPAl8Ddc~$ zTJ^mBi=L_LBQ*PFm`S05p~iQDENt2!s|4L^5kYKXX5E5;TnN5lILkSm}NWB53 zixgkS4|&L0BoHk=_00WkSI2{ylC#_6xG3yZUWM?=-F0HXTHj1Ue+egHe` zTaZJ5yl8yrfrJN0bCd=_m^sY@YbxNZGjd$~-X>!pJ=FWbWp@JUG;}x=LPM-E`!`G_D*sx(|)c zUSK-;rwQ~rIE*c?(x;9a-9YB*mJ-%Ky?_T#gRkfGZ;0x5JF{>MyU5?f|FgS^8ZN@& zVRgam*wt&HRG$h)|Net91H@l=-V+PUsY?J}`tx%k7C6i&(Lj_N;&BNUfi=dY(1(R7 zM~qmBtpQ!?V=-;00xQT?5>#TPkKZ5})=QA+4F0i>B?fG)5DDAb@k`6WN%&>lPQ>uf zxG*ACT0NsO;`hM9k8kSG;dALDx_@N7Vd~H-i(V Date: Tue, 16 Feb 2016 17:56:05 -0800 Subject: [PATCH 005/210] ARROW-4: This provides an partial C++11 implementation of the Apache Arrow data structures along with a cmake-based build system. The codebase generally follows Google C++ style guide, but more cleaning to be more conforming is needed. It uses googletest for unit testing. Feature-wise, this patch includes: * A small logical data type object model * Immutable array accessor containers for fixed-width primitive and list types * A String array container implemented as a List * Builder classes for the primitive arrays and list types * A simple memory management model using immutable and immutable buffers and C++ RAII idioms * Modest unit test coverage for the above features. 
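To make the memory-management model above concrete, here is a self-contained toy sketch of the builder-to-immutable-array flow it describes. All class and method names here (Buffer, Int32Array, Int32Builder, Append, Finish) are illustrative stand-ins, not the API in the builder.h and array.h files this patch adds:

    // Toy illustration only -- these names are stand-ins, not the classes in
    // this patch.  Shows the flow: a mutable builder accumulates values, then
    // hands them off to an immutable, reference-counted buffer that the array
    // accessor reads through.
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <memory>
    #include <vector>

    // Immutable byte buffer; RAII owns and releases the memory.
    class Buffer {
     public:
      explicit Buffer(std::vector<uint8_t> data) : data_(std::move(data)) {}
      const uint8_t* data() const { return data_.data(); }

     private:
      const std::vector<uint8_t> data_;
    };

    // Read-only accessor over a finished int32 array.
    class Int32Array {
     public:
      Int32Array(std::shared_ptr<Buffer> values, int64_t length)
          : values_(std::move(values)), length_(length) {}

      int32_t Value(int64_t i) const {
        int32_t v;
        std::memcpy(&v, values_->data() + i * sizeof(int32_t), sizeof(v));
        return v;
      }
      int64_t length() const { return length_; }

     private:
      std::shared_ptr<Buffer> values_;
      int64_t length_;
    };

    // Mutable builder; Finish() freezes its bytes into an immutable array.
    class Int32Builder {
     public:
      void Append(int32_t v) {
        const uint8_t* p = reinterpret_cast<const uint8_t*>(&v);
        bytes_.insert(bytes_.end(), p, p + sizeof(v));
        ++length_;
      }
      Int32Array Finish() {
        return Int32Array(std::make_shared<Buffer>(std::move(bytes_)), length_);
      }

     private:
      std::vector<uint8_t> bytes_;
      int64_t length_ = 0;
    };

    int main() {
      Int32Builder builder;
      for (int v : {1, 2, 3}) builder.Append(v);
      Int32Array arr = builder.Finish();
      std::cout << arr.length() << " " << arr.Value(2) << std::endl;  // 3 3
      return 0;
    }

The point of the design is the handoff: the builder owns the only mutable buffer, and Finish freezes it behind a shared, immutable accessor, so readers never observe mutation and the memory is released when the last reference drops.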
---
 cpp/.gitignore                             |   21 +
 cpp/CMakeLists.txt                         |  483 ++
 cpp/LICENSE.txt                            |  202 +
 cpp/README.md                              |   48 +
 cpp/build-support/asan_symbolize.py        |  360 ++
 cpp/build-support/bootstrap_toolchain.py   |  114 +
 cpp/build-support/cpplint.py               | 6323 +++++++++++++++++++++
 cpp/build-support/run-test.sh              |  195 +
 cpp/build-support/stacktrace_addr2line.pl  |   92 +
 cpp/cmake_modules/CompilerInfo.cmake       |   46 +
 cpp/cmake_modules/FindGPerf.cmake          |   69 +
 cpp/cmake_modules/FindGTest.cmake          |   91 +
 cpp/cmake_modules/FindParquet.cmake        |   80 +
 cpp/cmake_modules/san-config.cmake         |   92 +
 cpp/setup_build_env.sh                     |   12 +
 cpp/src/arrow/CMakeLists.txt               |   33 +
 cpp/src/arrow/api.h                        |   21 +
 cpp/src/arrow/array-test.cc                |   92 +
 cpp/src/arrow/array.cc                     |   44 +
 cpp/src/arrow/array.h                      |   79 +
 cpp/src/arrow/builder.cc                   |   63 +
 cpp/src/arrow/builder.h                    |  101 +
 cpp/src/arrow/field-test.cc                |   38 +
 cpp/src/arrow/field.h                      |   48 +
 cpp/src/arrow/parquet/CMakeLists.txt       |   35 +
 cpp/src/arrow/test-util.h                  |   97 +
 cpp/src/arrow/type.cc                      |   22 +
 cpp/src/arrow/type.h                       |  180 +
 cpp/src/arrow/types/CMakeLists.txt         |   63 +
 cpp/src/arrow/types/binary.h               |   33 +
 cpp/src/arrow/types/boolean.h              |   35 +
 cpp/src/arrow/types/collection.h           |   45 +
 cpp/src/arrow/types/construct.cc           |   88 +
 cpp/src/arrow/types/construct.h            |   32 +
 cpp/src/arrow/types/datetime.h             |   79 +
 cpp/src/arrow/types/decimal.h              |   32 +
 cpp/src/arrow/types/floating.cc            |   22 +
 cpp/src/arrow/types/floating.h             |   43 +
 cpp/src/arrow/types/integer.cc             |   22 +
 cpp/src/arrow/types/integer.h              |   88 +
 cpp/src/arrow/types/json.cc                |   42 +
 cpp/src/arrow/types/json.h                 |   38 +
 cpp/src/arrow/types/list-test.cc           |  166 +
 cpp/src/arrow/types/list.cc                |   31 +
 cpp/src/arrow/types/list.h                 |  206 +
 cpp/src/arrow/types/null.h                 |   34 +
 cpp/src/arrow/types/primitive-test.cc      |  345 ++
 cpp/src/arrow/types/primitive.cc           |   50 +
 cpp/src/arrow/types/primitive.h            |  240 +
 cpp/src/arrow/types/string-test.cc         |  242 +
 cpp/src/arrow/types/string.cc              |   40 +
 cpp/src/arrow/types/string.h               |  181 +
 cpp/src/arrow/types/struct-test.cc         |   61 +
 cpp/src/arrow/types/struct.cc              |   38 +
 cpp/src/arrow/types/struct.h               |   51 +
 cpp/src/arrow/types/test-common.h          |   50 +
 cpp/src/arrow/types/union.cc               |   49 +
 cpp/src/arrow/types/union.h                |   86 +
 cpp/src/arrow/util/CMakeLists.txt          |   81 +
 cpp/src/arrow/util/bit-util-test.cc        |   44 +
 cpp/src/arrow/util/bit-util.cc             |   46 +
 cpp/src/arrow/util/bit-util.h              |   68 +
 cpp/src/arrow/util/buffer-test.cc          |   58 +
 cpp/src/arrow/util/buffer.cc               |   53 +
 cpp/src/arrow/util/buffer.h                |  133 +
 cpp/src/arrow/util/macros.h                |   26 +
 cpp/src/arrow/util/random.h                |  128 +
 cpp/src/arrow/util/status.cc               |   38 +
 cpp/src/arrow/util/status.h                |  152 +
 cpp/src/arrow/util/test_main.cc            |   26 +
 cpp/thirdparty/build_thirdparty.sh         |   62 +
 cpp/thirdparty/download_thirdparty.sh      |   20 +
 cpp/thirdparty/versions.sh                 |    3 +
 73 files changed, 12551 insertions(+)
 create mode 100644 cpp/.gitignore
 create mode 100644 cpp/CMakeLists.txt
 create mode 100644 cpp/LICENSE.txt
 create mode 100644 cpp/README.md
 create mode 100755 cpp/build-support/asan_symbolize.py
 create mode 100755 cpp/build-support/bootstrap_toolchain.py
 create mode 100755 cpp/build-support/cpplint.py
 create mode 100755 cpp/build-support/run-test.sh
 create mode 100755 cpp/build-support/stacktrace_addr2line.pl
 create mode 100644 cpp/cmake_modules/CompilerInfo.cmake
 create mode 100644 cpp/cmake_modules/FindGPerf.cmake
 create mode 100644 cpp/cmake_modules/FindGTest.cmake
 create mode 100644 cpp/cmake_modules/FindParquet.cmake
 create mode 100644 cpp/cmake_modules/san-config.cmake
 create mode 100755 cpp/setup_build_env.sh
 create mode 100644 cpp/src/arrow/CMakeLists.txt
cpp/src/arrow/api.h create mode 100644 cpp/src/arrow/array-test.cc create mode 100644 cpp/src/arrow/array.cc create mode 100644 cpp/src/arrow/array.h create mode 100644 cpp/src/arrow/builder.cc create mode 100644 cpp/src/arrow/builder.h create mode 100644 cpp/src/arrow/field-test.cc create mode 100644 cpp/src/arrow/field.h create mode 100644 cpp/src/arrow/parquet/CMakeLists.txt create mode 100644 cpp/src/arrow/test-util.h create mode 100644 cpp/src/arrow/type.cc create mode 100644 cpp/src/arrow/type.h create mode 100644 cpp/src/arrow/types/CMakeLists.txt create mode 100644 cpp/src/arrow/types/binary.h create mode 100644 cpp/src/arrow/types/boolean.h create mode 100644 cpp/src/arrow/types/collection.h create mode 100644 cpp/src/arrow/types/construct.cc create mode 100644 cpp/src/arrow/types/construct.h create mode 100644 cpp/src/arrow/types/datetime.h create mode 100644 cpp/src/arrow/types/decimal.h create mode 100644 cpp/src/arrow/types/floating.cc create mode 100644 cpp/src/arrow/types/floating.h create mode 100644 cpp/src/arrow/types/integer.cc create mode 100644 cpp/src/arrow/types/integer.h create mode 100644 cpp/src/arrow/types/json.cc create mode 100644 cpp/src/arrow/types/json.h create mode 100644 cpp/src/arrow/types/list-test.cc create mode 100644 cpp/src/arrow/types/list.cc create mode 100644 cpp/src/arrow/types/list.h create mode 100644 cpp/src/arrow/types/null.h create mode 100644 cpp/src/arrow/types/primitive-test.cc create mode 100644 cpp/src/arrow/types/primitive.cc create mode 100644 cpp/src/arrow/types/primitive.h create mode 100644 cpp/src/arrow/types/string-test.cc create mode 100644 cpp/src/arrow/types/string.cc create mode 100644 cpp/src/arrow/types/string.h create mode 100644 cpp/src/arrow/types/struct-test.cc create mode 100644 cpp/src/arrow/types/struct.cc create mode 100644 cpp/src/arrow/types/struct.h create mode 100644 cpp/src/arrow/types/test-common.h create mode 100644 cpp/src/arrow/types/union.cc create mode 100644 cpp/src/arrow/types/union.h create mode 100644 cpp/src/arrow/util/CMakeLists.txt create mode 100644 cpp/src/arrow/util/bit-util-test.cc create mode 100644 cpp/src/arrow/util/bit-util.cc create mode 100644 cpp/src/arrow/util/bit-util.h create mode 100644 cpp/src/arrow/util/buffer-test.cc create mode 100644 cpp/src/arrow/util/buffer.cc create mode 100644 cpp/src/arrow/util/buffer.h create mode 100644 cpp/src/arrow/util/macros.h create mode 100644 cpp/src/arrow/util/random.h create mode 100644 cpp/src/arrow/util/status.cc create mode 100644 cpp/src/arrow/util/status.h create mode 100644 cpp/src/arrow/util/test_main.cc create mode 100755 cpp/thirdparty/build_thirdparty.sh create mode 100755 cpp/thirdparty/download_thirdparty.sh create mode 100755 cpp/thirdparty/versions.sh diff --git a/cpp/.gitignore b/cpp/.gitignore new file mode 100644 index 00000000000..ab30247d493 --- /dev/null +++ b/cpp/.gitignore @@ -0,0 +1,21 @@ +thirdparty/ +CMakeFiles/ +CMakeCache.txt +CTestTestfile.cmake +Makefile +cmake_install.cmake +build/ +Testing/ + +######################################### +# Editor temporary/working/backup files # +.#* +*\#*\# +[#]*# +*~ +*$ +*.bak +*flymake* +*.kdev4 +*.log +*.swp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt new file mode 100644 index 00000000000..90e55dfddbf --- /dev/null +++ b/cpp/CMakeLists.txt @@ -0,0 +1,483 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+cmake_minimum_required(VERSION 2.7)
+project(arrow)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules")
+
+include(CMakeParseArguments)
+
+set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")
+set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")
+
+# Allow "make install" to not depend on all targets.
+#
+# Must be declared in the top-level CMakeLists.txt.
+set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true)
+
+# Generate a Clang compile_commands.json "compilation database" file for use
+# with various development tools, such as Vim's YouCompleteMe plugin.
+# See http://clang.llvm.org/docs/JSONCompilationDatabase.html
+if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1")
+  set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
+endif()
+
+# Enable using a custom GCC toolchain to build Arrow
+if (NOT "$ENV{ARROW_GCC_ROOT}" STREQUAL "")
+  set(GCC_ROOT $ENV{ARROW_GCC_ROOT})
+  set(CMAKE_C_COMPILER ${GCC_ROOT}/bin/gcc)
+  set(CMAKE_CXX_COMPILER ${GCC_ROOT}/bin/g++)
+endif()
+
+# ----------------------------------------------------------------------
+# cmake options
+
+# Top level cmake dir
+if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
+  option(ARROW_WITH_PARQUET
+    "Build the Parquet adapter and link to libparquet"
+    OFF)
+
+  option(ARROW_BUILD_TESTS
+    "Build the Arrow googletest unit tests"
+    ON)
+endif()
+
+if(NOT ARROW_BUILD_TESTS)
+  set(NO_TESTS 1)
+endif()
+
+
+############################################################
+# Compiler flags
+############################################################
+
+# compiler flags that are common across debug/release builds
+#  - msse3: Enable SSE3 compiler intrinsics.
+#  - Wall: Enable all warnings.
+#  - Wno-deprecated: some of the gutil code includes old things like
+#    ext/hash_set, ignore that
+#  - pthread: enable POSIX threads support
+#  - -D__STDC_FORMAT_MACROS: for PRI* print format macros
+#  - -fno-strict-aliasing
+#    Assume programs do not follow strict aliasing rules.
+#    GCC cannot always verify whether strict aliasing rules are indeed followed
+#    due to fundamental limitations in escape analysis, which can result in
+#    subtle bad code generation. This has a small perf hit but is worth it to
+#    avoid hard-to-debug crashes.
+set(CXX_COMMON_FLAGS "-std=c++11 -fno-strict-aliasing -msse3 -Wall -Wno-deprecated -pthread -D__STDC_FORMAT_MACROS")
+
+# compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE=<type> .')
+# For all builds:
+# For CMAKE_BUILD_TYPE=Debug
+#   -ggdb: Enable gdb debugging
+# For CMAKE_BUILD_TYPE=FastDebug
+#   Same as DEBUG, except with some optimizations on.
+# For CMAKE_BUILD_TYPE=Release +# -O3: Enable all compiler optimizations +# -g: Enable symbols for profiler tools (TODO: remove for shipping) +set(CXX_FLAGS_DEBUG "-ggdb") +set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") +set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG") + +set(CXX_FLAGS_PROFILE_GEN "${CXX_FLAGS_RELEASE} -fprofile-generate") +set(CXX_FLAGS_PROFILE_BUILD "${CXX_FLAGS_RELEASE} -fprofile-use") + +# if no build build type is specified, default to debug builds +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Debug) +endif(NOT CMAKE_BUILD_TYPE) + +string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) + + +# Set compile flags based on the build type. +message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_DEBUG}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_FASTDEBUG}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_RELEASE}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_GEN") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_GEN}) +elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_BUILD") + set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_BUILD}) +else() + message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") +endif () + +# Add common flags +set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") + +# Required to avoid static linking errors with dependencies +add_definitions(-fPIC) + +# Determine compiler version +include(CompilerInfo) + +if ("${COMPILER_FAMILY}" STREQUAL "clang") + # Clang helpfully provides a few extensions from C++11 such as the 'override' + # keyword on methods. This doesn't change behavior, and we selectively enable + # it in src/gutil/port.h only on clang. So, we can safely use it, and don't want + # to trigger warnings when we do so. + # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-extensions") + + # Using Clang with ccache causes a bunch of spurious warnings that are + # purportedly fixed in the next version of ccache. See the following for details: + # + # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html + # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") + + # Only hardcode -fcolor-diagnostics if stderr is opened on a terminal. Otherwise + # the color codes show up as noisy artifacts. + # + # This test is imperfect because 'cmake' and 'make' can be run independently + # (with different terminal options), and we're testing during the former. + execute_process(COMMAND test -t 2 RESULT_VARIABLE ARROW_IS_TTY) + if ((${ARROW_IS_TTY} EQUAL 0) AND (NOT ("$ENV{TERM}" STREQUAL "dumb"))) + message("Running in a controlling terminal") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") + else() + message("Running without a controlling terminal or in a dumb terminal") + endif() + + # Use libstdc++ and not libc++. The latter lacks support for tr1 in OSX + # and since 10.9 is now the default. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") +endif() + +# Sanity check linking option. +if (NOT ARROW_LINK) + set(ARROW_LINK "d") +elseif(NOT ("auto" MATCHES "^${ARROW_LINK}" OR + "dynamic" MATCHES "^${ARROW_LINK}" OR + "static" MATCHES "^${ARROW_LINK}")) + message(FATAL_ERROR "Unknown value for ARROW_LINK, must be auto|dynamic|static") +else() + # Remove all but the first letter. 
+ string(SUBSTRING "${ARROW_LINK}" 0 1 ARROW_LINK) +endif() + +# ASAN / TSAN / UBSAN +include(san-config) + +# For any C code, use the same flags. +set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS}") + +# Code coverage +if ("${ARROW_GENERATE_COVERAGE}") + if("${CMAKE_CXX_COMPILER}" MATCHES ".*clang.*") + # There appears to be some bugs in clang 3.3 which cause code coverage + # to have link errors, not locating the llvm_gcda_* symbols. + # This should be fixed in llvm 3.4 with http://llvm.org/viewvc/llvm-project?view=revision&revision=184666 + message(SEND_ERROR "Cannot currently generate coverage with clang") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -DCOVERAGE_BUILD") + + # For coverage to work properly, we need to use static linkage. Otherwise, + # __gcov_flush() doesn't properly flush coverage from every module. + # See http://stackoverflow.com/questions/28164543/using-gcov-flush-within-a-library-doesnt-force-the-other-modules-to-yield-gc + if("${ARROW_LINK}" STREQUAL "a") + message("Using static linking for coverage build") + set(ARROW_LINK "s") + elseif("${ARROW_LINK}" STREQUAL "d") + message(SEND_ERROR "Cannot use coverage with dynamic linking") + endif() +endif() + +# If we still don't know what kind of linking to perform, choose based on +# build type (developers like fast builds). +if ("${ARROW_LINK}" STREQUAL "a") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG" OR + "${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") + message("Using dynamic linking for ${CMAKE_BUILD_TYPE} builds") + set(ARROW_LINK "d") + else() + message("Using static linking for ${CMAKE_BUILD_TYPE} builds") + set(ARROW_LINK "s") + endif() +endif() + +# Are we using the gold linker? It doesn't work with dynamic linking as +# weak symbols aren't properly overridden, causing tcmalloc to be omitted. +# Let's flag this as an error in RELEASE builds (we shouldn't release a +# product like this). +# +# See https://sourceware.org/bugzilla/show_bug.cgi?id=16979 for details. +# +# The gold linker is only for ELF binaries, which OSX doesn't use. We can +# just skip. +if (NOT APPLE) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -Wl,--version OUTPUT_VARIABLE LINKER_OUTPUT) +endif () +if (LINKER_OUTPUT MATCHES "gold") + if ("${ARROW_LINK}" STREQUAL "d" AND + "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + message(SEND_ERROR "Cannot use gold with dynamic linking in a RELEASE build " + "as it would cause tcmalloc symbols to get dropped") + else() + message("Using gold linker") + endif() + set(ARROW_USING_GOLD 1) +else() + message("Using ld linker") +endif() + +# Having set ARROW_LINK due to build type and/or sanitizer, it's now safe to +# act on its value. +if ("${ARROW_LINK}" STREQUAL "d") + set(BUILD_SHARED_LIBS ON) + + # Position independent code is only necessary when producing shared objects. + add_definitions(-fPIC) +endif() + +# set compile output directory +string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME) + +# If build in-source, create the latest symlink. If build out-of-source, which is +# preferred, simply output the binaries in the build folder +if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR}) + set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}/") + # Link build/latest to the current build directory, to avoid developers + # accidentally running the latest debug build when in fact they're building + # release builds. 
+ FILE(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY}) + if (NOT APPLE) + set(MORE_ARGS "-T") + endif() +EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY} + ${CMAKE_CURRENT_BINARY_DIR}/build/latest) +else() + set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/") +endif() + +# where to put generated archives (.a files) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") +set(ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") + +# where to put generated libraries (.so files) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") +set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") + +# where to put generated binaries +set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") +include_directories(src) + +############################################################ +# Visibility +############################################################ +# For generate_export_header() and add_compiler_export_flags(). +include(GenerateExportHeader) + +############################################################ +# Testing +############################################################ + +# Add a new test case, with or without an executable that should be built. +# +# REL_TEST_NAME is the name of the test. It may be a single component +# (e.g. monotime-test) or contain additional components (e.g. +# net/net_util-test). Either way, the last component must be a globally +# unique name. +# +# Arguments after the test name will be passed to set_tests_properties(). +function(ADD_ARROW_TEST REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc) + # This test has a corresponding .cc file, set it up as an executable. + set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") + add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") + target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) + else() + # No executable, just invoke the test (probably a script) directly. + set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) + endif() + + add_test(${TEST_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH}) + if(ARGN) + set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with NO_TESTS. 
+function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + add_dependencies(${TEST_NAME} ${ARGN}) +endfunction() + +enable_testing() + +############################################################ +# Dependencies +############################################################ +function(ADD_THIRDPARTY_LIB LIB_NAME) + set(options) + set(one_value_args SHARED_LIB STATIC_LIB) + set(multi_value_args DEPS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(("${ARROW_LINK}" STREQUAL "s" AND ARG_STATIC_LIB) OR (NOT ARG_SHARED_LIB)) + if(NOT ARG_STATIC_LIB) + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") + endif() + add_library(${LIB_NAME} STATIC IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + else() + add_library(${LIB_NAME} SHARED IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + endif() + + if(ARG_DEPS) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() +endfunction() + +## GTest +if ("$ENV{GTEST_HOME}" STREQUAL "") + set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) +endif() +find_package(GTest REQUIRED) +include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) + +## Google PerfTools +## +## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment +## near definition of ARROW_USING_GOLD). 
+# find_package(GPerf REQUIRED) +# if (NOT "${ARROW_USE_ASAN}" AND +# NOT "${ARROW_USE_TSAN}" AND +# NOT ("${ARROW_USING_GOLD}" AND "${ARROW_LINK}" STREQUAL "d")) +# ADD_THIRDPARTY_LIB(tcmalloc +# STATIC_LIB "${TCMALLOC_STATIC_LIB}" +# SHARED_LIB "${TCMALLOC_SHARED_LIB}") +# ADD_THIRDPARTY_LIB(profiler +# STATIC_LIB "${PROFILER_STATIC_LIB}" +# SHARED_LIB "${PROFILER_SHARED_LIB}") +# list(APPEND ARROW_BASE_LIBS tcmalloc profiler) +# add_definitions("-DTCMALLOC_ENABLED") +# set(ARROW_TCMALLOC_AVAILABLE 1) +# endif() + +############################################################ +# Linker setup +############################################################ +set(ARROW_MIN_TEST_LIBS arrow arrow_test_main arrow_test_util ${ARROW_BASE_LIBS}) +set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) + +############################################################ +# "make ctags" target +############################################################ +if (UNIX) + add_custom_target(ctags ctags -R --languages=c++,c) +endif (UNIX) + +############################################################ +# "make etags" target +############################################################ +if (UNIX) + add_custom_target(tags etags --members --declarations + `find ${CMAKE_CURRENT_SOURCE_DIR}/src + -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or -name \\*.h -or -name \\*.c -or + -name \\*.f`) + add_custom_target(etags DEPENDS tags) +endif (UNIX) + +############################################################ +# "make cscope" target +############################################################ +if (UNIX) + add_custom_target(cscope find ${CMAKE_CURRENT_SOURCE_DIR} + ( -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or + -name \\*.h -or -name \\*.c -or -name \\*.f ) + -exec echo \"{}\" \; > cscope.files && cscope -q -b VERBATIM) +endif (UNIX) + +############################################################ +# "make lint" target +############################################################ +if (UNIX) + # Full lint + add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py + --verbose=2 + --linelength=90 + --filter=-whitespace/comments,-readability/todo,-build/header_guard + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h`) +endif (UNIX) + +#---------------------------------------------------------------------- +# Parquet adapter + +if(ARROW_WITH_PARQUET) + find_package(Parquet REQUIRED) + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(parquet + STATIC_LIB ${PARQUET_STATIC_LIB} + SHARED_LIB ${PARQUET_SHARED_LIB}) + + add_subdirectory(src/arrow/parquet) + list(APPEND LINK_LIBS arrow_parquet parquet) +endif() + +############################################################ +# Subdirectories +############################################################ + +add_subdirectory(src/arrow) +add_subdirectory(src/arrow/util) +add_subdirectory(src/arrow/types) + +set(LINK_LIBS + arrow_util + arrow_types) + +set(ARROW_SRCS + src/arrow/array.cc + src/arrow/builder.cc + src/arrow/type.cc +) + +add_library(arrow SHARED + ${ARROW_SRCS} +) +target_link_libraries(arrow ${LINK_LIBS}) +set_target_properties(arrow PROPERTIES LINKER_LANGUAGE CXX) + +install(TARGETS arrow + LIBRARY DESTINATION lib) diff --git a/cpp/LICENSE.txt b/cpp/LICENSE.txt new file mode 100644 index 00000000000..d6456956733 --- /dev/null +++ b/cpp/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
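An aside on the two CMake helpers defined in `cpp/CMakeLists.txt` above: `ADD_ARROW_TEST` registers a googletest executable and runs it through `build-support/run-test.sh`, forwarding any trailing arguments to `set_tests_properties()`, while `ADD_THIRDPARTY_LIB` declares an imported library whose static or shared artifact is selected by `ARROW_LINK`. The sketch below shows typical call sites from a per-directory CMake file; the test name, library name, and paths (`my-feature-test`, `mylib`) are hypothetical, not from this patch.

```
# Sketch of a per-directory CMakeLists.txt using the helpers defined above.
# Builds my-feature-test.cc into a gtest binary wired through run-test.sh,
# with a 30-second timeout (trailing args become test properties).
ADD_ARROW_TEST(my-feature-test TIMEOUT 30)

# Declares an imported third-party library; ARROW_LINK ("s" or "d") decides
# whether the static or the shared artifact gets linked.
ADD_THIRDPARTY_LIB(mylib
  STATIC_LIB "${THIRDPARTY_DIR}/mylib/libmylib.a"
  SHARED_LIB "${THIRDPARTY_DIR}/mylib/libmylib.so")
```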
diff --git a/cpp/README.md b/cpp/README.md
new file mode 100644
index 00000000000..378dc4e28de
--- /dev/null
+++ b/cpp/README.md
@@ -0,0 +1,48 @@
+# Arrow C++
+
+## Setup Build Environment
+
+Arrow uses CMake as a build configuration system. Currently, it supports in-source and
+out-of-source builds, with the latter being preferred.
+
+Arrow requires a C++11-enabled compiler. On Linux, gcc 4.8 and higher should be
+sufficient.
+
+To build the third-party dependencies, run:
+
+```
+./thirdparty/download_thirdparty.sh
+./thirdparty/build_thirdparty.sh
+```
+
+You can also run the following from the root of the C++ tree:
+
+```
+source setup_build_env.sh
+```
+
+Arrow is configured to use the `thirdparty` directory by default for its build
+dependencies. To set up a custom toolchain, see below.
+
+Simple debug build:
+
+    mkdir debug
+    cd debug
+    cmake ..
+    make
+    ctest
+
+Simple release build:
+
+    mkdir release
+    cd release
+    cmake .. -DCMAKE_BUILD_TYPE=Release
+    make
+    ctest
+
+### Third-party environment variables
+
+To set up your own specific build toolchain, here are the relevant environment
+variables:
+
+* Googletest: `GTEST_HOME` (only required to build the unit tests)
diff --git a/cpp/build-support/asan_symbolize.py b/cpp/build-support/asan_symbolize.py
new file mode 100755
index 00000000000..839a1984bd3
--- /dev/null
+++ b/cpp/build-support/asan_symbolize.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python
+#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+import bisect
+import os
+import re
+import subprocess
+import sys
+
+llvm_symbolizer = None
+symbolizers = {}
+filetypes = {}
+vmaddrs = {}
+DEBUG = False
+
+
+# FIXME: merge the code that calls fix_filename().
+def fix_filename(file_name):
+  for path_to_cut in sys.argv[1:]:
+    file_name = re.sub('.*' + path_to_cut, '', file_name)
+  file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
+  file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
+  return file_name
+
+
+class Symbolizer(object):
+  def __init__(self):
+    pass
+
+  def symbolize(self, addr, binary, offset):
+    """Symbolize the given address (pair of binary and offset).
+
+    Overridden in subclasses.
+    Args:
+      addr: virtual address of an instruction.
+      binary: path to executable/shared object containing this instruction.
+      offset: instruction offset in the @binary.
+    Returns:
+      list of strings (one string for each inlined frame) describing
+      the code locations for this instruction (that is, function name, file
+      name, line and column numbers).
+ """ + return None + + +class LLVMSymbolizer(Symbolizer): + def __init__(self, symbolizer_path): + super(LLVMSymbolizer, self).__init__() + self.symbolizer_path = symbolizer_path + self.pipe = self.open_llvm_symbolizer() + + def open_llvm_symbolizer(self): + if not os.path.exists(self.symbolizer_path): + return None + cmd = [self.symbolizer_path, + '--use-symbol-table=true', + '--demangle=false', + '--functions=true', + '--inlining=true'] + if DEBUG: + print ' '.join(cmd) + return subprocess.Popen(cmd, stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + + def symbolize(self, addr, binary, offset): + """Overrides Symbolizer.symbolize.""" + if not self.pipe: + return None + result = [] + try: + symbolizer_input = '%s %s' % (binary, offset) + if DEBUG: + print symbolizer_input + print >> self.pipe.stdin, symbolizer_input + while True: + function_name = self.pipe.stdout.readline().rstrip() + if not function_name: + break + file_name = self.pipe.stdout.readline().rstrip() + file_name = fix_filename(file_name) + if (not function_name.startswith('??') and + not file_name.startswith('??')): + # Append only valid frames. + result.append('%s in %s %s' % (addr, function_name, + file_name)) + except Exception: + result = [] + if not result: + result = None + return result + + +def LLVMSymbolizerFactory(system): + symbolizer_path = os.getenv('LLVM_SYMBOLIZER_PATH') + if not symbolizer_path: + # Assume llvm-symbolizer is in PATH. + symbolizer_path = 'llvm-symbolizer' + return LLVMSymbolizer(symbolizer_path) + + +class Addr2LineSymbolizer(Symbolizer): + def __init__(self, binary): + super(Addr2LineSymbolizer, self).__init__() + self.binary = binary + self.pipe = self.open_addr2line() + + def open_addr2line(self): + cmd = ['addr2line', '-f', '-e', self.binary] + if DEBUG: + print ' '.join(cmd) + return subprocess.Popen(cmd, + stdin=subprocess.PIPE, stdout=subprocess.PIPE) + + def symbolize(self, addr, binary, offset): + """Overrides Symbolizer.symbolize.""" + if self.binary != binary: + return None + try: + print >> self.pipe.stdin, offset + function_name = self.pipe.stdout.readline().rstrip() + file_name = self.pipe.stdout.readline().rstrip() + except Exception: + function_name = '' + file_name = '' + file_name = fix_filename(file_name) + return ['%s in %s %s' % (addr, function_name, file_name)] + + +class DarwinSymbolizer(Symbolizer): + def __init__(self, addr, binary): + super(DarwinSymbolizer, self).__init__() + self.binary = binary + # Guess which arch we're running. 10 = len('0x') + 8 hex digits. 
+ if len(addr) > 10: + self.arch = 'x86_64' + else: + self.arch = 'i386' + self.vmaddr = None + self.pipe = None + + def write_addr_to_pipe(self, offset): + print >> self.pipe.stdin, '0x%x' % int(offset, 16) + + def open_atos(self): + if DEBUG: + print 'atos -o %s -arch %s' % (self.binary, self.arch) + cmdline = ['atos', '-o', self.binary, '-arch', self.arch] + self.pipe = subprocess.Popen(cmdline, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + + def symbolize(self, addr, binary, offset): + """Overrides Symbolizer.symbolize.""" + if self.binary != binary: + return None + self.open_atos() + self.write_addr_to_pipe(offset) + self.pipe.stdin.close() + atos_line = self.pipe.stdout.readline().rstrip() + # A well-formed atos response looks like this: + # foo(type1, type2) (in object.name) (filename.cc:80) + match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line) + if DEBUG: + print 'atos_line: ', atos_line + if match: + function_name = match.group(1) + function_name = re.sub('\(.*?\)', '', function_name) + file_name = fix_filename(match.group(3)) + return ['%s in %s %s' % (addr, function_name, file_name)] + else: + return ['%s in %s' % (addr, atos_line)] + + +# Chain several symbolizers so that if one symbolizer fails, we fall back +# to the next symbolizer in chain. +class ChainSymbolizer(Symbolizer): + def __init__(self, symbolizer_list): + super(ChainSymbolizer, self).__init__() + self.symbolizer_list = symbolizer_list + + def symbolize(self, addr, binary, offset): + """Overrides Symbolizer.symbolize.""" + for symbolizer in self.symbolizer_list: + if symbolizer: + result = symbolizer.symbolize(addr, binary, offset) + if result: + return result + return None + + def append_symbolizer(self, symbolizer): + self.symbolizer_list.append(symbolizer) + + +def BreakpadSymbolizerFactory(binary): + suffix = os.getenv('BREAKPAD_SUFFIX') + if suffix: + filename = binary + suffix + if os.access(filename, os.F_OK): + return BreakpadSymbolizer(filename) + return None + + +def SystemSymbolizerFactory(system, addr, binary): + if system == 'Darwin': + return DarwinSymbolizer(addr, binary) + elif system == 'Linux': + return Addr2LineSymbolizer(binary) + + +class BreakpadSymbolizer(Symbolizer): + def __init__(self, filename): + super(BreakpadSymbolizer, self).__init__() + self.filename = filename + lines = file(filename).readlines() + self.files = [] + self.symbols = {} + self.address_list = [] + self.addresses = {} + # MODULE mac x86_64 A7001116478B33F18FF9BEDE9F615F190 t + fragments = lines[0].rstrip().split() + self.arch = fragments[2] + self.debug_id = fragments[3] + self.binary = ' '.join(fragments[4:]) + self.parse_lines(lines[1:]) + + def parse_lines(self, lines): + cur_function_addr = '' + for line in lines: + fragments = line.split() + if fragments[0] == 'FILE': + assert int(fragments[1]) == len(self.files) + self.files.append(' '.join(fragments[2:])) + elif fragments[0] == 'PUBLIC': + self.symbols[int(fragments[1], 16)] = ' '.join(fragments[3:]) + elif fragments[0] in ['CFI', 'STACK']: + pass + elif fragments[0] == 'FUNC': + cur_function_addr = int(fragments[1], 16) + if not cur_function_addr in self.symbols.keys(): + self.symbols[cur_function_addr] = ' '.join(fragments[4:]) + else: + # Line starting with an address. + addr = int(fragments[0], 16) + self.address_list.append(addr) + # Tuple of symbol address, size, line, file number. 
+ self.addresses[addr] = (cur_function_addr, + int(fragments[1], 16), + int(fragments[2]), + int(fragments[3])) + self.address_list.sort() + + def get_sym_file_line(self, addr): + key = None + if addr in self.addresses.keys(): + key = addr + else: + index = bisect.bisect_left(self.address_list, addr) + if index == 0: + return None + else: + key = self.address_list[index - 1] + sym_id, size, line_no, file_no = self.addresses[key] + symbol = self.symbols[sym_id] + filename = self.files[file_no] + if addr < key + size: + return symbol, filename, line_no + else: + return None + + def symbolize(self, addr, binary, offset): + if self.binary != binary: + return None + res = self.get_sym_file_line(int(offset, 16)) + if res: + function_name, file_name, line_no = res + result = ['%s in %s %s:%d' % ( + addr, function_name, file_name, line_no)] + print result + return result + else: + return None + + +class SymbolizationLoop(object): + def __init__(self, binary_name_filter=None): + # Used by clients who may want to supply a different binary name. + # E.g. in Chrome several binaries may share a single .dSYM. + self.binary_name_filter = binary_name_filter + self.system = os.uname()[0] + if self.system in ['Linux', 'Darwin']: + self.llvm_symbolizer = LLVMSymbolizerFactory(self.system) + else: + raise Exception('Unknown system') + + def symbolize_address(self, addr, binary, offset): + # Use the chain of symbolizers: + # Breakpad symbolizer -> LLVM symbolizer -> addr2line/atos + # (fall back to next symbolizer if the previous one fails). + if not binary in symbolizers: + symbolizers[binary] = ChainSymbolizer( + [BreakpadSymbolizerFactory(binary), self.llvm_symbolizer]) + result = symbolizers[binary].symbolize(addr, binary, offset) + if result is None: + # Initialize system symbolizer only if other symbolizers failed. + symbolizers[binary].append_symbolizer( + SystemSymbolizerFactory(self.system, addr, binary)) + result = symbolizers[binary].symbolize(addr, binary, offset) + # The system symbolizer must produce some result. + assert result + return result + + def print_symbolized_lines(self, symbolized_lines): + if not symbolized_lines: + print self.current_line + else: + for symbolized_frame in symbolized_lines: + print ' #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip() + self.frame_no += 1 + + def process_stdin(self): + self.frame_no = 0 + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + + while True: + line = sys.stdin.readline() + if not line: break + self.current_line = line.rstrip() + #0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45) + stack_trace_line_format = ( + '^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)') + match = re.match(stack_trace_line_format, line) + if not match: + print self.current_line + continue + if DEBUG: + print line + _, frameno_str, addr, binary, offset = match.groups() + if frameno_str == '0': + # Assume that frame #0 is the first frame of new stack trace. 
+ self.frame_no = 0 + original_binary = binary + if self.binary_name_filter: + binary = self.binary_name_filter(binary) + symbolized_line = self.symbolize_address(addr, binary, offset) + if not symbolized_line: + if original_binary != binary: + symbolized_line = self.symbolize_address(addr, binary, offset) + self.print_symbolized_lines(symbolized_line) + + +if __name__ == '__main__': + loop = SymbolizationLoop() + loop.process_stdin() diff --git a/cpp/build-support/bootstrap_toolchain.py b/cpp/build-support/bootstrap_toolchain.py new file mode 100755 index 00000000000..128be78bbac --- /dev/null +++ b/cpp/build-support/bootstrap_toolchain.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python +# Copyright (c) 2015, Cloudera, inc. +# Confidential Cloudera Information: Covered by NDA. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Bootstrapping the native toolchain with prebuilt binaries +# +# The purpose of this script is to download prebuilt artifacts of the native toolchain to +# satisfy the third-party dependencies. The script checks for the presence of +# NATIVE_TOOLCHAIN. NATIVE_TOOLCHAIN indicates the location where the prebuilt artifacts +# should be extracted to. +# +# The script is called as follows without any additional parameters: +# +# python bootstrap_toolchain.py +import sh +import os +import sys +import re + +HOST = "https://native-toolchain.s3.amazonaws.com/build" + +OS_MAPPING = { + "centos6" : "ec2-package-centos-6", + "centos5" : "ec2-package-centos-5", + "centos7" : "ec2-package-centos-7", + "debian6" : "ec2-package-debian-6", + "debian7" : "ec2-package-debian-7", + "suselinux11": "ec2-package-sles-11", + "ubuntu12.04" : "ec2-package-ubuntu-12-04", + "ubuntu14.04" : "ec2-package-ubuntu-14-04" +} + +def get_release_label(): + """Gets the right package label from the OS version""" + release = "".join(map(lambda x: x.lower(), sh.lsb_release("-irs").split())) + for k, v in OS_MAPPING.iteritems(): + if re.search(k, release): + return v + + print("Pre-built toolchain archives not available for your platform.") + print("Clone and build native toolchain from source using this repository:") + print(" https://github.com/cloudera/native-toolchain") + raise Exception("Could not find package label for OS version: {0}.".format(release)) + +def download_package(destination, product, version, compiler): + label = get_release_label() + file_name = "{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) + url_path="/{0}/{1}-{2}/{0}-{1}-{2}-{3}.tar.gz".format(product, version, compiler, label) + download_path = HOST + url_path + + print "URL {0}".format(download_path) + print "Downloading {0} to {1}".format(file_name, destination) + # --no-clobber avoids downloading the file if a file with the name already exists + sh.wget(download_path, directory_prefix=destination, no_clobber=True) + print "Extracting {0}".format(file_name) + sh.tar(z=True, x=True, f=os.path.join(destination, file_name), directory=destination) + sh.rm(os.path.join(destination, file_name)) + + +def bootstrap(packages): + 
"""Validates the presence of $NATIVE_TOOLCHAIN in the environment. By checking + $NATIVE_TOOLCHAIN is present, we assume that {LIB}_VERSION will be present as well. Will + create the directory specified by $NATIVE_TOOLCHAIN if it does not yet exist. Each of + the packages specified in `packages` is downloaded and extracted into $NATIVE_TOOLCHAIN. + """ + # Create the destination directory if necessary + destination = os.getenv("NATIVE_TOOLCHAIN") + if not destination: + print("Build environment not set up correctly, make sure " + "$NATIVE_TOOLCHAIN is present.") + sys.exit(1) + + if not os.path.exists(destination): + os.makedirs(destination) + + # Detect the compiler + if "SYSTEM_GCC" in os.environ: + compiler = "gcc-system" + else: + compiler = "gcc-{0}".format(os.environ["GCC_VERSION"]) + + for p in packages: + pkg_name, pkg_version = unpack_name_and_version(p) + download_package(destination, pkg_name, pkg_version, compiler) + +def unpack_name_and_version(package): + """A package definition is either a string where the version is fetched from the + environment or a tuple where the package name and the package version are fully + specified. + """ + if isinstance(package, basestring): + env_var = "{0}_VERSION".format(package).replace("-", "_").upper() + try: + return package, os.environ[env_var] + except KeyError: + raise Exception("Could not find version for {0} in environment var {1}".format( + package, env_var)) + return package[0], package[1] + +if __name__ == "__main__": + packages = [("gcc","4.9.2"), ("gflags", "2.0"), ("glog", "0.3.3-p1"), + ("gperftools", "2.3"), ("libunwind", "1.1"), ("googletest", "20151222")] + bootstrap(packages) diff --git a/cpp/build-support/cpplint.py b/cpp/build-support/cpplint.py new file mode 100755 index 00000000000..ccc25d4c56b --- /dev/null +++ b/cpp/build-support/cpplint.py @@ -0,0 +1,6323 @@ +#!/usr/bin/env python +# +# Copyright (c) 2009 Google Inc. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following disclaimer +# in the documentation and/or other materials provided with the +# distribution. +# * Neither the name of Google Inc. nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +"""Does google-lint on c++ files. 
+ +The goal of this script is to identify places in the code that *may* +be in non-compliance with google style. It does not attempt to fix +up these problems -- the point is to educate. It does also not +attempt to find all problems, or to ensure that everything it does +find is legitimately a problem. + +In particular, we can get very confused by /* and // inside strings! +We do a small hack, which is to ignore //'s with "'s after them on the +same line, but it is far from perfect (in either direction). +""" + +import codecs +import copy +import getopt +import math # for log +import os +import re +import sre_compile +import string +import sys +import unicodedata + + +_USAGE = """ +Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...] + [--counting=total|toplevel|detailed] [--root=subdir] + [--linelength=digits] + [file] ... + + The style guidelines this tries to follow are those in + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml + + Every problem is given a confidence score from 1-5, with 5 meaning we are + certain of the problem, and 1 meaning it could be a legitimate construct. + This will miss some errors, and is not a substitute for a code review. + + To suppress false-positive errors of a certain category, add a + 'NOLINT(category)' comment to the line. NOLINT or NOLINT(*) + suppresses errors of all categories on that line. + + The files passed in will be linted; at least one file must be provided. + Default linted extensions are .cc, .cpp, .cu, .cuh and .h. Change the + extensions with the --extensions flag. + + Flags: + + output=vs7 + By default, the output is formatted to ease emacs parsing. Visual Studio + compatible output (vs7) may also be used. Other formats are unsupported. + + verbose=# + Specify a number 0-5 to restrict errors to certain verbosity levels. + + filter=-x,+y,... + Specify a comma-separated list of category-filters to apply: only + error messages whose category names pass the filters will be printed. + (Category names are printed with the message and look like + "[whitespace/indent]".) Filters are evaluated left to right. + "-FOO" and "FOO" means "do not print categories that start with FOO". + "+FOO" means "do print categories that start with FOO". + + Examples: --filter=-whitespace,+whitespace/braces + --filter=whitespace,runtime/printf,+runtime/printf_format + --filter=-,+build/include_what_you_use + + To see a list of all the categories used in cpplint, pass no arg: + --filter= + + counting=total|toplevel|detailed + The total number of errors found is always printed. If + 'toplevel' is provided, then the count of errors in each of + the top-level categories like 'build' and 'whitespace' will + also be printed. If 'detailed' is provided, then a count + is provided for each category like 'build/class'. + + root=subdir + The root directory used for deriving header guard CPP variable. + By default, the header guard CPP variable is calculated as the relative + path to the directory that contains .git, .hg, or .svn. When this flag + is specified, the relative path is calculated from the specified + directory. If the specified directory does not exist, this flag is + ignored. + + Examples: + Assuming that src/.git exists, the header guard CPP variables for + src/chrome/browser/ui/browser.h are: + + No flag => CHROME_BROWSER_UI_BROWSER_H_ + --root=chrome => BROWSER_UI_BROWSER_H_ + --root=chrome/browser => UI_BROWSER_H_ + + linelength=digits + This is the allowed line length for the project. The default value is + 80 characters. 
+ + Examples: + --linelength=120 + + extensions=extension,extension,... + The allowed file extensions that cpplint will check + + Examples: + --extensions=hpp,cpp + + cpplint.py supports per-directory configurations specified in CPPLINT.cfg + files. CPPLINT.cfg file can contain a number of key=value pairs. + Currently the following options are supported: + + set noparent + filter=+filter1,-filter2,... + exclude_files=regex + linelength=80 + + "set noparent" option prevents cpplint from traversing directory tree + upwards looking for more .cfg files in parent directories. This option + is usually placed in the top-level project directory. + + The "filter" option is similar in function to --filter flag. It specifies + message filters in addition to the |_DEFAULT_FILTERS| and those specified + through --filter command-line flag. + + "exclude_files" allows to specify a regular expression to be matched against + a file name. If the expression matches, the file is skipped and not run + through liner. + + "linelength" allows to specify the allowed line length for the project. + + CPPLINT.cfg has an effect on files in the same directory and all + sub-directories, unless overridden by a nested configuration file. + + Example file: + filter=-build/include_order,+build/include_alpha + exclude_files=.*\.cc + + The above example disables build/include_order warning and enables + build/include_alpha as well as excludes all .cc from being + processed by linter, in the current directory (where the .cfg + file is located) and all sub-directories. +""" + +# We categorize each error message we print. Here are the categories. +# We want an explicit list so we can list them all in cpplint --filter=. +# If you add a new error message with a new category, add it to the list +# here! cpplint_unittest.py should tell you if you forget to do this. 
+_ERROR_CATEGORIES = [ + 'build/class', + 'build/c++11', + 'build/deprecated', + 'build/endif_comment', + 'build/explicit_make_pair', + 'build/forward_decl', + 'build/header_guard', + 'build/include', + 'build/include_alpha', + 'build/include_order', + 'build/include_what_you_use', + 'build/namespaces', + 'build/printf_format', + 'build/storage_class', + 'legal/copyright', + 'readability/alt_tokens', + 'readability/braces', + 'readability/casting', + 'readability/check', + 'readability/constructors', + 'readability/fn_size', + 'readability/function', + 'readability/inheritance', + 'readability/multiline_comment', + 'readability/multiline_string', + 'readability/namespace', + 'readability/nolint', + 'readability/nul', + 'readability/strings', + 'readability/todo', + 'readability/utf8', + 'runtime/arrays', + 'runtime/casting', + 'runtime/explicit', + 'runtime/int', + 'runtime/init', + 'runtime/invalid_increment', + 'runtime/member_string_references', + 'runtime/memset', + 'runtime/indentation_namespace', + 'runtime/operator', + 'runtime/printf', + 'runtime/printf_format', + 'runtime/references', + 'runtime/string', + 'runtime/threadsafe_fn', + 'runtime/vlog', + 'whitespace/blank_line', + 'whitespace/braces', + 'whitespace/comma', + 'whitespace/comments', + 'whitespace/empty_conditional_body', + 'whitespace/empty_loop_body', + 'whitespace/end_of_line', + 'whitespace/ending_newline', + 'whitespace/forcolon', + 'whitespace/indent', + 'whitespace/line_length', + 'whitespace/newline', + 'whitespace/operators', + 'whitespace/parens', + 'whitespace/semicolon', + 'whitespace/tab', + 'whitespace/todo', + ] + +# These error categories are no longer enforced by cpplint, but for backwards- +# compatibility they may still appear in NOLINT comments. +_LEGACY_ERROR_CATEGORIES = [ + 'readability/streams', + ] + +# The default state of the category filter. This is overridden by the --filter= +# flag. By default all errors are on, so only add here categories that should be +# off by default (i.e., categories that must be enabled by the --filter= flags). +# All entries here should start with a '-' or '+', as in the --filter= flag. +_DEFAULT_FILTERS = ['-build/include_alpha'] + +# We used to check for high-bit characters, but after much discussion we +# decided those were OK, as long as they were in UTF-8 and didn't represent +# hard-coded international strings, which belong in a separate i18n file. 
+ +# C++ headers +_CPP_HEADERS = frozenset([ + # Legacy + 'algobase.h', + 'algo.h', + 'alloc.h', + 'builtinbuf.h', + 'bvector.h', + 'complex.h', + 'defalloc.h', + 'deque.h', + 'editbuf.h', + 'fstream.h', + 'function.h', + 'hash_map', + 'hash_map.h', + 'hash_set', + 'hash_set.h', + 'hashtable.h', + 'heap.h', + 'indstream.h', + 'iomanip.h', + 'iostream.h', + 'istream.h', + 'iterator.h', + 'list.h', + 'map.h', + 'multimap.h', + 'multiset.h', + 'ostream.h', + 'pair.h', + 'parsestream.h', + 'pfstream.h', + 'procbuf.h', + 'pthread_alloc', + 'pthread_alloc.h', + 'rope', + 'rope.h', + 'ropeimpl.h', + 'set.h', + 'slist', + 'slist.h', + 'stack.h', + 'stdiostream.h', + 'stl_alloc.h', + 'stl_relops.h', + 'streambuf.h', + 'stream.h', + 'strfile.h', + 'strstream.h', + 'tempbuf.h', + 'tree.h', + 'type_traits.h', + 'vector.h', + # 17.6.1.2 C++ library headers + 'algorithm', + 'array', + 'atomic', + 'bitset', + 'chrono', + 'codecvt', + 'complex', + 'condition_variable', + 'deque', + 'exception', + 'forward_list', + 'fstream', + 'functional', + 'future', + 'initializer_list', + 'iomanip', + 'ios', + 'iosfwd', + 'iostream', + 'istream', + 'iterator', + 'limits', + 'list', + 'locale', + 'map', + 'memory', + 'mutex', + 'new', + 'numeric', + 'ostream', + 'queue', + 'random', + 'ratio', + 'regex', + 'set', + 'sstream', + 'stack', + 'stdexcept', + 'streambuf', + 'string', + 'strstream', + 'system_error', + 'thread', + 'tuple', + 'typeindex', + 'typeinfo', + 'type_traits', + 'unordered_map', + 'unordered_set', + 'utility', + 'valarray', + 'vector', + # 17.6.1.2 C++ headers for C library facilities + 'cassert', + 'ccomplex', + 'cctype', + 'cerrno', + 'cfenv', + 'cfloat', + 'cinttypes', + 'ciso646', + 'climits', + 'clocale', + 'cmath', + 'csetjmp', + 'csignal', + 'cstdalign', + 'cstdarg', + 'cstdbool', + 'cstddef', + 'cstdint', + 'cstdio', + 'cstdlib', + 'cstring', + 'ctgmath', + 'ctime', + 'cuchar', + 'cwchar', + 'cwctype', + ]) + + +# These headers are excluded from [build/include] and [build/include_order] +# checks: +# - Anything not following google file name conventions (containing an +# uppercase character, such as Python.h or nsStringAPI.h, for example). +# - Lua headers. +_THIRD_PARTY_HEADERS_PATTERN = re.compile( + r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$') + + +# Assertion macros. These are defined in base/logging.h and +# testing/base/gunit.h. Note that the _M versions need to come first +# for substring matching to work. 
+_CHECK_MACROS = [ + 'DCHECK', 'CHECK', + 'EXPECT_TRUE_M', 'EXPECT_TRUE', + 'ASSERT_TRUE_M', 'ASSERT_TRUE', + 'EXPECT_FALSE_M', 'EXPECT_FALSE', + 'ASSERT_FALSE_M', 'ASSERT_FALSE', + ] + +# Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE +_CHECK_REPLACEMENT = dict([(m, {}) for m in _CHECK_MACROS]) + +for op, replacement in [('==', 'EQ'), ('!=', 'NE'), + ('>=', 'GE'), ('>', 'GT'), + ('<=', 'LE'), ('<', 'LT')]: + _CHECK_REPLACEMENT['DCHECK'][op] = 'DCHECK_%s' % replacement + _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement + _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement + _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement + +for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'), + ('>=', 'LT'), ('>', 'LE'), + ('<=', 'GT'), ('<', 'GE')]: + _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement + _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement + _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement + +# Alternative tokens and their replacements. For full list, see section 2.5 +# Alternative tokens [lex.digraph] in the C++ standard. +# +# Digraphs (such as '%:') are not included here since it's a mess to +# match those on a word boundary. +_ALT_TOKEN_REPLACEMENT = { + 'and': '&&', + 'bitor': '|', + 'or': '||', + 'xor': '^', + 'compl': '~', + 'bitand': '&', + 'and_eq': '&=', + 'or_eq': '|=', + 'xor_eq': '^=', + 'not': '!', + 'not_eq': '!=' + } + +# Compile regular expression that matches all the above keywords. The "[ =()]" +# bit is meant to avoid matching these keywords outside of boolean expressions. +# +# False positives include C-style multi-line comments and multi-line strings +# but those have always been troublesome for cpplint. +_ALT_TOKEN_REPLACEMENT_PATTERN = re.compile( + r'[ =()](' + ('|'.join(_ALT_TOKEN_REPLACEMENT.keys())) + r')(?=[ (]|$)') + + +# These constants define types of headers for use with +# _IncludeState.CheckNextIncludeOrder(). +_C_SYS_HEADER = 1 +_CPP_SYS_HEADER = 2 +_LIKELY_MY_HEADER = 3 +_POSSIBLE_MY_HEADER = 4 +_OTHER_HEADER = 5 + +# These constants define the current inline assembly state +_NO_ASM = 0 # Outside of inline assembly block +_INSIDE_ASM = 1 # Inside inline assembly block +_END_ASM = 2 # Last line of inline assembly block +_BLOCK_ASM = 3 # The whole block is an inline assembly block + +# Match start of assembly blocks +_MATCH_ASM = re.compile(r'^\s*(?:asm|_asm|__asm|__asm__)' + r'(?:\s+(volatile|__volatile__))?' + r'\s*[{(]') + + +_regexp_compile_cache = {} + +# {str, set(int)}: a map from error categories to sets of linenumbers +# on which those errors are expected and should be suppressed. +_error_suppressions = {} + +# The root directory used for deriving header guard CPP variable. +# This is set by --root flag. +_root = None + +# The allowed line length of files. +# This is set by --linelength flag. +_line_length = 80 + +# The allowed extensions for file names +# This is set by --extensions flag. +_valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh']) + +def ParseNolintSuppressions(filename, raw_line, linenum, error): + """Updates the global list of error-suppressions. + + Parses any NOLINT comments on the current line, updating the global + error_suppressions store. 
Reports an error if the NOLINT comment
+    was malformed.
+
+  Args:
+    filename: str, the name of the input file.
+    raw_line: str, the line of input text, with comments.
+    linenum: int, the number of the current line.
+    error: function, an error handler.
+  """
+  matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
+  if matched:
+    if matched.group(1):
+      suppressed_line = linenum + 1
+    else:
+      suppressed_line = linenum
+    category = matched.group(2)
+    if category in (None, '(*)'):  # => "suppress all"
+      _error_suppressions.setdefault(None, set()).add(suppressed_line)
+    else:
+      if category.startswith('(') and category.endswith(')'):
+        category = category[1:-1]
+        if category in _ERROR_CATEGORIES:
+          _error_suppressions.setdefault(category, set()).add(suppressed_line)
+        elif category not in _LEGACY_ERROR_CATEGORIES:
+          error(filename, linenum, 'readability/nolint', 5,
+                'Unknown NOLINT error category: %s' % category)
+
+
+def ResetNolintSuppressions():
+  """Resets the set of NOLINT suppressions to empty."""
+  _error_suppressions.clear()
+
+
+def IsErrorSuppressedByNolint(category, linenum):
+  """Returns true if the specified error category is suppressed on this line.
+
+  Consults the global error_suppressions map populated by
+  ParseNolintSuppressions/ResetNolintSuppressions.
+
+  Args:
+    category: str, the category of the error.
+    linenum: int, the current line number.
+  Returns:
+    bool, True iff the error should be suppressed due to a NOLINT comment.
+  """
+  return (linenum in _error_suppressions.get(category, set()) or
+          linenum in _error_suppressions.get(None, set()))
+
+
+def Match(pattern, s):
+  """Matches the string with the pattern, caching the compiled regexp."""
+  # The regexp compilation caching is inlined in both Match and Search for
+  # performance reasons; factoring it out into a separate function turns out
+  # to be noticeably expensive.
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].match(s)
+
+
+def ReplaceAll(pattern, rep, s):
+  """Replaces instances of pattern in a string with a replacement.
+
+  The compiled regex is kept in a cache shared by Match and Search.
+
+  Args:
+    pattern: regex pattern
+    rep: replacement text
+    s: search string
+
+  Returns:
+    string with replacements made (or original string if no replacements)
+  """
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].sub(rep, s)
+
+
+def Search(pattern, s):
+  """Searches the string for the pattern, caching the compiled regexp."""
+  if pattern not in _regexp_compile_cache:
+    _regexp_compile_cache[pattern] = sre_compile.compile(pattern)
+  return _regexp_compile_cache[pattern].search(s)
+
+
+class _IncludeState(object):
+  """Tracks line numbers for includes, and the order in which includes appear.
+
+  include_list contains a list of lists of (header, line number) pairs.
+  It's a list of lists rather than just one flat list to make it
+  easier to update across preprocessor boundaries.
+
+  Call CheckNextIncludeOrder() once for each header in the file, passing
+  in the type constants defined above. Calls in an illegal order will
+  raise an _IncludeError with an appropriate error message.
+
+  """
+  # self._section will move monotonically through this set. If it ever
+  # needs to move backwards, CheckNextIncludeOrder will raise an error.
+ _INITIAL_SECTION = 0 + _MY_H_SECTION = 1 + _C_SECTION = 2 + _CPP_SECTION = 3 + _OTHER_H_SECTION = 4 + + _TYPE_NAMES = { + _C_SYS_HEADER: 'C system header', + _CPP_SYS_HEADER: 'C++ system header', + _LIKELY_MY_HEADER: 'header this file implements', + _POSSIBLE_MY_HEADER: 'header this file may implement', + _OTHER_HEADER: 'other header', + } + _SECTION_NAMES = { + _INITIAL_SECTION: "... nothing. (This can't be an error.)", + _MY_H_SECTION: 'a header this file implements', + _C_SECTION: 'C system header', + _CPP_SECTION: 'C++ system header', + _OTHER_H_SECTION: 'other header', + } + + def __init__(self): + self.include_list = [[]] + self.ResetSection('') + + def FindHeader(self, header): + """Check if a header has already been included. + + Args: + header: header to check. + Returns: + Line number of previous occurrence, or -1 if the header has not + been seen before. + """ + for section_list in self.include_list: + for f in section_list: + if f[0] == header: + return f[1] + return -1 + + def ResetSection(self, directive): + """Reset section checking for preprocessor directive. + + Args: + directive: preprocessor directive (e.g. "if", "else"). + """ + # The name of the current section. + self._section = self._INITIAL_SECTION + # The path of last found header. + self._last_header = '' + + # Update list of includes. Note that we never pop from the + # include list. + if directive in ('if', 'ifdef', 'ifndef'): + self.include_list.append([]) + elif directive in ('else', 'elif'): + self.include_list[-1] = [] + + def SetLastHeader(self, header_path): + self._last_header = header_path + + def CanonicalizeAlphabeticalOrder(self, header_path): + """Returns a path canonicalized for alphabetical comparison. + + - replaces "-" with "_" so they both cmp the same. + - removes '-inl' since we don't require them to be after the main header. + - lowercase everything, just in case. + + Args: + header_path: Path to be canonicalized. + + Returns: + Canonicalized path. + """ + return header_path.replace('-inl.h', '.h').replace('-', '_').lower() + + def IsInAlphabeticalOrder(self, clean_lines, linenum, header_path): + """Check if a header is in alphabetical order with the previous header. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + header_path: Canonicalized header to be checked. + + Returns: + Returns true if the header is in alphabetical order. + """ + # If previous section is different from current section, _last_header will + # be reset to empty string, so it's always less than current header. + # + # If previous line was a blank line, assume that the headers are + # intentionally sorted the way they are. + if (self._last_header > header_path and + Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])): + return False + return True + + def CheckNextIncludeOrder(self, header_type): + """Returns a non-empty error message if the next header is out of order. + + This function also updates the internal state to be ready to check + the next include. + + Args: + header_type: One of the _XXX_HEADER constants defined above. + + Returns: + The empty string if the header is in the right order, or an + error message describing what's wrong. 
+ + """ + error_message = ('Found %s after %s' % + (self._TYPE_NAMES[header_type], + self._SECTION_NAMES[self._section])) + + last_section = self._section + + if header_type == _C_SYS_HEADER: + if self._section <= self._C_SECTION: + self._section = self._C_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _CPP_SYS_HEADER: + if self._section <= self._CPP_SECTION: + self._section = self._CPP_SECTION + else: + self._last_header = '' + return error_message + elif header_type == _LIKELY_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + self._section = self._OTHER_H_SECTION + elif header_type == _POSSIBLE_MY_HEADER: + if self._section <= self._MY_H_SECTION: + self._section = self._MY_H_SECTION + else: + # This will always be the fallback because we're not sure + # enough that the header is associated with this file. + self._section = self._OTHER_H_SECTION + else: + assert header_type == _OTHER_HEADER + self._section = self._OTHER_H_SECTION + + if last_section != self._section: + self._last_header = '' + + return '' + + +class _CppLintState(object): + """Maintains module-wide state..""" + + def __init__(self): + self.verbose_level = 1 # global setting. + self.error_count = 0 # global count of reported errors + # filters to apply when emitting error messages + self.filters = _DEFAULT_FILTERS[:] + # backup of filter list. Used to restore the state after each file. + self._filters_backup = self.filters[:] + self.counting = 'total' # In what way are we counting errors? + self.errors_by_category = {} # string to int dict storing error counts + + # output format: + # "emacs" - format that emacs can parse (default) + # "vs7" - format that Microsoft Visual Studio 7 can parse + self.output_format = 'emacs' + + def SetOutputFormat(self, output_format): + """Sets the output format for errors.""" + self.output_format = output_format + + def SetVerboseLevel(self, level): + """Sets the module's verbosity, and returns the previous setting.""" + last_verbose_level = self.verbose_level + self.verbose_level = level + return last_verbose_level + + def SetCountingStyle(self, counting_style): + """Sets the module's counting options.""" + self.counting = counting_style + + def SetFilters(self, filters): + """Sets the error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "+whitespace/indent"). + Each filter should start with + or -; else we die. + + Raises: + ValueError: The comma-separated filters did not all start with '+' or '-'. + E.g. "-,+whitespace,-whitespace/indent,whitespace/badfilter" + """ + # Default filters always have less priority than the flag ones. + self.filters = _DEFAULT_FILTERS[:] + self.AddFilters(filters) + + def AddFilters(self, filters): + """ Adds more filters to the existing list of error-message filters. 
""" + for filt in filters.split(','): + clean_filt = filt.strip() + if clean_filt: + self.filters.append(clean_filt) + for filt in self.filters: + if not (filt.startswith('+') or filt.startswith('-')): + raise ValueError('Every filter in --filters must start with + or -' + ' (%s does not)' % filt) + + def BackupFilters(self): + """ Saves the current filter list to backup storage.""" + self._filters_backup = self.filters[:] + + def RestoreFilters(self): + """ Restores filters previously backed up.""" + self.filters = self._filters_backup[:] + + def ResetErrorCounts(self): + """Sets the module's error statistic back to zero.""" + self.error_count = 0 + self.errors_by_category = {} + + def IncrementErrorCount(self, category): + """Bumps the module's error statistic.""" + self.error_count += 1 + if self.counting in ('toplevel', 'detailed'): + if self.counting != 'detailed': + category = category.split('/')[0] + if category not in self.errors_by_category: + self.errors_by_category[category] = 0 + self.errors_by_category[category] += 1 + + def PrintErrorCounts(self): + """Print a summary of errors by category, and the total.""" + for category, count in self.errors_by_category.iteritems(): + sys.stderr.write('Category \'%s\' errors found: %d\n' % + (category, count)) + sys.stderr.write('Total errors found: %d\n' % self.error_count) + +_cpplint_state = _CppLintState() + + +def _OutputFormat(): + """Gets the module's output format.""" + return _cpplint_state.output_format + + +def _SetOutputFormat(output_format): + """Sets the module's output format.""" + _cpplint_state.SetOutputFormat(output_format) + + +def _VerboseLevel(): + """Returns the module's verbosity setting.""" + return _cpplint_state.verbose_level + + +def _SetVerboseLevel(level): + """Sets the module's verbosity, and returns the previous setting.""" + return _cpplint_state.SetVerboseLevel(level) + + +def _SetCountingStyle(level): + """Sets the module's counting options.""" + _cpplint_state.SetCountingStyle(level) + + +def _Filters(): + """Returns the module's list of output filters, as a list.""" + return _cpplint_state.filters + + +def _SetFilters(filters): + """Sets the module's error-message filters. + + These filters are applied when deciding whether to emit a given + error message. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.SetFilters(filters) + +def _AddFilters(filters): + """Adds more filter overrides. + + Unlike _SetFilters, this function does not reset the current list of filters + available. + + Args: + filters: A string of comma-separated filters (eg "whitespace/indent"). + Each filter should start with + or -; else we die. + """ + _cpplint_state.AddFilters(filters) + +def _BackupFilters(): + """ Saves the current filter list to backup storage.""" + _cpplint_state.BackupFilters() + +def _RestoreFilters(): + """ Restores filters previously backed up.""" + _cpplint_state.RestoreFilters() + +class _FunctionState(object): + """Tracks current function name and the number of lines in its body.""" + + _NORMAL_TRIGGER = 250 # for --v=0, 500 for --v=1, etc. + _TEST_TRIGGER = 400 # about 50% more than _NORMAL_TRIGGER. + + def __init__(self): + self.in_a_function = False + self.lines_in_function = 0 + self.current_function = '' + + def Begin(self, function_name): + """Start analyzing function body. + + Args: + function_name: The name of the function being tracked. 
+ """ + self.in_a_function = True + self.lines_in_function = 0 + self.current_function = function_name + + def Count(self): + """Count line in current function body.""" + if self.in_a_function: + self.lines_in_function += 1 + + def Check(self, error, filename, linenum): + """Report if too many lines in function body. + + Args: + error: The function to call with any errors found. + filename: The name of the current file. + linenum: The number of the line to check. + """ + if Match(r'T(EST|est)', self.current_function): + base_trigger = self._TEST_TRIGGER + else: + base_trigger = self._NORMAL_TRIGGER + trigger = base_trigger * 2**_VerboseLevel() + + if self.lines_in_function > trigger: + error_level = int(math.log(self.lines_in_function / base_trigger, 2)) + # 50 => 0, 100 => 1, 200 => 2, 400 => 3, 800 => 4, 1600 => 5, ... + if error_level > 5: + error_level = 5 + error(filename, linenum, 'readability/fn_size', error_level, + 'Small and focused functions are preferred:' + ' %s has %d non-comment lines' + ' (error triggered by exceeding %d lines).' % ( + self.current_function, self.lines_in_function, trigger)) + + def End(self): + """Stop analyzing function body.""" + self.in_a_function = False + + +class _IncludeError(Exception): + """Indicates a problem with the include order in a file.""" + pass + + +class FileInfo(object): + """Provides utility functions for filenames. + + FileInfo provides easy access to the components of a file's path + relative to the project root. + """ + + def __init__(self, filename): + self._filename = filename + + def FullName(self): + """Make Windows paths like Unix.""" + return os.path.abspath(self._filename).replace('\\', '/') + + def RepositoryName(self): + """FullName after removing the local path to the repository. + + If we have a real absolute path name here we can try to do something smart: + detecting the root of the checkout and truncating /path/to/checkout from + the name so that we get header guards that don't include things like + "C:\Documents and Settings\..." or "/home/username/..." in them and thus + people on different computers who have checked the source out to different + locations won't see bogus errors. + """ + fullname = self.FullName() + + if os.path.exists(fullname): + project_dir = os.path.dirname(fullname) + + if os.path.exists(os.path.join(project_dir, ".svn")): + # If there's a .svn file in the current directory, we recursively look + # up the directory tree for the top of the SVN checkout + root_dir = project_dir + one_up_dir = os.path.dirname(root_dir) + while os.path.exists(os.path.join(one_up_dir, ".svn")): + root_dir = os.path.dirname(root_dir) + one_up_dir = os.path.dirname(one_up_dir) + + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by + # searching up from the current path. + root_dir = os.path.dirname(fullname) + while (root_dir != os.path.dirname(root_dir) and + not os.path.exists(os.path.join(root_dir, ".git")) and + not os.path.exists(os.path.join(root_dir, ".hg")) and + not os.path.exists(os.path.join(root_dir, ".svn"))): + root_dir = os.path.dirname(root_dir) + + if (os.path.exists(os.path.join(root_dir, ".git")) or + os.path.exists(os.path.join(root_dir, ".hg")) or + os.path.exists(os.path.join(root_dir, ".svn"))): + prefix = os.path.commonprefix([root_dir, project_dir]) + return fullname[len(prefix) + 1:] + + # Don't know what to do; header guard warnings may be wrong... 
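+    # Illustration (hypothetical paths, not from the original source): for
+    # a checkout with a .git directory at /home/user/proj, the file
+    # /home/user/proj/src/foo.h maps to "src/foo.h" above.  If no .git,
+    # .hg, or .svn marker is found anywhere above the file, we fall
+    # through to here and return the full absolute name.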
+ return fullname + + def Split(self): + """Splits the file into the directory, basename, and extension. + + For 'chrome/browser/browser.cc', Split() would + return ('chrome/browser', 'browser', '.cc') + + Returns: + A tuple of (directory, basename, extension). + """ + + googlename = self.RepositoryName() + project, rest = os.path.split(googlename) + return (project,) + os.path.splitext(rest) + + def BaseName(self): + """File base name - text after the final slash, before the final period.""" + return self.Split()[1] + + def Extension(self): + """File extension - text following the final period.""" + return self.Split()[2] + + def NoExtension(self): + """File has no source file extension.""" + return '/'.join(self.Split()[0:2]) + + def IsSource(self): + """File has a source file extension.""" + return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx') + + +def _ShouldPrintError(category, confidence, linenum): + """If confidence >= verbose, category passes filter and is not suppressed.""" + + # There are three ways we might decide not to print an error message: + # a "NOLINT(category)" comment appears in the source, + # the verbosity level isn't high enough, or the filters filter it out. + if IsErrorSuppressedByNolint(category, linenum): + return False + + if confidence < _cpplint_state.verbose_level: + return False + + is_filtered = False + for one_filter in _Filters(): + if one_filter.startswith('-'): + if category.startswith(one_filter[1:]): + is_filtered = True + elif one_filter.startswith('+'): + if category.startswith(one_filter[1:]): + is_filtered = False + else: + assert False # should have been checked for in SetFilter. + if is_filtered: + return False + + return True + + +def Error(filename, linenum, category, confidence, message): + """Logs the fact we've found a lint error. + + We log where the error was found, and also our confidence in the error, + that is, how certain we are this is a legitimate style regression, and + not a misidentification or a use that's sometimes justified. + + False positives can be suppressed by the use of + "cpplint(category)" comments on the offending line. These are + parsed into _error_suppressions. + + Args: + filename: The name of the file containing the error. + linenum: The number of the line containing the error. + category: A string used to describe the "category" this bug + falls under: "whitespace", say, or "runtime". Categories + may have a hierarchy separated by slashes: "whitespace/indent". + confidence: A number from 1-5 representing a confidence score for + the error, with 5 meaning that we are certain of the problem, + and 1 meaning that it could be a legitimate construct. + message: The error message. + """ + if _ShouldPrintError(category, confidence, linenum): + _cpplint_state.IncrementErrorCount(category) + if _cpplint_state.output_format == 'vs7': + sys.stderr.write('%s(%s): %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + elif _cpplint_state.output_format == 'eclipse': + sys.stderr.write('%s:%s: warning: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + else: + sys.stderr.write('%s:%s: %s [%s] [%d]\n' % ( + filename, linenum, message, category, confidence)) + + +# Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard. +_RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile( + r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)') +# Match a single C style comment on the same line. +_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/' +# Matches multi-line C style comments. 
+
+# This RE is a little bit more complicated than one might expect, because we
+# have to take care with how spaces around comments are removed, so we can
+# handle comments inside statements better.
+# The current rule is: We only clear spaces from both sides when we're at the
+# end of the line.  Otherwise, we try to remove spaces from the right side;
+# if this doesn't work we try the left side, but only if there's a non-word
+# character on the right.
+_RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
+    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+    _RE_PATTERN_C_COMMENTS + r'\s+|' +
+    r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+    _RE_PATTERN_C_COMMENTS + r')')
+
+
+def IsCppString(line):
+  """Checks whether the line ends inside a C++ string constant.
+
+  This function does not consider single-line nor multi-line comments.
+
+  Args:
+    line: a partial line of code, from position 0 up to some position n.
+
+  Returns:
+    True if the next character appended to 'line' would be inside a
+    string constant.
+  """
+
+  line = line.replace(r'\\', 'XX')  # after this, \\" does not match \"
+  return ((line.count('"') - line.count(r'\"') - line.count("'\"'")) & 1) == 1
+
+
+def CleanseRawStrings(raw_lines):
+  """Removes C++11 raw strings from lines.
+
+    Before:
+      static const char kData[] = R"(
+          multi-line string
+          )";
+
+    After:
+      static const char kData[] = ""
+          (replaced by blank line)
+          "";
+
+  Args:
+    raw_lines: list of raw lines.
+
+  Returns:
+    list of lines with C++11 raw strings replaced by empty strings.
+  """
+
+  delimiter = None
+  lines_without_raw_strings = []
+  for line in raw_lines:
+    if delimiter:
+      # Inside a raw string, look for the end
+      end = line.find(delimiter)
+      if end >= 0:
+        # Found the end of the string, match leading space for this
+        # line and resume copying the original lines, and also insert
+        # a "" on the last line.
+        leading_space = Match(r'^(\s*)\S', line)
+        line = leading_space.group(1) + '""' + line[end + len(delimiter):]
+        delimiter = None
+      else:
+        # Haven't found the end yet, append a blank line.
+        line = '""'
+
+    # Look for the beginning of a raw string, and replace it with an
+    # empty string.  This is done in a loop to handle multiple raw
+    # strings on the same line.
+    while delimiter is None:
+      # Look for beginning of a raw string.
+      # See 2.14.15 [lex.string] for syntax.
+      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if matched:
+        delimiter = ')' + matched.group(2) + '"'
+
+        end = matched.group(3).find(delimiter)
+        if end >= 0:
+          # Raw string ended on same line
+          line = (matched.group(1) + '""' +
+                  matched.group(3)[end + len(delimiter):])
+          delimiter = None
+        else:
+          # Start of a multi-line raw string
+          line = matched.group(1) + '""'
+      else:
+        break
+
+    lines_without_raw_strings.append(line)
+
+  # TODO(unknown): if delimiter is not None here, we might want to
+  # emit a warning for unterminated string.
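+  # Worked example (illustrative input, not from the original source):
+  #   CleanseRawStrings(['s = R"x(first', 'second)x";'])
+  # returns
+  #   ['s = ""', '"";']
+  # i.e. the opening line keeps its prefix with the raw string collapsed
+  # to "", and the closing line keeps everything after the )x" delimiter.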
+
+  return lines_without_raw_strings
+
+
+def FindNextMultiLineCommentStart(lines, lineix):
+  """Find the beginning marker for a multiline comment."""
+  while lineix < len(lines):
+    if lines[lineix].strip().startswith('/*'):
+      # Only return this marker if the comment goes beyond this line
+      if lines[lineix].strip().find('*/', 2) < 0:
+        return lineix
+    lineix += 1
+  return len(lines)
+
+
+def FindNextMultiLineCommentEnd(lines, lineix):
+  """We are inside a comment, find the end marker."""
+  while lineix < len(lines):
+    if lines[lineix].strip().endswith('*/'):
+      return lineix
+    lineix += 1
+  return len(lines)
+
+
+def RemoveMultiLineCommentsFromRange(lines, begin, end):
+  """Clears a range of lines for multi-line comments."""
+  # Having /**/ dummy comments makes the lines non-empty, so we will not get
+  # unnecessary blank line warnings later in the code.
+  for i in range(begin, end):
+    lines[i] = '/**/'
+
+
+def RemoveMultiLineComments(filename, lines, error):
+  """Removes multiline (C-style) comments from lines."""
+  lineix = 0
+  while lineix < len(lines):
+    lineix_begin = FindNextMultiLineCommentStart(lines, lineix)
+    if lineix_begin >= len(lines):
+      return
+    lineix_end = FindNextMultiLineCommentEnd(lines, lineix_begin)
+    if lineix_end >= len(lines):
+      error(filename, lineix_begin + 1, 'readability/multiline_comment', 5,
+            'Could not find end of multi-line comment')
+      return
+    RemoveMultiLineCommentsFromRange(lines, lineix_begin, lineix_end + 1)
+    lineix = lineix_end + 1
+
+
+def CleanseComments(line):
+  """Removes //-comments and single-line C-style /* */ comments.
+
+  Args:
+    line: A line of C++ source.
+
+  Returns:
+    The line with single-line comments removed.
+  """
+  commentpos = line.find('//')
+  if commentpos != -1 and not IsCppString(line[:commentpos]):
+    line = line[:commentpos].rstrip()
+  # get rid of /* ... */
+  return _RE_PATTERN_CLEANSE_LINE_C_COMMENTS.sub('', line)
+
+
+class CleansedLines(object):
+  """Holds 4 copies of all lines with different preprocessing applied to them.
+
+  1) elided member contains lines without strings and comments.
+  2) lines member contains lines without comments.
+  3) raw_lines member contains all the lines without processing.
+  4) lines_without_raw_strings member is same as raw_lines, but with C++11 raw
+     strings removed.
+  All these members are of <type 'list'>, and of the same length.
+  """
+
+  def __init__(self, lines):
+    self.elided = []
+    self.lines = []
+    self.raw_lines = lines
+    self.num_lines = len(lines)
+    self.lines_without_raw_strings = CleanseRawStrings(lines)
+    for linenum in range(len(self.lines_without_raw_strings)):
+      self.lines.append(CleanseComments(
+          self.lines_without_raw_strings[linenum]))
+      elided = self._CollapseStrings(self.lines_without_raw_strings[linenum])
+      self.elided.append(CleanseComments(elided))
+
+  def NumLines(self):
+    """Returns the number of lines represented."""
+    return self.num_lines
+
+  @staticmethod
+  def _CollapseStrings(elided):
+    """Collapses strings and chars on a line to simple "" or '' blocks.
+
+    We nix strings first so we're not fooled by text like '"http://"'
+
+    Args:
+      elided: The line being processed.
+
+    Returns:
+      The line with collapsed strings.
+    """
+    if _RE_PATTERN_INCLUDE.match(elided):
+      return elided
+
+    # Remove escaped characters first to make quote/single quote collapsing
+    # basic.  Things that look like escaped characters shouldn't occur
+    # outside of strings and chars.
+    elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+
+    # Replace quoted strings and digit separators.
Both single quotes + # and double quotes are processed in the same loop, otherwise + # nested quotes wouldn't work. + collapsed = '' + while True: + # Find the first quote character + match = Match(r'^([^\'"]*)([\'"])(.*)$', elided) + if not match: + collapsed += elided + break + head, quote, tail = match.groups() + + if quote == '"': + # Collapse double quoted strings + second_quote = tail.find('"') + if second_quote >= 0: + collapsed += head + '""' + elided = tail[second_quote + 1:] + else: + # Unmatched double quote, don't bother processing the rest + # of the line since this is probably a multiline string. + collapsed += elided + break + else: + # Found single quote, check nearby text to eliminate digit separators. + # + # There is no special handling for floating point here, because + # the integer/fractional/exponent parts would all be parsed + # correctly as long as there are digits on both sides of the + # separator. So we are fine as long as we don't see something + # like "0.'3" (gcc 4.9.0 will not allow this literal). + if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head): + match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail) + collapsed += head + match_literal.group(1).replace("'", '') + elided = match_literal.group(2) + else: + second_quote = tail.find('\'') + if second_quote >= 0: + collapsed += head + "''" + elided = tail[second_quote + 1:] + else: + # Unmatched single quote + collapsed += elided + break + + return collapsed + + +def FindEndOfExpressionInLine(line, startpos, stack): + """Find the position just after the end of current parenthesized expression. + + Args: + line: a CleansedLines line. + startpos: start searching at this position. + stack: nesting stack at startpos. + + Returns: + On finding matching end: (index just after matching end, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at end of this line) + """ + for i in xrange(startpos, len(line)): + char = line[i] + if char in '([{': + # Found start of parenthesized expression, push to expression stack + stack.append(char) + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + if stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + elif i > 0 and Search(r'\boperator\s*$', line[0:i]): + # operator<, don't add to stack + continue + else: + # Tentative start of template argument list + stack.append('<') + elif char in ')]}': + # Found end of parenthesized expression. + # + # If we are currently expecting a matching '>', the pending '<' + # must have been an operator. Remove them from expression stack. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + if ((stack[-1] == '(' and char == ')') or + (stack[-1] == '[' and char == ']') or + (stack[-1] == '{' and char == '}')): + stack.pop() + if not stack: + return (i + 1, None) + else: + # Mismatched parentheses + return (-1, None) + elif char == '>': + # Found potential end of template argument list. + + # Ignore "->" and operator functions + if (i > 0 and + (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))): + continue + + # Pop the stack if there is a matching '<'. Otherwise, ignore + # this '>' since it must be an operator. + if stack: + if stack[-1] == '<': + stack.pop() + if not stack: + return (i + 1, None) + elif char == ';': + # Found something that look like end of statements. 
If we are currently + # expecting a '>', the matching '<' must have been an operator, since + # template argument list should not contain statements. + while stack and stack[-1] == '<': + stack.pop() + if not stack: + return (-1, None) + + # Did not find end of expression or unbalanced parentheses on this line + return (-1, stack) + + +def CloseExpression(clean_lines, linenum, pos): + """If input points to ( or { or [ or <, finds the position that closes it. + + If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the + linenum/pos that correspond to the closing of the expression. + + TODO(unknown): cpplint spends a fair bit of time matching parentheses. + Ideally we would want to index all opening and closing parentheses once + and have CloseExpression be just a simple lookup, but due to preprocessor + tricks, this is not so easy. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: A position on the line. + + Returns: + A tuple (line, linenum, pos) pointer *past* the closing brace, or + (line, len(lines), -1) if we never find a close. Note we ignore + strings and comments when matching; and the line we return is the + 'cleansed' line at linenum. + """ + + line = clean_lines.elided[linenum] + if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]): + return (line, clean_lines.NumLines(), -1) + + # Check first line + (end_pos, stack) = FindEndOfExpressionInLine(line, pos, []) + if end_pos > -1: + return (line, linenum, end_pos) + + # Continue scanning forward + while stack and linenum < clean_lines.NumLines() - 1: + linenum += 1 + line = clean_lines.elided[linenum] + (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack) + if end_pos > -1: + return (line, linenum, end_pos) + + # Did not find end of expression before end of file, give up + return (line, clean_lines.NumLines(), -1) + + +def FindStartOfExpressionInLine(line, endpos, stack): + """Find position at the matching start of current expression. + + This is almost the reverse of FindEndOfExpressionInLine, but note + that the input position and returned position differs by 1. + + Args: + line: a CleansedLines line. + endpos: start searching at this position. + stack: nesting stack at endpos. + + Returns: + On finding matching start: (index at matching start, None) + On finding an unclosed expression: (-1, None) + Otherwise: (-1, new stack at beginning of this line) + """ + i = endpos + while i >= 0: + char = line[i] + if char in ')]}': + # Found end of expression, push to expression stack + stack.append(char) + elif char == '>': + # Found potential end of template argument list. + # + # Ignore it if it's a "->" or ">=" or "operator>" + if (i > 0 and + (line[i - 1] == '-' or + Match(r'\s>=\s', line[i - 1:]) or + Search(r'\boperator\s*$', line[0:i]))): + i -= 1 + else: + stack.append('>') + elif char == '<': + # Found potential start of template argument list + if i > 0 and line[i - 1] == '<': + # Left shift operator + i -= 1 + else: + # If there is a matching '>', we can pop the expression stack. + # Otherwise, ignore this '<' since it must be an operator. + if stack and stack[-1] == '>': + stack.pop() + if not stack: + return (i, None) + elif char in '([{': + # Found start of expression. + # + # If there are any unmatched '>' on the stack, they must be + # operators. Remove those. 
+
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that looks like the end of a statement.  If we are
+      # currently expecting a '<', the matching '>' must have been an
+      # operator, since template argument lists should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
+
+
+def ReverseCloseExpression(clean_lines, linenum, pos):
+  """If input points to ) or } or ] or >, finds the position that opens it.
+
+  If lines[linenum][pos] points to a ')' or '}' or ']' or '>', finds the
+  linenum/pos that correspond to the opening of the expression.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    pos: A position on the line.
+
+  Returns:
+    A tuple (line, linenum, pos) pointer *at* the opening brace, or
+    (line, 0, -1) if we never find the matching opening brace.  Note
+    we ignore strings and comments when matching; and the line we
+    return is the 'cleansed' line at linenum.
+  """
+  line = clean_lines.elided[linenum]
+  if line[pos] not in ')}]>':
+    return (line, 0, -1)
+
+  # Check last line
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
+  if start_pos > -1:
+    return (line, linenum, start_pos)
+
+  # Continue scanning backward
+  while stack and linenum > 0:
+    linenum -= 1
+    line = clean_lines.elided[linenum]
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
+    if start_pos > -1:
+      return (line, linenum, start_pos)
+
+  # Did not find start of expression before beginning of file, give up
+  return (line, 0, -1)
+
+
+def CheckForCopyright(filename, lines, error):
+  """Logs an error if no Copyright message appears at the top of the file."""
+
+  # We'll say it should occur by line 10. Don't forget there's a
+  # dummy line at the front.
+  for line in xrange(1, min(len(lines), 11)):
+    if re.search(r'Copyright', lines[line], re.I): break
+  else:                       # means no copyright line was found
+    error(filename, 0, 'legal/copyright', 5,
+          'No copyright message found.  '
+          'You should have a line: "Copyright [year] <Copyright Owner>"')
+
+
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+
+def GetHeaderGuardCPPVariable(filename):
+  """Returns the CPP variable that should be used as a header guard.
+
+  Args:
+    filename: The name of a C++ header file.
+
+  Returns:
+    The CPP variable that should be used as a header guard in the
+    named file.
+
+  """
+
+  # Restores the original filename in case cpplint is invoked from Emacs's
+  # flymake.
+  filename = re.sub(r'_flymake\.h$', '.h', filename)
+  filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
+ filename = filename.replace('C++', 'cpp').replace('c++', 'cpp') + + fileinfo = FileInfo(filename) + file_path_from_root = fileinfo.RepositoryName() + if _root: + file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root) + return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_' + + +def CheckForHeaderGuard(filename, clean_lines, error): + """Checks that the file contains a header guard. + + Logs an error if no #ifndef header guard is present. For other + headers, checks that the full pathname is used. + + Args: + filename: The name of the C++ header file. + clean_lines: A CleansedLines instance containing the file. + error: The function to call with any errors found. + """ + + # Don't check for header guards if there are error suppression + # comments somewhere in this file. + # + # Because this is silencing a warning for a nonexistent line, we + # only support the very specific NOLINT(build/header_guard) syntax, + # and not the general NOLINT or NOLINT(*) syntax. + raw_lines = clean_lines.lines_without_raw_strings + for i in raw_lines: + if Search(r'//\s*NOLINT\(build/header_guard\)', i): + return + + cppvar = GetHeaderGuardCPPVariable(filename) + + ifndef = '' + ifndef_linenum = 0 + define = '' + endif = '' + endif_linenum = 0 + for linenum, line in enumerate(raw_lines): + linesplit = line.split() + if len(linesplit) >= 2: + # find the first occurrence of #ifndef and #define, save arg + if not ifndef and linesplit[0] == '#ifndef': + # set ifndef to the header guard presented on the #ifndef line. + ifndef = linesplit[1] + ifndef_linenum = linenum + if not define and linesplit[0] == '#define': + define = linesplit[1] + # find the last occurrence of #endif, save entire line + if line.startswith('#endif'): + endif = line + endif_linenum = linenum + + if not ifndef or not define or ifndef != define: + error(filename, 0, 'build/header_guard', 5, + 'No #ifndef header guard found, suggested CPP variable is: %s' % + cppvar) + return + + # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__ + # for backward compatibility. + if ifndef != cppvar: + error_level = 0 + if ifndef != cppvar + '_': + error_level = 5 + + ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum, + error) + error(filename, ifndef_linenum, 'build/header_guard', error_level, + '#ifndef header guard has wrong style, please use: %s' % cppvar) + + # Check for "//" comments on endif line. + ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum, + error) + match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif) + if match: + if match.group(1) == '_': + # Issue low severity warning for deprecated double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif // %s"' % cppvar) + return + + # Didn't find the corresponding "//" comment. If this file does not + # contain any "//" comments at all, it could be that the compiler + # only wants "/**/" comments, look for those instead. 
+ no_single_line_comments = True + for i in xrange(1, len(raw_lines) - 1): + line = raw_lines[i] + if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line): + no_single_line_comments = False + break + + if no_single_line_comments: + match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif) + if match: + if match.group(1) == '_': + # Low severity warning for double trailing underscore + error(filename, endif_linenum, 'build/header_guard', 0, + '#endif line should be "#endif /* %s */"' % cppvar) + return + + # Didn't find anything + error(filename, endif_linenum, 'build/header_guard', 5, + '#endif line should be "#endif // %s"' % cppvar) + + +def CheckHeaderFileIncluded(filename, include_state, error): + """Logs an error if a .cc file does not include its header.""" + + # Do not check test files + if filename.endswith('_test.cc') or filename.endswith('_unittest.cc'): + return + + fileinfo = FileInfo(filename) + headerfile = filename[0:len(filename) - 2] + 'h' + if not os.path.exists(headerfile): + return + headername = FileInfo(headerfile).RepositoryName() + first_include = 0 + for section_list in include_state.include_list: + for f in section_list: + if headername in f[0] or f[0] in headername: + return + if not first_include: + first_include = f[1] + + error(filename, first_include, 'build/include', 5, + '%s should include its header file %s' % (fileinfo.RepositoryName(), + headername)) + + +def CheckForBadCharacters(filename, lines, error): + """Logs an error for each line containing bad characters. + + Two kinds of bad characters: + + 1. Unicode replacement characters: These indicate that either the file + contained invalid UTF-8 (likely) or Unicode replacement characters (which + it shouldn't). Note that it's possible for this to throw off line + numbering if the invalid UTF-8 occurred adjacent to a newline. + + 2. NUL bytes. These are problematic for some tools. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + for linenum, line in enumerate(lines): + if u'\ufffd' in line: + error(filename, linenum, 'readability/utf8', 5, + 'Line contains invalid UTF-8 (or Unicode replacement character).') + if '\0' in line: + error(filename, linenum, 'readability/nul', 5, 'Line contains NUL byte.') + + +def CheckForNewlineAtEOF(filename, lines, error): + """Logs an error if there is no newline char at the end of the file. + + Args: + filename: The name of the current file. + lines: An array of strings, each representing a line of the file. + error: The function to call with any errors found. + """ + + # The array lines() was created by adding two newlines to the + # original file (go figure), then splitting on \n. + # To verify that the file ends in \n, we just have to make sure the + # last-but-two element of lines() exists and is empty. + if len(lines) < 3 or lines[-2]: + error(filename, len(lines) - 2, 'whitespace/ending_newline', 5, + 'Could not find a newline character at the end of the file.') + + +def CheckForMultilineCommentsAndStrings(filename, clean_lines, linenum, error): + """Logs an error if we see /* ... */ or "..." that extend past one line. + + /* ... */ comments are legit inside macros, for one line. + Otherwise, we prefer // comments, so it's ok to warn about the + other. Likewise, it's ok for strings to extend across multiple + lines, as long as a line continuation character (backslash) + terminates each line. 
Although not currently prohibited by the C++
+  style guide, it's ugly and unnecessary. We don't do well with either
+  in this lint program, so we warn about both.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Remove all \\ (escaped backslashes) from the line. They are OK, and the
+  # second (escaped) slash may trigger later \" detection erroneously.
+  line = line.replace('\\\\', '')
+
+  if line.count('/*') > line.count('*/'):
+    error(filename, linenum, 'readability/multiline_comment', 5,
+          'Complex multi-line /*...*/-style comment found. '
+          'Lint may give bogus warnings.  '
+          'Consider replacing these with //-style comments, '
+          'with #if 0...#endif, '
+          'or with more clearly structured multi-line comments.')
+
+  if (line.count('"') - line.count('\\"')) % 2:
+    error(filename, linenum, 'readability/multiline_string', 5,
+          'Multi-line string ("...") found.  This lint script doesn\'t '
+          'do well with such strings, and may give bogus warnings.  '
+          'Use C++11 raw strings or concatenation instead.')
+
+
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+#  _rand();               // false positive due to substring match.
+#  ->rand();              // some member function rand().
+#  ACMRandom rand(seed);  // some variable named rand.
+#  ISAACRandom rand();    // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name.  This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+    ('strtok(', 'strtok_r(',
+     _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
+    )
+
+
+def CheckPosixThreading(filename, clean_lines, linenum, error):
+  """Checks for calls to thread-unsafe functions.
+
+  Much code was originally written without consideration of
+  multi-threading.  Also, engineers often rely on old experience;
+  they learned POSIX before the threading extensions were added.  These
+  checks guide engineers to use thread-safe functions (when using
+  POSIX directly).
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+
+  """
+  line = clean_lines.elided[linenum]
+  for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
+    # Additional pattern matching check to confirm that this is the
+    # function we are looking for
+    if Search(pattern, line):
+      error(filename, linenum, 'runtime/threadsafe_fn', 2,
+            'Consider using ' + multithread_safe_func +
+            '...) instead of ' + single_thread_func +
+            '...) for improved thread safety.')
+
+
+def CheckVlogArguments(filename, clean_lines, linenum, error):
+  """Checks that VLOG() is only used for defining a logging level.
+
+  For example, VLOG(2) is correct.  VLOG(INFO), VLOG(WARNING), VLOG(ERROR), and
+  VLOG(FATAL) are not.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if Search(r'\bVLOG\((INFO|ERROR|WARNING|DFATAL|FATAL)\)', line):
+    error(filename, linenum, 'runtime/vlog', 5,
+          'VLOG() should be used with numeric verbosity level.  '
+          'Use LOG() if you want symbolic severity levels.')
+
+# Matches invalid increment: *count++, which moves the pointer instead of
+# incrementing the value.
+_RE_PATTERN_INVALID_INCREMENT = re.compile(
+    r'^\s*\*\w+(\+\+|--);')
+
+
+def CheckInvalidIncrement(filename, clean_lines, linenum, error):
+  """Checks for invalid increment *count++.
+
+  For example, the following function:
+    void increment_counter(int* count) {
+      *count++;
+    }
+  is invalid, because it effectively does count++, moving the pointer, and
+  should be replaced with ++*count, (*count)++ or *count += 1.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+  if _RE_PATTERN_INVALID_INCREMENT.match(line):
+    error(filename, linenum, 'runtime/invalid_increment', 5,
+          'Changing pointer instead of value (or unused value of operator*).')
+
+
+def IsMacroDefinition(clean_lines, linenum):
+  if Search(r'^#define', clean_lines[linenum]):
+    return True
+
+  if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
+    return True
+
+  return False
+
+
+def IsForwardClassDeclaration(clean_lines, linenum):
+  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
+
+
+class _BlockInfo(object):
+  """Stores information about a generic block of code."""
+
+  def __init__(self, seen_open_brace):
+    self.seen_open_brace = seen_open_brace
+    self.open_parentheses = 0
+    self.inline_asm = _NO_ASM
+    self.check_namespace_indentation = False
+
+  def CheckBegin(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text up to the opening brace.
+
+    This is mostly for checking the text after the class identifier
+    and the "{", usually where the base class is specified.  For other
+    blocks, there isn't much to check, so we always pass.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      error: The function to call with any errors found.
+    """
+    pass
+
+  def CheckEnd(self, filename, clean_lines, linenum, error):
+    """Run checks that apply to text after the closing brace.
+
+    This is mostly used for checking end of namespace comments.
+
+    Args:
+      filename: The name of the current file.
+      clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check. + error: The function to call with any errors found. + """ + pass + + def IsBlockInfo(self): + """Returns true if this block is a _BlockInfo. + + This is convenient for verifying that an object is an instance of + a _BlockInfo, but not an instance of any of the derived classes. + + Returns: + True for this class, False for derived classes. + """ + return self.__class__ == _BlockInfo + + +class _ExternCInfo(_BlockInfo): + """Stores information about an 'extern "C"' block.""" + + def __init__(self): + _BlockInfo.__init__(self, True) + + +class _ClassInfo(_BlockInfo): + """Stores information about a class.""" + + def __init__(self, name, class_or_struct, clean_lines, linenum): + _BlockInfo.__init__(self, False) + self.name = name + self.starting_linenum = linenum + self.is_derived = False + self.check_namespace_indentation = True + if class_or_struct == 'struct': + self.access = 'public' + self.is_struct = True + else: + self.access = 'private' + self.is_struct = False + + # Remember initial indentation level for this class. Using raw_lines here + # instead of elided to account for leading comments. + self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum]) + + # Try to find the end of the class. This will be confused by things like: + # class A { + # } *x = { ... + # + # But it's still good enough for CheckSectionSpacing. + self.last_line = 0 + depth = 0 + for i in range(linenum, clean_lines.NumLines()): + line = clean_lines.elided[i] + depth += line.count('{') - line.count('}') + if not depth: + self.last_line = i + break + + def CheckBegin(self, filename, clean_lines, linenum, error): + # Look for a bare ':' + if Search('(^|[^:]):($|[^:])', clean_lines.elided[linenum]): + self.is_derived = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + # If there is a DISALLOW macro, it should appear near the end of + # the class. + seen_last_thing_in_class = False + for i in xrange(linenum - 1, self.starting_linenum, -1): + match = Search( + r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' + + self.name + r'\)', + clean_lines.elided[i]) + if match: + if seen_last_thing_in_class: + error(filename, i, 'readability/constructors', 3, + match.group(1) + ' should be the last thing in the class') + break + + if not Match(r'^\s*$', clean_lines.elided[i]): + seen_last_thing_in_class = True + + # Check that closing brace is aligned with beginning of the class. + # Only do this if the closing brace is indented by only whitespaces. + # This means we will not check single-line class definitions. + indent = Match(r'^( *)\}', clean_lines.elided[linenum]) + if indent and len(indent.group(1)) != self.class_indent: + if self.is_struct: + parent = 'struct ' + self.name + else: + parent = 'class ' + self.name + error(filename, linenum, 'whitespace/indent', 3, + 'Closing brace should be aligned with beginning of %s' % parent) + + +class _NamespaceInfo(_BlockInfo): + """Stores information about a namespace.""" + + def __init__(self, name, linenum): + _BlockInfo.__init__(self, False) + self.name = name or '' + self.starting_linenum = linenum + self.check_namespace_indentation = True + + def CheckEnd(self, filename, clean_lines, linenum, error): + """Check end of namespace comments.""" + line = clean_lines.raw_lines[linenum] + + # Check how many lines is enclosed in this namespace. Don't issue + # warning for missing namespace comments if there aren't enough + # lines. 
However, do apply checks if there is already an end of + # namespace comment and it's incorrect. + # + # TODO(unknown): We always want to check end of namespace comments + # if a namespace is large, but sometimes we also want to apply the + # check if a short namespace contained nontrivial things (something + # other than forward declarations). There is currently no logic on + # deciding what these nontrivial things are, so this check is + # triggered by namespace size only, which works most of the time. + if (linenum - self.starting_linenum < 10 + and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)): + return + + # Look for matching comment at end of namespace. + # + # Note that we accept C style "/* */" comments for terminating + # namespaces, so that code that terminate namespaces inside + # preprocessor macros can be cpplint clean. + # + # We also accept stuff like "// end of namespace ." with the + # period at the end. + # + # Besides these, we don't accept anything else, otherwise we might + # get false negatives when existing comment is a substring of the + # expected namespace. + if self.name: + # Named namespace + if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) + + r'[\*/\.\\\s]*$'), + line): + error(filename, linenum, 'readability/namespace', 5, + 'Namespace should be terminated with "// namespace %s"' % + self.name) + else: + # Anonymous namespace + if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line): + # If "// namespace anonymous" or "// anonymous namespace (more text)", + # mention "// anonymous namespace" as an acceptable form + if Match(r'}.*\b(namespace anonymous|anonymous namespace)\b', line): + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"' + ' or "// anonymous namespace"') + else: + error(filename, linenum, 'readability/namespace', 5, + 'Anonymous namespace should be terminated with "// namespace"') + + +class _PreprocessorInfo(object): + """Stores checkpoints of nesting stacks when #if/#else is seen.""" + + def __init__(self, stack_before_if): + # The entire nesting stack before #if + self.stack_before_if = stack_before_if + + # The entire nesting stack up to #else + self.stack_before_else = [] + + # Whether we have already seen #else or #elif + self.seen_else = False + + +class NestingState(object): + """Holds states related to parsing braces.""" + + def __init__(self): + # Stack for tracking all braces. An object is pushed whenever we + # see a "{", and popped when we see a "}". Only 3 types of + # objects are possible: + # - _ClassInfo: a class or struct. + # - _NamespaceInfo: a namespace. + # - _BlockInfo: some other type of block. + self.stack = [] + + # Top of the previous stack before each Update(). + # + # Because the nesting_stack is updated at the end of each line, we + # had to do some convoluted checks to find out what is the current + # scope at the beginning of the line. This check is simplified by + # saving the previous top of nesting stack. + # + # We could save the full stack, but we only need the top. Copying + # the full nesting stack would slow down cpplint by ~10%. + self.previous_stack_top = [] + + # Stack of _PreprocessorInfo objects. + self.pp_stack = [] + + def SeenOpenBrace(self): + """Check if we have seen the opening brace for the innermost block. + + Returns: + True if we have seen the opening brace, False if the innermost + block is still expecting an opening brace. 
+ """ + return (not self.stack) or self.stack[-1].seen_open_brace + + def InNamespaceBody(self): + """Check if we are currently one level inside a namespace body. + + Returns: + True if top of the stack is a namespace block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _NamespaceInfo) + + def InExternC(self): + """Check if we are currently one level inside an 'extern "C"' block. + + Returns: + True if top of the stack is an extern block, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ExternCInfo) + + def InClassDeclaration(self): + """Check if we are currently one level inside a class or struct declaration. + + Returns: + True if top of the stack is a class/struct, False otherwise. + """ + return self.stack and isinstance(self.stack[-1], _ClassInfo) + + def InAsmBlock(self): + """Check if we are currently one level inside an inline ASM block. + + Returns: + True if the top of the stack is a block containing inline ASM. + """ + return self.stack and self.stack[-1].inline_asm != _NO_ASM + + def InTemplateArgumentList(self, clean_lines, linenum, pos): + """Check if current position is inside template argument list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + pos: position just after the suspected template argument. + Returns: + True if (linenum, pos) is inside template arguments. + """ + while linenum < clean_lines.NumLines(): + # Find the earliest character that might indicate a template argument + line = clean_lines.elided[linenum] + match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:]) + if not match: + linenum += 1 + pos = 0 + continue + token = match.group(1) + pos += len(match.group(0)) + + # These things do not look like template argument list: + # class Suspect { + # class Suspect x; } + if token in ('{', '}', ';'): return False + + # These things look like template argument list: + # template + # template + # template + # template + if token in ('>', '=', '[', ']', '.'): return True + + # Check if token is an unmatched '<'. + # If not, move on to the next character. + if token != '<': + pos += 1 + if pos >= len(line): + linenum += 1 + pos = 0 + continue + + # We can't be sure if we just find a single '<', and need to + # find the matching '>'. + (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1) + if end_pos < 0: + # Not sure if template argument list or syntax error in file + return False + linenum = end_line + pos = end_pos + return False + + def UpdatePreprocessor(self, line): + """Update preprocessor stack. + + We need to handle preprocessors due to classes like this: + #ifdef SWIG + struct ResultDetailsPageElementExtensionPoint { + #else + struct ResultDetailsPageElementExtensionPoint : public Extension { + #endif + + We make the following assumptions (good enough for most files): + - Preprocessor condition evaluates to true from #if up to first + #else/#elif/#endif. + + - Preprocessor condition evaluates to false from #else/#elif up + to #endif. We still perform lint checks on these lines, but + these do not affect nesting stack. + + Args: + line: current line to check. + """ + if Match(r'^\s*#\s*(if|ifdef|ifndef)\b', line): + # Beginning of #if block, save the nesting stack here. The saved + # stack will allow us to restore the parsing state in the #else case. 
+ self.pp_stack.append(_PreprocessorInfo(copy.deepcopy(self.stack))) + elif Match(r'^\s*#\s*(else|elif)\b', line): + # Beginning of #else block + if self.pp_stack: + if not self.pp_stack[-1].seen_else: + # This is the first #else or #elif block. Remember the + # whole nesting stack up to this point. This is what we + # keep after the #endif. + self.pp_stack[-1].seen_else = True + self.pp_stack[-1].stack_before_else = copy.deepcopy(self.stack) + + # Restore the stack to how it was before the #if + self.stack = copy.deepcopy(self.pp_stack[-1].stack_before_if) + else: + # TODO(unknown): unexpected #else, issue warning? + pass + elif Match(r'^\s*#\s*endif\b', line): + # End of #if or #else blocks. + if self.pp_stack: + # If we saw an #else, we will need to restore the nesting + # stack to its former state before the #else, otherwise we + # will just continue from where we left off. + if self.pp_stack[-1].seen_else: + # Here we can just use a shallow copy since we are the last + # reference to it. + self.stack = self.pp_stack[-1].stack_before_else + # Drop the corresponding #if + self.pp_stack.pop() + else: + # TODO(unknown): unexpected #endif, issue warning? + pass + + # TODO(unknown): Update() is too long, but we will refactor later. + def Update(self, filename, clean_lines, linenum, error): + """Update nesting state with current line. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Remember top of the previous nesting stack. + # + # The stack is always pushed/popped and not modified in place, so + # we can just do a shallow copy instead of copy.deepcopy. Using + # deepcopy would slow down cpplint by ~28%. + if self.stack: + self.previous_stack_top = self.stack[-1] + else: + self.previous_stack_top = None + + # Update pp_stack + self.UpdatePreprocessor(line) + + # Count parentheses. This is to avoid adding struct arguments to + # the nesting stack. + if self.stack: + inner_block = self.stack[-1] + depth_change = line.count('(') - line.count(')') + inner_block.open_parentheses += depth_change + + # Also check if we are starting or ending an inline assembly block. + if inner_block.inline_asm in (_NO_ASM, _END_ASM): + if (depth_change != 0 and + inner_block.open_parentheses == 1 and + _MATCH_ASM.match(line)): + # Enter assembly block + inner_block.inline_asm = _INSIDE_ASM + else: + # Not entering assembly block. If previous line was _END_ASM, + # we will now shift to _NO_ASM state. + inner_block.inline_asm = _NO_ASM + elif (inner_block.inline_asm == _INSIDE_ASM and + inner_block.open_parentheses == 0): + # Exit assembly block + inner_block.inline_asm = _END_ASM + + # Consume namespace declaration at the beginning of the line. Do + # this in a loop so that we catch same line declarations like this: + # namespace proto2 { namespace bridge { class MessageSet; } } + while True: + # Match start of namespace. The "\b\s*" below catches namespace + # declarations even if it weren't followed by a whitespace, this + # is so that we don't confuse our namespace checker. The + # missing spaces will be flagged by CheckSpacing. 
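# Illustrative sketch (not part of cpplint itself): how the loop below peels
# nested same-line namespace declarations off a single line, using the same
# regex. Each pass consumes one "namespace <name> {" prefix.
import re

line = 'namespace proto2 { namespace bridge { class MessageSet; } }'
names = []
while True:
    m = re.match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line)
    if not m:
        break
    names.append(m.group(1))
    line = m.group(2)
    if '{' in line:
        line = line[line.find('{') + 1:]
print(names)  # ['proto2', 'bridge']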
+ namespace_decl_match = Match(r'^\s*namespace\b\s*([:\w]+)?(.*)$', line) + if not namespace_decl_match: + break + + new_namespace = _NamespaceInfo(namespace_decl_match.group(1), linenum) + self.stack.append(new_namespace) + + line = namespace_decl_match.group(2) + if line.find('{') != -1: + new_namespace.seen_open_brace = True + line = line[line.find('{') + 1:] + + # Look for a class declaration in whatever is left of the line + # after parsing namespaces. The regexp accounts for decorated classes + # such as in: + # class LOCKABLE API Object { + # }; + class_decl_match = Match( + r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?' + r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))' + r'(.*)$', line) + if (class_decl_match and + (not self.stack or self.stack[-1].open_parentheses == 0)): + # We do not want to accept classes that are actually template arguments: + # template , + # template class Ignore3> + # void Function() {}; + # + # To avoid template argument cases, we scan forward and look for + # an unmatched '>'. If we see one, assume we are inside a + # template argument list. + end_declaration = len(class_decl_match.group(1)) + if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration): + self.stack.append(_ClassInfo( + class_decl_match.group(3), class_decl_match.group(2), + clean_lines, linenum)) + line = class_decl_match.group(4) + + # If we have not yet seen the opening brace for the innermost block, + # run checks here. + if not self.SeenOpenBrace(): + self.stack[-1].CheckBegin(filename, clean_lines, linenum, error) + + # Update access control if we are inside a class/struct + if self.stack and isinstance(self.stack[-1], _ClassInfo): + classinfo = self.stack[-1] + access_match = Match( + r'^(.*)\b(public|private|protected|signals)(\s+(?:slots\s*)?)?' + r':(?:[^:]|$)', + line) + if access_match: + classinfo.access = access_match.group(2) + + # Check that access keywords are indented +1 space. Skip this + # check if the keywords are not preceded by whitespaces. + indent = access_match.group(1) + if (len(indent) != classinfo.class_indent + 1 and + Match(r'^\s*$', indent)): + if classinfo.is_struct: + parent = 'struct ' + classinfo.name + else: + parent = 'class ' + classinfo.name + slots = '' + if access_match.group(3): + slots = access_match.group(3) + error(filename, linenum, 'whitespace/indent', 3, + '%s%s: should be indented +1 space inside %s' % ( + access_match.group(2), slots, parent)) + + # Consume braces or semicolons from what's left of the line + while True: + # Match first brace, semicolon, or closed parenthesis. + matched = Match(r'^[^{;)}]*([{;)}])(.*)$', line) + if not matched: + break + + token = matched.group(1) + if token == '{': + # If namespace or class hasn't seen a opening brace yet, mark + # namespace/class head as complete. Push a new block onto the + # stack otherwise. + if not self.SeenOpenBrace(): + self.stack[-1].seen_open_brace = True + elif Match(r'^extern\s*"[^"]*"\s*\{', line): + self.stack.append(_ExternCInfo()) + else: + self.stack.append(_BlockInfo(True)) + if _MATCH_ASM.match(line): + self.stack[-1].inline_asm = _BLOCK_ASM + + elif token == ';' or token == ')': + # If we haven't seen an opening brace yet, but we already saw + # a semicolon, this is probably a forward declaration. Pop + # the stack for these. + # + # Similarly, if we haven't seen an opening brace yet, but we + # already saw a closing parenthesis, then these are probably + # function arguments with extra "class" or "struct" keywords. + # Also pop these stack for these. 
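# Illustrative sketch (not part of cpplint itself) of the token scan in the
# loop above: for function arguments carrying a spurious "class" keyword,
# the ')' token is exactly what triggers the pop described in the
# preceding comment.
import re

line = '(class Foo* p);'  # what remains of 'void f(class Foo* p);'
tokens = []
while True:
    m = re.match(r'^[^{;)}]*([{;)}])(.*)$', line)
    if not m:
        break
    tokens.append(m.group(1))
    line = m.group(2)
print(tokens)  # [')', ';'] -- the ')' pops the bogus _ClassInfo entry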
+ if not self.SeenOpenBrace():
+ self.stack.pop()
+ else: # token == '}'
+ # Perform end of block checks and pop the stack.
+ if self.stack:
+ self.stack[-1].CheckEnd(filename, clean_lines, linenum, error)
+ self.stack.pop()
+ line = matched.group(2)
+
+ def InnermostClass(self):
+ """Get class info on the top of the stack.
+
+ Returns:
+ A _ClassInfo object if we are inside a class, or None otherwise.
+ """
+ for i in range(len(self.stack), 0, -1):
+ classinfo = self.stack[i - 1]
+ if isinstance(classinfo, _ClassInfo):
+ return classinfo
+ return None
+
+ def CheckCompletedBlocks(self, filename, error):
+ """Checks that all classes and namespaces have been completely parsed.
+
+ Call this when all lines in a file have been processed.
+ Args:
+ filename: The name of the current file.
+ error: The function to call with any errors found.
+ """
+ # Note: This test can result in false positives if #ifdef constructs
+ # get in the way of brace matching. See the testBuildClass test in
+ # cpplint_unittest.py for an example of this.
+ for obj in self.stack:
+ if isinstance(obj, _ClassInfo):
+ error(filename, obj.starting_linenum, 'build/class', 5,
+ 'Failed to find complete declaration of class %s' %
+ obj.name)
+ elif isinstance(obj, _NamespaceInfo):
+ error(filename, obj.starting_linenum, 'build/namespaces', 5,
+ 'Failed to find complete declaration of namespace %s' %
+ obj.name)
+
+
+def CheckForNonStandardConstructs(filename, clean_lines, linenum,
+ nesting_state, error):
+ r"""Logs an error if we see certain non-ANSI constructs ignored by gcc-2.
+
+ Complain about several constructs which gcc-2 accepts, but which are
+ not standard C++. Warning about these in lint is one way to ease the
+ transition to new compilers.
+ - put storage class first (e.g. "static const" instead of "const static").
+ - "%lld" instead of "%qd" in printf-type functions.
+ - "%1$d" is non-standard in printf-type functions.
+ - "\%" is an undefined character escape sequence.
+ - text after #endif is not allowed.
+ - invalid inner-style forward declaration.
+ - >? and <? operators, and their >?= and <?= cousins.
+
+ Additionally, check for constructor/destructor style violations and reference
+ members, as it is very convenient to do so while checking for
+ gcc-2 compliance.
+
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ nesting_state: A NestingState instance which maintains information about
+ the current stack of nested blocks being parsed.
+ error: A callable to which errors are reported, which takes 4 arguments:
+ filename, line number, error level, and message
+ """
+
+ # Remove comments from the line, but leave in strings for now.
+ line = clean_lines.lines[linenum]
+
+ if Search(r'printf\s*\(.*".*%[-+ ]?\d*q', line):
+ error(filename, linenum, 'runtime/printf_format', 3,
+ '%q in format strings is deprecated. Use %ll instead.')
+
+ if Search(r'printf\s*\(.*".*%\d+\$', line):
+ error(filename, linenum, 'runtime/printf_format', 2,
+ '%N$ formats are unconventional. Try rewriting to avoid them.')
+
+ # Remove escaped backslashes before looking for undefined escapes.
+ line = line.replace('\\\\', '')
+
+ if Search(r'("|\').*\\(%|\[|\(|{)', line):
+ error(filename, linenum, 'build/printf_format', 3,
+ '%, [, (, and { are undefined character escapes. Unescape them.')
+
+ # For the rest, work with both comments and strings removed.
+ line = clean_lines.elided[linenum]
+
+ if Search(r'\b(const|volatile|void|char|short|int|long'
+ r'|float|double|signed|unsigned'
+ r'|schar|u?int8|u?int16|u?int32|u?int64)'
+ r'\s+(register|static|extern|typedef)\b',
+ line):
+ error(filename, linenum, 'build/storage_class', 5,
+ 'Storage class (static, extern, typedef, etc) should be first.')
+
+ if Match(r'\s*#\s*endif\s*[^/\s]+', line):
+ error(filename, linenum, 'build/endif_comment', 5,
+ 'Uncommented text after #endif is non-standard. Use a comment.')
+
+ if Match(r'\s*class\s+(\w+\s*::\s*)+\w+\s*;', line):
+ error(filename, linenum, 'build/forward_decl', 5,
+ 'Inner-style forward declarations are invalid. Remove this line.')
+
+ if Search(r'(\w+|[+-]?\d+(\.\d*)?)\s*(<|>)\?=?\s*(\w+|[+-]?\d+)(\.\d*)?',
+ line):
+ error(filename, linenum, 'build/deprecated', 3,
+ '>? and <? (max and min) operators are non-standard and deprecated.')
+
+ if Search(r'^\s*const\s*string\s*&\s*\w+\s*;', line):
+ # TODO(unknown): Could it be expanded safely to arbitrary references,
+ # without triggering too many false positives? The first
+ # attempt triggered 5 warnings for mostly benign code in the regtest, hence
+ # the restriction.
+ # Here's the original regexp, for the reference:
+ # type_name = r'\w+((\s*::\s*\w+)|(\s*<\s*\w+?\s*>))?'
+ # r'\s*const\s*' + type_name + '\s*&\s*\w+\s*;'
+ error(filename, linenum, 'runtime/member_string_references', 2,
+ 'const string& members are dangerous. It is much better to use '
+ 'alternatives, such as pointers or simple constants.')
+
+ # Everything else in this function operates on class declarations.
+ # Return early if the top of the nesting stack is not a class, or if
+ # the class head is not completed yet.
+ classinfo = nesting_state.InnermostClass()
+ if not classinfo or not classinfo.seen_open_brace:
+ return
+
+ # The class may have been declared with namespace or classname qualifiers.
+ # The constructor and destructor will not have those qualifiers.
+ base_classname = classinfo.name.split('::')[-1]
+
+ # Look for single-argument constructors that aren't marked explicit.
+ # Technically a valid construct, but against style. Also look for
+ # non-single-argument constructors which are also technically valid, but
+ # strongly suggest something is wrong.
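# Illustrative sketch (not part of cpplint itself) of the kind of C++ the
# constructor check below is after, using a deliberately simplified regex;
# the real pattern just below also handles 'inline', nested parentheses,
# defaulted arguments, and initializer-list constructors.
import re

decls = ['  Foo(int x)',             # flag: one argument, not explicit
         '  explicit Foo(int x)',    # ok
         '  Foo(const Foo& other)',  # ok: copy constructor
         '  Foo()']                  # ok: zero arguments
for decl in decls:
    m = re.match(r'\s+(explicit\s+)?Foo\s*\(([^)]*)\)', decl)
    if not m:
        continue
    args = [a for a in m.group(2).split(',') if a.strip()]
    needs_explicit = (not m.group(1) and len(args) == 1 and
                      not re.match(r'\s*const\s+Foo\s*&', m.group(2)))
    print('%s -> %s' % (decl.strip(), 'flag' if needs_explicit else 'ok'))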
+ explicit_constructor_match = Match( + r'\s+(?:inline\s+)?(explicit\s+)?(?:inline\s+)?%s\s*' + r'\(((?:[^()]|\([^()]*\))*)\)' + % re.escape(base_classname), + line) + + if explicit_constructor_match: + is_marked_explicit = explicit_constructor_match.group(1) + + if not explicit_constructor_match.group(2): + constructor_args = [] + else: + constructor_args = explicit_constructor_match.group(2).split(',') + + # collapse arguments so that commas in template parameter lists and function + # argument parameter lists don't split arguments in two + i = 0 + while i < len(constructor_args): + constructor_arg = constructor_args[i] + while (constructor_arg.count('<') > constructor_arg.count('>') or + constructor_arg.count('(') > constructor_arg.count(')')): + constructor_arg += ',' + constructor_args[i + 1] + del constructor_args[i + 1] + constructor_args[i] = constructor_arg + i += 1 + + defaulted_args = [arg for arg in constructor_args if '=' in arg] + noarg_constructor = (not constructor_args or # empty arg list + # 'void' arg specifier + (len(constructor_args) == 1 and + constructor_args[0].strip() == 'void')) + onearg_constructor = ((len(constructor_args) == 1 and # exactly one arg + not noarg_constructor) or + # all but at most one arg defaulted + (len(constructor_args) >= 1 and + not noarg_constructor and + len(defaulted_args) >= len(constructor_args) - 1)) + initializer_list_constructor = bool( + onearg_constructor and + Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0])) + copy_constructor = bool( + onearg_constructor and + Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&' + % re.escape(base_classname), constructor_args[0].strip())) + + if (not is_marked_explicit and + onearg_constructor and + not initializer_list_constructor and + not copy_constructor): + if defaulted_args: + error(filename, linenum, 'runtime/explicit', 5, + 'Constructors callable with one argument ' + 'should be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 5, + 'Single-parameter constructors should be marked explicit.') + elif is_marked_explicit and not onearg_constructor: + if noarg_constructor: + error(filename, linenum, 'runtime/explicit', 5, + 'Zero-parameter constructors should not be marked explicit.') + else: + error(filename, linenum, 'runtime/explicit', 0, + 'Constructors that require multiple arguments ' + 'should not be marked explicit.') + + +def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error): + """Checks for the correctness of various spacing around function calls. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Since function calls often occur inside if/for/while/switch + # expressions - which have their own, more liberal conventions - we + # first see if we should be looking inside such an expression for a + # function call, to which we can apply more strict standards. + fncall = line # if there's no control flow construct, look at whole line + for pattern in (r'\bif\s*\((.*)\)\s*{', + r'\bfor\s*\((.*)\)\s*{', + r'\bwhile\s*\((.*)\)\s*[{;]', + r'\bswitch\s*\((.*)\)\s*{'): + match = Search(pattern, line) + if match: + fncall = match.group(1) # look inside the parens for function calls + break + + # Except in if/for/while/switch, there should never be space + # immediately inside parens (eg "f( 3, 4 )"). 
We make an exception + # for nested parens ( (a+b) + c ). Likewise, there should never be + # a space before a ( when it's a function argument. I assume it's a + # function argument when the char before the whitespace is legal in + # a function name (alnum + _) and we're not starting a macro. Also ignore + # pointers and references to arrays and functions coz they're too tricky: + # we use a very simple way to recognize these: + # " (something)(maybe-something)" or + # " (something)(maybe-something," or + # " (something)[something]" + # Note that we assume the contents of [] to be short enough that + # they'll never need to wrap. + if ( # Ignore control structures. + not Search(r'\b(if|for|while|switch|return|new|delete|catch|sizeof)\b', + fncall) and + # Ignore pointers/references to functions. + not Search(r' \([^)]+\)\([^)]*(\)|,$)', fncall) and + # Ignore pointers/references to arrays. + not Search(r' \([^)]+\)\[[^\]]+\]', fncall)): + if Search(r'\w\s*\(\s(?!\s*\\$)', fncall): # a ( used for a fn call + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space after ( in function call') + elif Search(r'\(\s+(?!(\s*\\)|\()', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space after (') + if (Search(r'\w\s+\(', fncall) and + not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and + not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and + not Search(r'\bcase\s+\(', fncall)): + # TODO(unknown): Space after an operator function seem to be a common + # error, silence those for now by restricting them to highest verbosity. + if Search(r'\boperator_*\b', line): + error(filename, linenum, 'whitespace/parens', 0, + 'Extra space before ( in function call') + else: + error(filename, linenum, 'whitespace/parens', 4, + 'Extra space before ( in function call') + # If the ) is followed only by a newline or a { + newline, assume it's + # part of a control statement (if/while/etc), and don't complain + if Search(r'[^)]\s+\)\s*[^{\s]', fncall): + # If the closing parenthesis is preceded by only whitespaces, + # try to give a more descriptive error message. + if Search(r'^\s+\)', fncall): + error(filename, linenum, 'whitespace/parens', 2, + 'Closing ) should be moved to the previous line') + else: + error(filename, linenum, 'whitespace/parens', 2, + 'Extra space before )') + + +def IsBlankLine(line): + """Returns true if the given line is blank. + + We consider a line to be blank if the line is empty or consists of + only white spaces. + + Args: + line: A line of a string. + + Returns: + True, if the given line is blank. + """ + return not line or line.isspace() + + +def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error): + is_namespace_indent_item = ( + len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and + nesting_state.previous_stack_top == nesting_state.stack[-2]) + + if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + clean_lines.elided, line): + CheckItemIndentationInNamespace(filename, clean_lines.elided, + line, error) + + +def CheckForFunctionLengths(filename, clean_lines, linenum, + function_state, error): + """Reports for long function bodies. + + For an overview why this is done, see: + http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions + + Uses a simplistic algorithm assuming other style guidelines + (especially spacing) are followed. 
+ Only checks unindented functions, so class members are unchecked. + Trivial bodies are unchecked, so constructors with huge initializer lists + may be missed. + Blank/comment lines are not counted so as to avoid encouraging the removal + of vertical space and comments just to get through a lint check. + NOLINT *on the last line of a function* disables this check. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + function_state: Current function name and lines in body so far. + error: The function to call with any errors found. + """ + lines = clean_lines.lines + line = lines[linenum] + joined_line = '' + + starting_func = False + regexp = r'(\w(\w|::|\*|\&|\s)*)\(' # decls * & space::name( ... + match_result = Match(regexp, line) + if match_result: + # If the name is all caps and underscores, figure it's a macro and + # ignore it, unless it's TEST or TEST_F. + function_name = match_result.group(1).split()[-1] + if function_name == 'TEST' or function_name == 'TEST_F' or ( + not Match(r'[A-Z_]+$', function_name)): + starting_func = True + + if starting_func: + body_found = False + for start_linenum in xrange(linenum, clean_lines.NumLines()): + start_line = lines[start_linenum] + joined_line += ' ' + start_line.lstrip() + if Search(r'(;|})', start_line): # Declarations and trivial functions + body_found = True + break # ... ignore + elif Search(r'{', start_line): + body_found = True + function = Search(r'((\w|:)*)\(', line).group(1) + if Match(r'TEST', function): # Handle TEST... macros + parameter_regexp = Search(r'(\(.*\))', joined_line) + if parameter_regexp: # Ignore bad syntax + function += parameter_regexp.group(1) + else: + function += '()' + function_state.Begin(function) + break + if not body_found: + # No body for the function (or evidence of a non-function) was found. + error(filename, linenum, 'readability/fn_size', 5, + 'Lint failed to find start of function body.') + elif Match(r'^\}\s*$', line): # function end + function_state.Check(error, filename, linenum) + function_state.End() + elif not Match(r'^\s*$', line): + function_state.Count() # Count non-blank/non-comment lines. + + +_RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?') + + +def CheckComment(line, filename, linenum, next_line_start, error): + """Checks for common mistakes in comments. + + Args: + line: The line in question. + filename: The name of the current file. + linenum: The number of the line to check. + next_line_start: The first non-whitespace column of the next line. + error: The function to call with any errors found. + """ + commentpos = line.find('//') + if commentpos != -1: + # Check if the // may be in quotes. If so, ignore it + # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison + if (line.count('"', 0, commentpos) - + line.count('\\"', 0, commentpos)) % 2 == 0: # not in quotes + # Allow one space for new scopes, two spaces otherwise: + if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and + ((commentpos >= 1 and + line[commentpos-1] not in string.whitespace) or + (commentpos >= 2 and + line[commentpos-2] not in string.whitespace))): + error(filename, linenum, 'whitespace/comments', 2, + 'At least two spaces is best between code and comments') + + # Checks for common mistakes in TODO comments. 
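# Illustrative sketch (not part of cpplint itself): what _RE_PATTERN_TODO
# captures for a few sample comments, feeding the three checks below
# (leading whitespace, username, and the space after the colon).
import re

pattern = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
for comment in ['// TODO(alice): fix this',  # canonical form
                '//   TODO(bob): too many spaces before TODO',
                '// TODO: missing username',
                '// TODO(carol):no space after the colon']:
    m = pattern.match(comment)
    print('%s -> %s' % (comment, m.groups() if m else 'no match'))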
+ comment = line[commentpos:] + match = _RE_PATTERN_TODO.match(comment) + if match: + # One whitespace is correct; zero whitespace is handled elsewhere. + leading_whitespace = match.group(1) + if len(leading_whitespace) > 1: + error(filename, linenum, 'whitespace/todo', 2, + 'Too many spaces before TODO') + + username = match.group(2) + if not username: + error(filename, linenum, 'readability/todo', 2, + 'Missing username in TODO; it should look like ' + '"// TODO(my_username): Stuff."') + + middle_whitespace = match.group(3) + # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison + if middle_whitespace != ' ' and middle_whitespace != '': + error(filename, linenum, 'whitespace/todo', 2, + 'TODO(my_username) should be followed by a space') + + # If the comment contains an alphanumeric character, there + # should be a space somewhere between it and the // unless + # it's a /// or //! Doxygen comment. + if (Match(r'//[^ ]*\w', comment) and + not Match(r'(///|//\!)(\s+|$)', comment)): + error(filename, linenum, 'whitespace/comments', 4, + 'Should have a space between // and comment') + + +def CheckAccess(filename, clean_lines, linenum, nesting_state, error): + """Checks for improper use of DISALLOW* macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] # get rid of comments and strings + + matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|' + r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line) + if not matched: + return + if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo): + if nesting_state.stack[-1].access != 'private': + error(filename, linenum, 'readability/constructors', 3, + '%s must be in the private: section' % matched.group(1)) + + else: + # Found DISALLOW* macro outside a class declaration, or perhaps it + # was used inside a function when it should have been part of the + # class declaration. We could issue a warning here, but it + # probably resulted in a compiler error already. + pass + + +def CheckSpacing(filename, clean_lines, linenum, nesting_state, error): + """Checks for the correctness of various spacing issues in the code. + + Things we check for: spaces around operators, spaces after + if/for/while/switch, no spaces around parens in function calls, two + spaces between code and comment, don't start a block with a blank + line, don't end a function with a blank line, don't add a blank line + after public/protected/private, don't have too many blank lines in a row. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw = clean_lines.lines_without_raw_strings + line = raw[linenum] + + # Before nixing comments, check if the line is blank for no good + # reason. 
This includes the first line after a block is opened, and + # blank lines at the end of a function (ie, right before a line like '}' + # + # Skip all the blank line checks if we are immediately inside a + # namespace body. In other words, don't issue blank line warnings + # for this block: + # namespace { + # + # } + # + # A warning about missing end of namespace comments will be issued instead. + # + # Also skip blank line checks for 'extern "C"' blocks, which are formatted + # like namespaces. + if (IsBlankLine(line) and + not nesting_state.InNamespaceBody() and + not nesting_state.InExternC()): + elided = clean_lines.elided + prev_line = elided[linenum - 1] + prevbrace = prev_line.rfind('{') + # TODO(unknown): Don't complain if line before blank line, and line after, + # both start with alnums and are indented the same amount. + # This ignores whitespace at the start of a namespace block + # because those are not usually indented. + if prevbrace != -1 and prev_line[prevbrace:].find('}') == -1: + # OK, we have a blank line at the start of a code block. Before we + # complain, we check if it is an exception to the rule: The previous + # non-empty line has the parameters of a function header that are indented + # 4 spaces (because they did not fit in a 80 column line when placed on + # the same line as the function name). We also check for the case where + # the previous line is indented 6 spaces, which may happen when the + # initializers of a constructor do not fit into a 80 column line. + exception = False + if Match(r' {6}\w', prev_line): # Initializer list? + # We are looking for the opening column of initializer list, which + # should be indented 4 spaces to cause 6 space indentation afterwards. + search_position = linenum-2 + while (search_position >= 0 + and Match(r' {6}\w', elided[search_position])): + search_position -= 1 + exception = (search_position >= 0 + and elided[search_position][:5] == ' :') + else: + # Search for the function arguments or an initializer list. We use a + # simple heuristic here: If the line is indented 4 spaces; and we have a + # closing paren, without the opening paren, followed by an opening brace + # or colon (for initializer lists) we assume that it is the last line of + # a function header. If we have a colon indented 4 spaces, it is an + # initializer list. 
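# Illustrative sketch (not part of cpplint itself) of the two "previous
# line" shapes that the heuristic below accepts as exceptions: a wrapped
# function header indented 4 spaces, or a constructor initializer list
# whose ':' is indented 4 spaces.
import re

for prev_line in ['    int arg2) {',       # wrapped header -> exception
                  '    : member_(arg) {',  # initializer list -> exception
                  '  int x = 1;']:         # ordinary statement -> no
    exception = bool(
        re.match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', prev_line) or
        re.match(r' {4}:', prev_line))
    print('%r -> %s' % (prev_line, exception))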
+ exception = (Match(r' {4}\w[^\(]*\)\s*(const\s*)?(\{\s*$|:)', + prev_line) + or Match(r' {4}:', prev_line)) + + if not exception: + error(filename, linenum, 'whitespace/blank_line', 2, + 'Redundant blank line at the start of a code block ' + 'should be deleted.') + # Ignore blank lines at the end of a block in a long if-else + # chain, like this: + # if (condition1) { + # // Something followed by a blank line + # + # } else if (condition2) { + # // Something else + # } + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + if (next_line + and Match(r'\s*}', next_line) + and next_line.find('} else ') == -1): + error(filename, linenum, 'whitespace/blank_line', 3, + 'Redundant blank line at the end of a code block ' + 'should be deleted.') + + matched = Match(r'\s*(public|protected|private):', prev_line) + if matched: + error(filename, linenum, 'whitespace/blank_line', 3, + 'Do not leave a blank line after "%s:"' % matched.group(1)) + + # Next, check comments + next_line_start = 0 + if linenum + 1 < clean_lines.NumLines(): + next_line = raw[linenum + 1] + next_line_start = len(next_line) - len(next_line.lstrip()) + CheckComment(line, filename, linenum, next_line_start, error) + + # get rid of comments and strings + line = clean_lines.elided[linenum] + + # You shouldn't have spaces before your brackets, except maybe after + # 'delete []' or 'return []() {};' + if Search(r'\w\s+\[', line) and not Search(r'(?:delete|return)\s+\[', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Extra space before [') + + # In range-based for, we wanted spaces before and after the colon, but + # not around "::" tokens that might appear. + if (Search(r'for *\(.*[^:]:[^: ]', line) or + Search(r'for *\(.*[^: ]:[^:]', line)): + error(filename, linenum, 'whitespace/forcolon', 2, + 'Missing space around colon in range-based for loop') + + +def CheckOperatorSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around operators. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Don't try to do spacing checks for operator methods. Do this by + # replacing the troublesome characters with something else, + # preserving column position for all other characters. + # + # The replacement is done repeatedly to avoid false positives from + # operators that call operators. + while True: + match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line) + if match: + line = match.group(1) + ('_' * len(match.group(2))) + match.group(3) + else: + break + + # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )". + # Otherwise not. Note we only check for non-spaces on *both* sides; + # sometimes people put non-spaces on one side when aligning ='s among + # many lines (not that this is behavior that I approve of...) + if ((Search(r'[\w.]=', line) or + Search(r'=[\w.]', line)) + and not Search(r'\b(if|while|for) ', line) + # Operators taken from [lex.operators] in C++11 standard. + and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line) + and not Search(r'operator=', line)): + error(filename, linenum, 'whitespace/operators', 4, + 'Missing spaces around =') + + # It's ok not to have spaces around binary operators like + - * /, but if + # there's too little whitespace, we get concerned. It's hard to tell, + # though, so we punt on this one for now. TODO. 
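# Illustrative sketch (not part of cpplint itself): sample lines the
# binary-operator check in the next block would and would not flag. Note
# that the trailing character class deliberately exempts an operator
# followed by ',', which is typical of macro arguments.
import re

op_re = re.compile(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]')
for code in ['if (a==b) {',            # flag: missing spaces around ==
             'if (a == b) {',          # ok
             'DEFINE_OP(==, Equals)',  # ok: ',' follows, macro-style
             'x = y||z;']:             # flag: missing spaces around ||
    m = op_re.search(code)
    print('%s -> %s' % (code, 'flag ' + m.group(1) if m else 'ok'))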
+ + # You should always have whitespace around binary operators. + # + # Check <= and >= first to avoid false positives with < and >, then + # check non-include lines for spacing around < and >. + # + # If the operator is followed by a comma, assume it's be used in a + # macro context and don't do any checks. This avoids false + # positives. + # + # Note that && is not included here. Those are checked separately + # in CheckRValueReference + match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around %s' % match.group(1)) + elif not Match(r'#.*include', line): + # Look for < that is not surrounded by spaces. This is only + # triggered if both sides are missing spaces, even though + # technically should should flag if at least one side is missing a + # space. This is done to avoid some false positives with shifts. + match = Match(r'^(.*[^\s<])<[^\s=<,]', line) + if match: + (_, _, end_pos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if end_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <') + + # Look for > that is not surrounded by spaces. Similar to the + # above, we only trigger if both sides are missing spaces to avoid + # false positives with shifts. + match = Match(r'^(.*[^-\s>])>[^\s=>,]', line) + if match: + (_, _, start_pos) = ReverseCloseExpression( + clean_lines, linenum, len(match.group(1))) + if start_pos <= -1: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >') + + # We allow no-spaces around << when used like this: 10<<20, but + # not otherwise (particularly, not when used as streams) + # + # We also allow operators following an opening parenthesis, since + # those tend to be macros that deal with operators. + match = Search(r'(operator|[^\s(<])(?:L|UL|ULL|l|ul|ull)?<<([^\s,=<])', line) + if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and + not (match.group(1) == 'operator' and match.group(2) == ';')): + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around <<') + + # We allow no-spaces around >> for almost anything. This is because + # C++11 allows ">>" to close nested templates, which accounts for + # most cases when ">>" is not followed by a space. + # + # We still warn on ">>" followed by alpha character, because that is + # likely due to ">>" being used for right shifts, e.g.: + # value >> alpha + # + # When ">>" is used to close templates, the alphanumeric letter that + # follows would be part of an identifier, and there should still be + # a space separating the template type and the identifier. + # type> alpha + match = Search(r'>>[a-zA-Z_]', line) + if match: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around >>') + + # There shouldn't be space around unary operators + match = Search(r'(!\s|~\s|[\s]--[\s;]|[\s]\+\+[\s;])', line) + if match: + error(filename, linenum, 'whitespace/operators', 4, + 'Extra space for operator %s' % match.group(1)) + + +def CheckParenthesisSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing around parentheses. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + line = clean_lines.elided[linenum] + + # No spaces after an if, while, switch, or for + match = Search(r' (if\(|for\(|while\(|switch\()', line) + if match: + error(filename, linenum, 'whitespace/parens', 5, + 'Missing space before ( in %s' % match.group(1)) + + # For if/for/while/switch, the left and right parens should be + # consistent about how many spaces are inside the parens, and + # there should either be zero or one spaces inside the parens. + # We don't want: "if ( foo)" or "if ( foo )". + # Exception: "for ( ; foo; bar)" and "for (foo; bar; )" are allowed. + match = Search(r'\b(if|for|while|switch)\s*' + r'\(([ ]*)(.).*[^ ]+([ ]*)\)\s*{\s*$', + line) + if match: + if len(match.group(2)) != len(match.group(4)): + if not (match.group(3) == ';' and + len(match.group(2)) == 1 + len(match.group(4)) or + not match.group(2) and Search(r'\bfor\s*\(.*; \)', line)): + error(filename, linenum, 'whitespace/parens', 5, + 'Mismatching spaces inside () in %s' % match.group(1)) + if len(match.group(2)) not in [0, 1]: + error(filename, linenum, 'whitespace/parens', 5, + 'Should have zero or one spaces inside ( and ) in %s' % + match.group(1)) + + +def CheckCommaSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing near commas and semicolons. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + raw = clean_lines.lines_without_raw_strings + line = clean_lines.elided[linenum] + + # You should always have a space after a comma (either as fn arg or operator) + # + # This does not apply when the non-space character following the + # comma is another comma, since the only time when that happens is + # for empty macro arguments. + # + # We run this check in two passes: first pass on elided lines to + # verify that lines contain missing whitespaces, second pass on raw + # lines to confirm that those missing whitespaces are not due to + # elided comments. + if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and + Search(r',[^,\s]', raw[linenum])): + error(filename, linenum, 'whitespace/comma', 3, + 'Missing space after ,') + + # You should always have a space after a semicolon + # except for few corner cases + # TODO(unknown): clarify if 'if (1) { return 1;}' is requires one more + # space after ; + if Search(r';[^\s};\\)/]', line): + error(filename, linenum, 'whitespace/semicolon', 3, + 'Missing space after ;') + + +def CheckBracesSpacing(filename, clean_lines, linenum, error): + """Checks for horizontal spacing near commas. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Except after an opening paren, or after another opening brace (in case of + # an initializer list, for instance), you should have spaces before your + # braces. And since you should never have braces at the beginning of a line, + # this is an easy test. + match = Match(r'^(.*[^ ({>]){', line) + if match: + # Try a bit harder to check for brace initialization. This + # happens in one of the following forms: + # Constructor() : initializer_list_{} { ... 
} + # Constructor{}.MemberFunction() + # Type variable{}; + # FunctionCall(type{}, ...); + # LastArgument(..., type{}); + # LOG(INFO) << type{} << " ..."; + # map_of_type[{...}] = ...; + # ternary = expr ? new type{} : nullptr; + # OuterTemplate{}> + # + # We check for the character following the closing brace, and + # silence the warning if it's one of those listed above, i.e. + # "{.;,)<>]:". + # + # To account for nested initializer list, we allow any number of + # closing braces up to "{;,)<". We can't simply silence the + # warning on first sight of closing brace, because that would + # cause false negatives for things that are not initializer lists. + # Silence this: But not this: + # Outer{ if (...) { + # Inner{...} if (...){ // Missing space before { + # }; } + # + # There is a false negative with this approach if people inserted + # spurious semicolons, e.g. "if (cond){};", but we will catch the + # spurious semicolon with a separate check. + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + trailing_text = '' + if endpos > -1: + trailing_text = endline[endpos:] + for offset in xrange(endlinenum + 1, + min(endlinenum + 3, clean_lines.NumLines() - 1)): + trailing_text += clean_lines.elided[offset] + if not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before {') + + # Make sure '} else {' has spaces. + if Search(r'}else', line): + error(filename, linenum, 'whitespace/braces', 5, + 'Missing space before else') + + # You shouldn't have a space before a semicolon at the end of the line. + # There's a special case for "for" since the style guide allows space before + # the semicolon there. + if Search(r':\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Semicolon defining empty statement. Use {} instead.') + elif Search(r'^\s*;\s*$', line): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Line contains only semicolon. If this should be an empty statement, ' + 'use {} instead.') + elif (Search(r'\s+;\s*$', line) and + not Search(r'\bfor\b', line)): + error(filename, linenum, 'whitespace/semicolon', 5, + 'Extra space before last semicolon. If this should be an empty ' + 'statement, use {} instead.') + + +def IsDecltype(clean_lines, linenum, column): + """Check if the token ending on (linenum, column) is decltype(). + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is decltype() expression, False otherwise. + """ + (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column) + if start_col < 0: + return False + if Search(r'\bdecltype\s*$', text[0:start_col]): + return True + return False + + +def IsTemplateParameterList(clean_lines, linenum, column): + """Check if the token ending on (linenum, column) is the end of template<>. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is end of a template parameter list, False otherwise. 
+ """ + (_, startline, startpos) = ReverseCloseExpression( + clean_lines, linenum, column) + if (startpos > -1 and + Search(r'\btemplate\s*$', clean_lines.elided[startline][0:startpos])): + return True + return False + + +def IsRValueType(typenames, clean_lines, nesting_state, linenum, column): + """Check if the token ending on (linenum, column) is a type. + + Assumes that text to the right of the column is "&&" or a function + name. + + Args: + typenames: set of type names from template-argument-list. + clean_lines: A CleansedLines instance containing the file. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + linenum: the number of the line to check. + column: end column of the token to check. + Returns: + True if this token is a type, False if we are not sure. + """ + prefix = clean_lines.elided[linenum][0:column] + + # Get one word to the left. If we failed to do so, this is most + # likely not a type, since it's unlikely that the type name and "&&" + # would be split across multiple lines. + match = Match(r'^(.*)(\b\w+|[>*)&])\s*$', prefix) + if not match: + return False + + # Check text following the token. If it's "&&>" or "&&," or "&&...", it's + # most likely a rvalue reference used inside a template. + suffix = clean_lines.elided[linenum][column:] + if Match(r'&&\s*(?:[>,]|\.\.\.)', suffix): + return True + + # Check for known types and end of templates: + # int&& variable + # vector&& variable + # + # Because this function is called recursively, we also need to + # recognize pointer and reference types: + # int* Function() + # int& Function() + if (match.group(2) in typenames or + match.group(2) in ['char', 'char16_t', 'char32_t', 'wchar_t', 'bool', + 'short', 'int', 'long', 'signed', 'unsigned', + 'float', 'double', 'void', 'auto', '>', '*', '&']): + return True + + # If we see a close parenthesis, look for decltype on the other side. + # decltype would unambiguously identify a type, anything else is + # probably a parenthesized expression and not a type. + if match.group(2) == ')': + return IsDecltype( + clean_lines, linenum, len(match.group(1)) + len(match.group(2)) - 1) + + # Check for casts and cv-qualifiers. + # match.group(1) remainder + # -------------- --------- + # const_cast< type&& + # const type&& + # type const&& + if Search(r'\b(?:const_cast\s*<|static_cast\s*<|dynamic_cast\s*<|' + r'reinterpret_cast\s*<|\w+\s)\s*$', + match.group(1)): + return True + + # Look for a preceding symbol that might help differentiate the context. + # These are the cases that would be ambiguous: + # match.group(1) remainder + # -------------- --------- + # Call ( expression && + # Declaration ( type&& + # sizeof ( type&& + # if ( expression && + # while ( expression && + # for ( type&& + # for( ; expression && + # statement ; type&& + # block { type&& + # constructor { expression && + start = linenum + line = match.group(1) + match_symbol = None + while start >= 0: + # We want to skip over identifiers and commas to get to a symbol. + # Commas are skipped so that we can find the opening parenthesis + # for function parameter lists. 
+ match_symbol = Match(r'^(.*)([^\w\s,])[\w\s,]*$', line)
+ if match_symbol:
+ break
+ start -= 1
+ line = clean_lines.elided[start]
+
+ if not match_symbol:
+ # Probably the first statement in the file is an rvalue reference
+ return True
+
+ if match_symbol.group(2) == '}':
+ # Found closing brace, probably an indicate of this:
+ # block{} type&&
+ return True
+
+ if match_symbol.group(2) == ';':
+ # Found semicolon, probably one of these:
+ # for(; expression &&
+ # statement; type&&
+
+ # Look for the previous 'for(' in the previous lines.
+ before_text = match_symbol.group(1)
+ for i in xrange(start - 1, max(start - 6, 0), -1):
+ before_text = clean_lines.elided[i] + before_text
+ if Search(r'for\s*\([^{};]*$', before_text):
+ # This is the condition inside a for-loop
+ return False
+
+ # Did not find a for-init-statement before this semicolon, so this
+ # is probably a new statement and not a condition.
+ return True
+
+ if match_symbol.group(2) == '{':
+ # Found opening brace, probably one of these:
+ # block{ type&& = ... ; }
+ # constructor{ expression && expression }
+
+ # Look for a closing brace or a semicolon. If we see a semicolon
+ # first, this is probably a rvalue reference.
+ line = clean_lines.elided[start][0:len(match_symbol.group(1)) + 1]
+ end = start
+ depth = 1
+ while True:
+ for ch in line:
+ if ch == ';':
+ return True
+ elif ch == '{':
+ depth += 1
+ elif ch == '}':
+ depth -= 1
+ if depth == 0:
+ return False
+ end += 1
+ if end >= clean_lines.NumLines():
+ break
+ line = clean_lines.elided[end]
+ # Incomplete program?
+ return False
+
+ if match_symbol.group(2) == '(':
+ # Opening parenthesis. Need to check what's to the left of the
+ # parenthesis. Look back one extra line for additional context.
+ before_text = match_symbol.group(1)
+ if linenum > 1:
+ before_text = clean_lines.elided[linenum - 1] + before_text
+ before_text = match_symbol.group(1)
+
+ # Patterns that are likely to be types:
+ # [](type&&
+ # for (type&&
+ # sizeof(type&&
+ # operator=(type&&
+ #
+ if Search(r'(?:\]|\bfor|\bsizeof|\boperator\s*\S+\s*)\s*$', before_text):
+ return True
+
+ # Patterns that are likely to be expressions:
+ # if (expression &&
+ # while (expression &&
+ # : initializer(expression &&
+ # , initializer(expression &&
+ # ( FunctionCall(expression &&
+ # + FunctionCall(expression &&
+ # + (expression &&
+ #
+ # The last '+' represents operators such as '+' and '-'.
+ if Search(r'(?:\bif|\bwhile|[-+=%^(<!?:,&*]\s*)$', match_symbol.group(1)):
+ return False
+
+ # Something else. Check that tokens to the left look like
+ # return_type function_name
+ match_func = Match(r'^(.*\S.*)\s+\w(?:\w|::)*(?:<[^<>]*>)?\s*$',
+ match_symbol.group(1))
+ if match_func:
+ # Check for constructors, which don't have return types.
+ if Search(r'\b(?:explicit|inline)$', match_func.group(1)):
+ return True
+ implicit_constructor = Match(r'\s*(\w+)\((?:const\s+)?(\w+)', prefix)
+ if (implicit_constructor and
+ implicit_constructor.group(1) == implicit_constructor.group(2)):
+ return True
+ return IsRValueType(typenames, clean_lines, nesting_state, linenum,
+ len(match_func.group(1)))
+
+ # Nothing before the function name. If this is inside a block scope,
+ # this is probably a function call.
+ return not (nesting_state.previous_stack_top and
+ nesting_state.previous_stack_top.IsBlockInfo())
+
+ if match_symbol.group(2) == '>':
+ # Possibly a closing bracket, check that what's on the other side
+ # looks like the start of a template.
+ return IsTemplateParameterList(
+ clean_lines, start, len(match_symbol.group(1)))
+
+ # Some other symbol, usually something like "a=b&&c". This is most
+ # likely not a type.
+ return False + + +def IsDeletedOrDefault(clean_lines, linenum): + """Check if current constructor or operator is deleted or default. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if this is a deleted or default constructor. + """ + open_paren = clean_lines.elided[linenum].find('(') + if open_paren < 0: + return False + (close_line, _, close_paren) = CloseExpression( + clean_lines, linenum, open_paren) + if close_paren < 0: + return False + return Match(r'\s*=\s*(?:delete|default)\b', close_line[close_paren:]) + + +def IsRValueAllowed(clean_lines, linenum, typenames): + """Check if RValue reference is allowed on a particular line. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + typenames: set of type names from template-argument-list. + Returns: + True if line is within the region where RValue references are allowed. + """ + # Allow region marked by PUSH/POP macros + for i in xrange(linenum, 0, -1): + line = clean_lines.elided[i] + if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): + if not line.endswith('PUSH'): + return False + for j in xrange(linenum, clean_lines.NumLines(), 1): + line = clean_lines.elided[j] + if Match(r'GOOGLE_ALLOW_RVALUE_REFERENCES_(?:PUSH|POP)', line): + return line.endswith('POP') + + # Allow operator= + line = clean_lines.elided[linenum] + if Search(r'\boperator\s*=\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Allow constructors + match = Match(r'\s*(?:[\w<>]+::)*([\w<>]+)\s*::\s*([\w<>]+)\s*\(', line) + if match and match.group(1) == match.group(2): + return IsDeletedOrDefault(clean_lines, linenum) + if Search(r'\b(?:explicit|inline)\s+[\w<>]+\s*\(', line): + return IsDeletedOrDefault(clean_lines, linenum) + + if Match(r'\s*[\w<>]+\s*\(', line): + previous_line = 'ReturnType' + if linenum > 0: + previous_line = clean_lines.elided[linenum - 1] + if Match(r'^\s*$', previous_line) or Search(r'[{}:;]\s*$', previous_line): + return IsDeletedOrDefault(clean_lines, linenum) + + # Reject types not mentioned in template-argument-list + while line: + match = Match(r'^.*?(\w+)\s*&&(.*)$', line) + if not match: + break + if match.group(1) not in typenames: + return False + line = match.group(2) + + # All RValue types that were in template-argument-list should have + # been removed by now. Those were allowed, assuming that they will + # be forwarded. + # + # If there are no remaining RValue types left (i.e. types that were + # not found in template-argument-list), flag those as not allowed. + return line.find('&&') < 0 + + +def GetTemplateArgs(clean_lines, linenum): + """Find list of template arguments associated with this function declaration. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: Line number containing the start of the function declaration, + usually one line after the end of the template-argument-list. + Returns: + Set of type names, or empty set if this does not appear to have + any template parameters. 
+ """ + # Find start of function + func_line = linenum + while func_line > 0: + line = clean_lines.elided[func_line] + if Match(r'^\s*$', line): + return set() + if line.find('(') >= 0: + break + func_line -= 1 + if func_line == 0: + return set() + + # Collapse template-argument-list into a single string + argument_list = '' + match = Match(r'^(\s*template\s*)<', clean_lines.elided[func_line]) + if match: + # template-argument-list on the same line as function name + start_col = len(match.group(1)) + _, end_line, end_col = CloseExpression(clean_lines, func_line, start_col) + if end_col > -1 and end_line == func_line: + start_col += 1 # Skip the opening bracket + argument_list = clean_lines.elided[func_line][start_col:end_col] + + elif func_line > 1: + # template-argument-list one line before function name + match = Match(r'^(.*)>\s*$', clean_lines.elided[func_line - 1]) + if match: + end_col = len(match.group(1)) + _, start_line, start_col = ReverseCloseExpression( + clean_lines, func_line - 1, end_col) + if start_col > -1: + start_col += 1 # Skip the opening bracket + while start_line < func_line - 1: + argument_list += clean_lines.elided[start_line][start_col:] + start_col = 0 + start_line += 1 + argument_list += clean_lines.elided[func_line - 1][start_col:end_col] + + if not argument_list: + return set() + + # Extract type names + typenames = set() + while True: + match = Match(r'^[,\s]*(?:typename|class)(?:\.\.\.)?\s+(\w+)(.*)$', + argument_list) + if not match: + break + typenames.add(match.group(1)) + argument_list = match.group(2) + return typenames + + +def CheckRValueReference(filename, clean_lines, linenum, nesting_state, error): + """Check for rvalue references. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # Find lines missing spaces around &&. + # TODO(unknown): currently we don't check for rvalue references + # with spaces surrounding the && to avoid false positives with + # boolean expressions. + line = clean_lines.elided[linenum] + match = Match(r'^(.*\S)&&', line) + if not match: + match = Match(r'(.*)&&\S', line) + if (not match) or '(&&)' in line or Search(r'\boperator\s*$', match.group(1)): + return + + # Either poorly formed && or an rvalue reference, check the context + # to get a more accurate error message. Mostly we want to determine + # if what's to the left of "&&" is a type or not. + typenames = GetTemplateArgs(clean_lines, linenum) + and_pos = len(match.group(1)) + if IsRValueType(typenames, clean_lines, nesting_state, linenum, and_pos): + if not IsRValueAllowed(clean_lines, linenum, typenames): + error(filename, linenum, 'build/c++11', 3, + 'RValue references are an unapproved C++ feature.') + else: + error(filename, linenum, 'whitespace/operators', 3, + 'Missing spaces around &&') + + +def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error): + """Checks for additional blank line issues related to sections. + + Currently the only thing checked here is blank line before protected/private. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + class_info: A _ClassInfo objects. + linenum: The number of the line to check. + error: The function to call with any errors found. 
+ """ + # Skip checks if the class is small, where small means 25 lines or less. + # 25 lines seems like a good cutoff since that's the usual height of + # terminals, and any class that can't fit in one screen can't really + # be considered "small". + # + # Also skip checks if we are on the first line. This accounts for + # classes that look like + # class Foo { public: ... }; + # + # If we didn't find the end of the class, last_line would be zero, + # and the check will be skipped by the first condition. + if (class_info.last_line - class_info.starting_linenum <= 24 or + linenum <= class_info.starting_linenum): + return + + matched = Match(r'\s*(public|protected|private):', clean_lines.lines[linenum]) + if matched: + # Issue warning if the line before public/protected/private was + # not a blank line, but don't do this if the previous line contains + # "class" or "struct". This can happen two ways: + # - We are at the beginning of the class. + # - We are forward-declaring an inner class that is semantically + # private, but needed to be public for implementation reasons. + # Also ignores cases where the previous line ends with a backslash as can be + # common when defining classes in C macros. + prev_line = clean_lines.lines[linenum - 1] + if (not IsBlankLine(prev_line) and + not Search(r'\b(class|struct)\b', prev_line) and + not Search(r'\\$', prev_line)): + # Try a bit harder to find the beginning of the class. This is to + # account for multi-line base-specifier lists, e.g.: + # class Derived + # : public Base { + end_class_head = class_info.starting_linenum + for i in range(class_info.starting_linenum, linenum): + if Search(r'\{\s*$', clean_lines.lines[i]): + end_class_head = i + break + if end_class_head < linenum - 1: + error(filename, linenum, 'whitespace/blank_line', 3, + '"%s:" should be preceded by a blank line' % matched.group(1)) + + +def GetPreviousNonBlankLine(clean_lines, linenum): + """Return the most recent non-blank line and its line number. + + Args: + clean_lines: A CleansedLines instance containing the file contents. + linenum: The number of the line to check. + + Returns: + A tuple with two elements. The first element is the contents of the last + non-blank line before the current line, or the empty string if this is the + first non-blank line. The second is the line number of that line, or -1 + if this is the first non-blank line. + """ + + prevlinenum = linenum - 1 + while prevlinenum >= 0: + prevline = clean_lines.elided[prevlinenum] + if not IsBlankLine(prevline): # if not a blank line... + return (prevline, prevlinenum) + prevlinenum -= 1 + return ('', -1) + + +def CheckBraces(filename, clean_lines, linenum, error): + """Looks for misplaced braces (e.g. at the end of line). + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] # get rid of comments and strings + + if Match(r'\s*{\s*$', line): + # We allow an open brace to start a line in the case where someone is using + # braces in a block to explicitly create a new scope, which is commonly used + # to control the lifetime of stack-allocated variables. Braces are also + # used for brace initializers inside function calls. 
We don't detect this + # perfectly: we just don't complain if the last non-whitespace character on + # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the + # previous line starts a preprocessor block. + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if (not Search(r'[,;:}{(]\s*$', prevline) and + not Match(r'\s*#', prevline)): + error(filename, linenum, 'whitespace/braces', 4, + '{ should almost always be at the end of the previous line') + + # An else clause should be on the same line as the preceding closing brace. + if Match(r'\s*else\b\s*(?:if\b|\{|$)', line): + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if Match(r'\s*}\s*$', prevline): + error(filename, linenum, 'whitespace/newline', 4, + 'An else should appear on the same line as the preceding }') + + # If braces come on one side of an else, they should be on both. + # However, we have to worry about "else if" that spans multiple lines! + if Search(r'else if\s*\(', line): # could be multi-line if + brace_on_left = bool(Search(r'}\s*else if\s*\(', line)) + # find the ( after the if + pos = line.find('else if') + pos = line.find('(', pos) + if pos > 0: + (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos) + brace_on_right = endline[endpos:].find('{') != -1 + if brace_on_left != brace_on_right: # must be brace after if + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line): + error(filename, linenum, 'readability/braces', 5, + 'If an else has a brace on one side, it should have it on both') + + # Likewise, an else should never have the else clause on the same line + if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line): + error(filename, linenum, 'whitespace/newline', 4, + 'Else clause should never be on same line as else (use 2 lines)') + + # In the same way, a do/while should never be on one line + if Match(r'\s*do [^\s{]', line): + error(filename, linenum, 'whitespace/newline', 4, + 'do/while clauses should not be on a single line') + + # Check single-line if/else bodies. The style guide says 'curly braces are not + # required for single-line statements'. We additionally allow multi-line, + # single statements, but we reject anything with more than one semicolon in + # it. This means that the first semicolon after the if should be at the end of + # its line, and the line after that should have an indent level equal to or + # lower than the if. We also check for ambiguous if/else nesting without + # braces. + if_else_match = Search(r'\b(if\s*\(|else\b)', line) + if if_else_match and not Match(r'\s*#', line): + if_indent = GetIndentLevel(line) + endline, endlinenum, endpos = line, linenum, if_else_match.end() + if_match = Search(r'\bif\s*\(', line) + if if_match: + # This could be a multiline if condition, so find the end first. + pos = if_match.end() - 1 + (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos) + # Check for an opening brace, either directly after the if or on the next + # line. If found, this isn't a single-statement conditional. 
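+ # For example, neither of these is treated as a single-statement body:
+ # if (cond) { Foo(); }
+ # if (cond)
+ # { Foo(); }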
+ if (not Match(r'\s*{', endline[endpos:]) + and not (Match(r'\s*$', endline[endpos:]) + and endlinenum < (len(clean_lines.elided) - 1) + and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))): + while (endlinenum < len(clean_lines.elided) + and ';' not in clean_lines.elided[endlinenum][endpos:]): + endlinenum += 1 + endpos = 0 + if endlinenum < len(clean_lines.elided): + endline = clean_lines.elided[endlinenum] + # We allow a mix of whitespace and closing braces (e.g. for one-liner + # methods) and a single \ after the semicolon (for macros) + endpos = endline.find(';') + if not Match(r';[\s}]*(\\?)$', endline[endpos:]): + # Semicolon isn't the last character, there's something trailing. + # Output a warning if the semicolon is not contained inside + # a lambda expression. + if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$', + endline): + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + elif endlinenum < len(clean_lines.elided) - 1: + # Make sure the next line is dedented + next_line = clean_lines.elided[endlinenum + 1] + next_indent = GetIndentLevel(next_line) + # With ambiguous nested if statements, this will error out on the + # if that *doesn't* match the else, regardless of whether it's the + # inner one or outer one. + if (if_match and Match(r'\s*else\b', next_line) + and next_indent != if_indent): + error(filename, linenum, 'readability/braces', 4, + 'Else clause should be indented at the same level as if. ' + 'Ambiguous nested if/else chains require braces.') + elif next_indent > if_indent: + error(filename, linenum, 'readability/braces', 4, + 'If/else bodies with multiple statements require braces') + + +def CheckTrailingSemicolon(filename, clean_lines, linenum, error): + """Looks for redundant trailing semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + line = clean_lines.elided[linenum] + + # Block bodies should not be followed by a semicolon. Due to C++11 + # brace initialization, there are more places where semicolons are + # required than not, so we use a whitelist approach to check these + # rather than a blacklist. These are the places where "};" should + # be replaced by just "}": + # 1. Some flavor of block following closing parenthesis: + # for (;;) {}; + # while (...) {}; + # switch (...) {}; + # Function(...) {}; + # if (...) {}; + # if (...) else if (...) {}; + # + # 2. else block: + # if (...) else {}; + # + # 3. const member function: + # Function(...) const {}; + # + # 4. Block following some statement: + # x = 42; + # {}; + # + # 5. Block at the beginning of a function: + # Function(...) { + # {}; + # } + # + # Note that naively checking for the preceding "{" will also match + # braces inside multi-dimensional arrays, but this is fine since + # that expression will not contain semicolons. + # + # 6. Block following another block: + # while (true) {} + # {}; + # + # 7. End of namespaces: + # namespace {}; + # + # These semicolons seems far more common than other kinds of + # redundant semicolons, possibly due to people converting classes + # to namespaces. For now we do not warn for this case. + # + # Try matching case 1 first. + match = Match(r'^(.*\)\s*)\{', line) + if match: + # Matched closing parenthesis (case 1). 
Check the token before the + # matching opening parenthesis, and don't warn if it looks like a + # macro. This avoids these false positives: + # - macro that defines a base class + # - multi-line macro that defines a base class + # - macro that defines the whole class-head + # + # But we still issue warnings for macros that we know are safe to + # warn, specifically: + # - TEST, TEST_F, TEST_P, MATCHER, MATCHER_P + # - TYPED_TEST + # - INTERFACE_DEF + # - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED: + # + # We implement a whitelist of safe macros instead of a blacklist of + # unsafe macros, even though the latter appears less frequently in + # google code and would have been easier to implement. This is because + # the downside for getting the whitelist wrong means some extra + # semicolons, while the downside for getting the blacklist wrong + # would result in compile errors. + # + # In addition to macros, we also don't want to warn on + # - Compound literals + # - Lambdas + # - alignas specifier with anonymous structs: + closing_brace_pos = match.group(1).rfind(')') + opening_parenthesis = ReverseCloseExpression( + clean_lines, linenum, closing_brace_pos) + if opening_parenthesis[2] > -1: + line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]] + macro = Search(r'\b([A-Z_]+)\s*$', line_prefix) + func = Match(r'^(.*\])\s*$', line_prefix) + if ((macro and + macro.group(1) not in ( + 'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST', + 'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED', + 'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or + (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or + Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or + Search(r'\s+=\s*$', line_prefix)): + match = None + if (match and + opening_parenthesis[1] > 1 and + Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])): + # Multi-line lambda-expression + match = None + + else: + # Try matching cases 2-3. + match = Match(r'^(.*(?:else|\)\s*const)\s*)\{', line) + if not match: + # Try matching cases 4-6. These are always matched on separate lines. + # + # Note that we can't simply concatenate the previous line to the + # current line and do a single match, otherwise we may output + # duplicate warnings for the blank line case: + # if (cond) { + # // blank line + # } + prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0] + if prevline and Search(r'[;{}]\s*$', prevline): + match = Match(r'^(\s*)\{', line) + + # Check matching closing brace + if match: + (endline, endlinenum, endpos) = CloseExpression( + clean_lines, linenum, len(match.group(1))) + if endpos > -1 and Match(r'^\s*;', endline[endpos:]): + # Current {} pair is eligible for semicolon check, and we have found + # the redundant semicolon, output warning here. + # + # Note: because we are scanning forward for opening braces, and + # outputting warnings for the matching closing brace, if there are + # nested blocks with trailing semicolons, we will get the error + # messages in reversed order. + error(filename, endlinenum, 'readability/braces', 4, + "You don't need a ; after a }") + + +def CheckEmptyBlockBody(filename, clean_lines, linenum, error): + """Look for empty loop/conditional body with only a single semicolon. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Search for loop keywords at the beginning of the line. 
Because only + # whitespaces are allowed before the keywords, this will also ignore most + # do-while-loops, since those lines should start with closing brace. + # + # We also check "if" blocks here, since an empty conditional block + # is likely an error. + line = clean_lines.elided[linenum] + matched = Match(r'\s*(for|while|if)\s*\(', line) + if matched: + # Find the end of the conditional expression + (end_line, end_linenum, end_pos) = CloseExpression( + clean_lines, linenum, line.find('(')) + + # Output warning if what follows the condition expression is a semicolon. + # No warning for all other cases, including whitespace or newline, since we + # have a separate check for semicolons preceded by whitespace. + if end_pos >= 0 and Match(r';', end_line[end_pos:]): + if matched.group(1) == 'if': + error(filename, end_linenum, 'whitespace/empty_conditional_body', 5, + 'Empty conditional bodies should use {}') + else: + error(filename, end_linenum, 'whitespace/empty_loop_body', 5, + 'Empty loop bodies should use {} or continue') + + +def FindCheckMacro(line): + """Find a replaceable CHECK-like macro. + + Args: + line: line to search on. + Returns: + (macro name, start position), or (None, -1) if no replaceable + macro is found. + """ + for macro in _CHECK_MACROS: + i = line.find(macro) + if i >= 0: + # Find opening parenthesis. Do a regular expression match here + # to make sure that we are matching the expected CHECK macro, as + # opposed to some other macro that happens to contain the CHECK + # substring. + matched = Match(r'^(.*\b' + macro + r'\s*)\(', line) + if not matched: + continue + return (macro, len(matched.group(1))) + return (None, -1) + + +def CheckCheck(filename, clean_lines, linenum, error): + """Checks the use of CHECK and EXPECT macros. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + + # Decide the set of replacement macros that should be suggested + lines = clean_lines.elided + (check_macro, start_pos) = FindCheckMacro(lines[linenum]) + if not check_macro: + return + + # Find end of the boolean expression by matching parentheses + (last_line, end_line, end_pos) = CloseExpression( + clean_lines, linenum, start_pos) + if end_pos < 0: + return + + # If the check macro is followed by something other than a + # semicolon, assume users will log their own custom error messages + # and don't suggest any replacements. + if not Match(r'\s*;', last_line[end_pos:]): + return + + if linenum == end_line: + expression = lines[linenum][start_pos + 1:end_pos - 1] + else: + expression = lines[linenum][start_pos + 1:] + for i in xrange(linenum + 1, end_line): + expression += lines[i] + expression += last_line[0:end_pos - 1] + + # Parse expression so that we can take parentheses into account. + # This avoids false positives for inputs like "CHECK((a < 4) == b)", + # which is not replaceable by CHECK_LE. + lhs = '' + rhs = '' + operator = None + while expression: + matched = Match(r'^\s*(<<|<<=|>>|>>=|->\*|->|&&|\|\||' + r'==|!=|>=|>|<=|<|\()(.*)$', expression) + if matched: + token = matched.group(1) + if token == '(': + # Parenthesized operand + expression = matched.group(2) + (end, _) = FindEndOfExpressionInLine(expression, 0, ['(']) + if end < 0: + return # Unmatched parenthesis + lhs += '(' + expression[0:end] + expression = expression[end:] + elif token in ('&&', '||'): + # Logical and/or operators. 
This means the expression + # contains more than one term, for example: + # CHECK(42 < a && a < b); + # + # These are not replaceable with CHECK_LE, so bail out early. + return + elif token in ('<<', '<<=', '>>', '>>=', '->*', '->'): + # Non-relational operator + lhs += token + expression = matched.group(2) + else: + # Relational operator + operator = token + rhs = matched.group(2) + break + else: + # Unparenthesized operand. Instead of appending to lhs one character + # at a time, we do another regular expression match to consume several + # characters at once if possible. Trivial benchmark shows that this + # is more efficient when the operands are longer than a single + # character, which is generally the case. + matched = Match(r'^([^-=!<>()&|]+)(.*)$', expression) + if not matched: + matched = Match(r'^(\s*\S)(.*)$', expression) + if not matched: + break + lhs += matched.group(1) + expression = matched.group(2) + + # Only apply checks if we got all parts of the boolean expression + if not (lhs and operator and rhs): + return + + # Check that rhs do not contain logical operators. We already know + # that lhs is fine since the loop above parses out && and ||. + if rhs.find('&&') > -1 or rhs.find('||') > -1: + return + + # At least one of the operands must be a constant literal. This is + # to avoid suggesting replacements for unprintable things like + # CHECK(variable != iterator) + # + # The following pattern matches decimal, hex integers, strings, and + # characters (in that order). + lhs = lhs.strip() + rhs = rhs.strip() + match_constant = r'^([-+]?(\d+|0[xX][0-9a-fA-F]+)[lLuU]{0,3}|".*"|\'.*\')$' + if Match(match_constant, lhs) or Match(match_constant, rhs): + # Note: since we know both lhs and rhs, we can provide a more + # descriptive error message like: + # Consider using CHECK_EQ(x, 42) instead of CHECK(x == 42) + # Instead of: + # Consider using CHECK_EQ instead of CHECK(a == b) + # + # We are still keeping the less descriptive message because if lhs + # or rhs gets long, the error message might become unreadable. + error(filename, linenum, 'readability/check', 2, + 'Consider using %s instead of %s(a %s b)' % ( + _CHECK_REPLACEMENT[check_macro][operator], + check_macro, operator)) + + +def CheckAltTokens(filename, clean_lines, linenum, error): + """Check alternative keywords being used in boolean expressions. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Avoid preprocessor lines + if Match(r'^\s*#', line): + return + + # Last ditch effort to avoid multi-line comments. This will not help + # if the comment started before the current line or ended after the + # current line, but it catches most of the false positives. At least, + # it provides a way to workaround this warning for people who use + # multi-line comments in preprocessor macros. + # + # TODO(unknown): remove this once cpplint has better support for + # multi-line comments. + if line.find('/*') >= 0 or line.find('*/') >= 0: + return + + for match in _ALT_TOKEN_REPLACEMENT_PATTERN.finditer(line): + error(filename, linenum, 'readability/alt_tokens', 2, + 'Use operator %s instead of %s' % ( + _ALT_TOKEN_REPLACEMENT[match.group(1)], match.group(1))) + + +def GetLineWidth(line): + """Determines the width of the line in column positions. + + Args: + line: A string, which may be a Unicode string. 
+ + Returns: + The width of the line in column positions, accounting for Unicode + combining characters and wide characters. + """ + if isinstance(line, unicode): + width = 0 + for uc in unicodedata.normalize('NFC', line): + if unicodedata.east_asian_width(uc) in ('W', 'F'): + width += 2 + elif not unicodedata.combining(uc): + width += 1 + return width + else: + return len(line) + + +def CheckStyle(filename, clean_lines, linenum, file_extension, nesting_state, + error): + """Checks rules from the 'C++ style rules' section of cppguide.html. + + Most of these rules are hard to test (naming, comment style), but we + do what we can. In particular we check for 2-space indents, line lengths, + tab usage, spaces inside code, etc. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + + # Don't use "elided" lines here, otherwise we can't check commented lines. + # Don't want to use "raw" either, because we don't want to check inside C++11 + # raw strings, + raw_lines = clean_lines.lines_without_raw_strings + line = raw_lines[linenum] + + if line.find('\t') != -1: + error(filename, linenum, 'whitespace/tab', 1, + 'Tab found; better to use spaces') + + # One or three blank spaces at the beginning of the line is weird; it's + # hard to reconcile that with 2-space indents. + # NOTE: here are the conditions rob pike used for his tests. Mine aren't + # as sophisticated, but it may be worth becoming so: RLENGTH==initial_spaces + # if(RLENGTH > 20) complain = 0; + # if(match($0, " +(error|private|public|protected):")) complain = 0; + # if(match(prev, "&& *$")) complain = 0; + # if(match(prev, "\\|\\| *$")) complain = 0; + # if(match(prev, "[\",=><] *$")) complain = 0; + # if(match($0, " <<")) complain = 0; + # if(match(prev, " +for \\(")) complain = 0; + # if(prevodd && match(prevprev, " +for \\(")) complain = 0; + scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$' + classinfo = nesting_state.InnermostClass() + initial_spaces = 0 + cleansed_line = clean_lines.elided[linenum] + while initial_spaces < len(line) and line[initial_spaces] == ' ': + initial_spaces += 1 + if line and line[-1].isspace(): + error(filename, linenum, 'whitespace/end_of_line', 4, + 'Line ends in whitespace. Consider deleting these extra spaces.') + # There are certain situations we allow one space, notably for + # section labels, and also lines containing multi-line raw strings. + elif ((initial_spaces == 1 or initial_spaces == 3) and + not Match(scope_or_label_pattern, cleansed_line) and + not (clean_lines.raw_lines[linenum] != line and + Match(r'^\s*""', line))): + error(filename, linenum, 'whitespace/indent', 3, + 'Weird number of spaces at line-start. ' + 'Are you using a 2-space indent?') + + # Check if the line is a header guard. + is_header_guard = False + if file_extension == 'h': + cppvar = GetHeaderGuardCPPVariable(filename) + if (line.startswith('#ifndef %s' % cppvar) or + line.startswith('#define %s' % cppvar) or + line.startswith('#endif // %s' % cppvar)): + is_header_guard = True + # #include lines and header guards can be long, since there's no clean way to + # split them. + # + # URLs can be long too. It's possible to split these, but it makes them + # harder to cut&paste. 
+ # + # The "$Id:...$" comment may also get very long without it being the + # developers fault. + if (not line.startswith('#include') and not is_header_guard and + not Match(r'^\s*//.*http(s?)://\S*$', line) and + not Match(r'^// \$Id:.*#[0-9]+ \$$', line)): + line_width = GetLineWidth(line) + extended_length = int((_line_length * 1.25)) + if line_width > extended_length: + error(filename, linenum, 'whitespace/line_length', 4, + 'Lines should very rarely be longer than %i characters' % + extended_length) + elif line_width > _line_length: + error(filename, linenum, 'whitespace/line_length', 2, + 'Lines should be <= %i characters long' % _line_length) + + if (cleansed_line.count(';') > 1 and + # for loops are allowed two ;'s (and may run over two lines). + cleansed_line.find('for') == -1 and + (GetPreviousNonBlankLine(clean_lines, linenum)[0].find('for') == -1 or + GetPreviousNonBlankLine(clean_lines, linenum)[0].find(';') != -1) and + # It's ok to have many commands in a switch case that fits in 1 line + not ((cleansed_line.find('case ') != -1 or + cleansed_line.find('default:') != -1) and + cleansed_line.find('break;') != -1)): + error(filename, linenum, 'whitespace/newline', 0, + 'More than one command on the same line') + + # Some more style checks + CheckBraces(filename, clean_lines, linenum, error) + CheckTrailingSemicolon(filename, clean_lines, linenum, error) + CheckEmptyBlockBody(filename, clean_lines, linenum, error) + CheckAccess(filename, clean_lines, linenum, nesting_state, error) + CheckSpacing(filename, clean_lines, linenum, nesting_state, error) + CheckOperatorSpacing(filename, clean_lines, linenum, error) + CheckParenthesisSpacing(filename, clean_lines, linenum, error) + CheckCommaSpacing(filename, clean_lines, linenum, error) + CheckBracesSpacing(filename, clean_lines, linenum, error) + CheckSpacingForFunctionCall(filename, clean_lines, linenum, error) + CheckRValueReference(filename, clean_lines, linenum, nesting_state, error) + CheckCheck(filename, clean_lines, linenum, error) + CheckAltTokens(filename, clean_lines, linenum, error) + classinfo = nesting_state.InnermostClass() + if classinfo: + CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error) + + +_RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$') +# Matches the first component of a filename delimited by -s and _s. That is: +# _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo-bar_baz.cc').group(0) == 'foo' +# _RE_FIRST_COMPONENT.match('foo_bar-baz.cc').group(0) == 'foo' +_RE_FIRST_COMPONENT = re.compile(r'^[^-_.]+') + + +def _DropCommonSuffixes(filename): + """Drops common suffixes like _test.cc or -inl.h from filename. + + For example: + >>> _DropCommonSuffixes('foo/foo-inl.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/bar/foo.cc') + 'foo/bar/foo' + >>> _DropCommonSuffixes('foo/foo_internal.h') + 'foo/foo' + >>> _DropCommonSuffixes('foo/foo_unusualinternal.h') + 'foo/foo_unusualinternal' + + Args: + filename: The input filename. + + Returns: + The filename with the common suffix removed. 
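+ 
+ Note that a '-' or '_' separator is required before the suffix:
+ >>> _DropCommonSuffixes('foo/foo_test.cc')
+ 'foo/foo'
+ >>> _DropCommonSuffixes('foo/footest.cc')
+ 'foo/footest'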
+ """ + for suffix in ('test.cc', 'regtest.cc', 'unittest.cc', + 'inl.h', 'impl.h', 'internal.h'): + if (filename.endswith(suffix) and len(filename) > len(suffix) and + filename[-len(suffix) - 1] in ('-', '_')): + return filename[:-len(suffix) - 1] + return os.path.splitext(filename)[0] + + +def _IsTestFilename(filename): + """Determines if the given filename has a suffix that identifies it as a test. + + Args: + filename: The input filename. + + Returns: + True if 'filename' looks like a test, False otherwise. + """ + if (filename.endswith('_test.cc') or + filename.endswith('_unittest.cc') or + filename.endswith('_regtest.cc')): + return True + else: + return False + + +def _ClassifyInclude(fileinfo, include, is_system): + """Figures out what kind of header 'include' is. + + Args: + fileinfo: The current file cpplint is running over. A FileInfo instance. + include: The path to a #included file. + is_system: True if the #include used <> rather than "". + + Returns: + One of the _XXX_HEADER constants. + + For example: + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'stdio.h', True) + _C_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'string', True) + _CPP_SYS_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/foo.h', False) + _LIKELY_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo_unknown_extension.cc'), + ... 'bar/foo_other_ext.h', False) + _POSSIBLE_MY_HEADER + >>> _ClassifyInclude(FileInfo('foo/foo.cc'), 'foo/bar.h', False) + _OTHER_HEADER + """ + # This is a list of all standard c++ header files, except + # those already checked for above. + is_cpp_h = include in _CPP_HEADERS + + if is_system: + if is_cpp_h: + return _CPP_SYS_HEADER + else: + return _C_SYS_HEADER + + # If the target file and the include we're checking share a + # basename when we drop common extensions, and the include + # lives in . , then it's likely to be owned by the target file. + target_dir, target_base = ( + os.path.split(_DropCommonSuffixes(fileinfo.RepositoryName()))) + include_dir, include_base = os.path.split(_DropCommonSuffixes(include)) + if target_base == include_base and ( + include_dir == target_dir or + include_dir == os.path.normpath(target_dir + '/../public')): + return _LIKELY_MY_HEADER + + # If the target and include share some initial basename + # component, it's possible the target is implementing the + # include, so it's allowed to be first, but we'll never + # complain if it's not there. + target_first_component = _RE_FIRST_COMPONENT.match(target_base) + include_first_component = _RE_FIRST_COMPONENT.match(include_base) + if (target_first_component and include_first_component and + target_first_component.group(0) == + include_first_component.group(0)): + return _POSSIBLE_MY_HEADER + + return _OTHER_HEADER + + + +def CheckIncludeLine(filename, clean_lines, linenum, include_state, error): + """Check rules that are applicable to #include lines. + + Strings on #include lines are NOT removed from elided line, to make + certain tasks easier. However, to prevent false positives, checks + applicable to #include lines in CheckLanguage must be put here. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + include_state: An _IncludeState instance in which the headers are inserted. + error: The function to call with any errors found. 
+ """ + fileinfo = FileInfo(filename) + line = clean_lines.lines[linenum] + + # "include" should use the new style "foo/bar.h" instead of just "bar.h" + # Only do this check if the included header follows google naming + # conventions. If not, assume that it's a 3rd party API that + # requires special include conventions. + # + # We also make an exception for Lua headers, which follow google + # naming convention but not the include convention. + match = Match(r'#include\s*"([^/]+\.h)"', line) + if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)): + error(filename, linenum, 'build/include', 4, + 'Include the directory when naming .h files') + + # we shouldn't include a file more than once. actually, there are a + # handful of instances where doing so is okay, but in general it's + # not. + match = _RE_PATTERN_INCLUDE.search(line) + if match: + include = match.group(2) + is_system = (match.group(1) == '<') + duplicate_line = include_state.FindHeader(include) + if duplicate_line >= 0: + error(filename, linenum, 'build/include', 4, + '"%s" already included at %s:%s' % + (include, filename, duplicate_line)) + elif (include.endswith('.cc') and + os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)): + error(filename, linenum, 'build/include', 4, + 'Do not include .cc files from other packages') + elif not _THIRD_PARTY_HEADERS_PATTERN.match(include): + include_state.include_list[-1].append((include, linenum)) + + # We want to ensure that headers appear in the right order: + # 1) for foo.cc, foo.h (preferred location) + # 2) c system files + # 3) cpp system files + # 4) for foo.cc, foo.h (deprecated location) + # 5) other google headers + # + # We classify each include statement as one of those 5 types + # using a number of techniques. The include_state object keeps + # track of the highest type seen, and complains if we see a + # lower type after that. + error_message = include_state.CheckNextIncludeOrder( + _ClassifyInclude(fileinfo, include, is_system)) + if error_message: + error(filename, linenum, 'build/include_order', 4, + '%s. Should be: %s.h, c system, c++ system, other.' % + (error_message, fileinfo.BaseName())) + canonical_include = include_state.CanonicalizeAlphabeticalOrder(include) + if not include_state.IsInAlphabeticalOrder( + clean_lines, linenum, canonical_include): + error(filename, linenum, 'build/include_alpha', 4, + 'Include "%s" not in alphabetical order' % include) + include_state.SetLastHeader(canonical_include) + + + +def _GetTextInside(text, start_pattern): + r"""Retrieves all the text between matching open and close parentheses. + + Given a string of lines and a regular expression string, retrieve all the text + following the expression and between opening punctuation symbols like + (, [, or {, and the matching close-punctuation symbol. This properly nested + occurrences of the punctuations, so for the text like + printf(a(), b(c())); + a call to _GetTextInside(text, r'printf\(') will return 'a(), b(c())'. + start_pattern must match string having an open punctuation symbol at the end. + + Args: + text: The lines to extract text. Its comments and strings must be elided. + It can be single line and can span multiple lines. + start_pattern: The regexp string indicating where to start extracting + the text. + Returns: + The extracted text. + None if either the opening string or ending punctuation could not be found. 
+ """ + # TODO(unknown): Audit cpplint.py to see what places could be profitably + # rewritten to use _GetTextInside (and use inferior regexp matching today). + + # Give opening punctuations to get the matching close-punctuations. + matching_punctuation = {'(': ')', '{': '}', '[': ']'} + closing_punctuation = set(matching_punctuation.itervalues()) + + # Find the position to start extracting text. + match = re.search(start_pattern, text, re.M) + if not match: # start_pattern not found in text. + return None + start_position = match.end(0) + + assert start_position > 0, ( + 'start_pattern must ends with an opening punctuation.') + assert text[start_position - 1] in matching_punctuation, ( + 'start_pattern must ends with an opening punctuation.') + # Stack of closing punctuations we expect to have in text after position. + punctuation_stack = [matching_punctuation[text[start_position - 1]]] + position = start_position + while punctuation_stack and position < len(text): + if text[position] == punctuation_stack[-1]: + punctuation_stack.pop() + elif text[position] in closing_punctuation: + # A closing punctuation without matching opening punctuations. + return None + elif text[position] in matching_punctuation: + punctuation_stack.append(matching_punctuation[text[position]]) + position += 1 + if punctuation_stack: + # Opening punctuations left without matching close-punctuations. + return None + # punctuations match. + return text[start_position:position - 1] + + +# Patterns for matching call-by-reference parameters. +# +# Supports nested templates up to 2 levels deep using this messy pattern: +# < (?: < (?: < [^<>]* +# > +# | [^<>] )* +# > +# | [^<>] )* +# > +_RE_PATTERN_IDENT = r'[_a-zA-Z]\w*' # =~ [[:alpha:]][[:alnum:]]* +_RE_PATTERN_TYPE = ( + r'(?:const\s+)?(?:typename\s+|class\s+|struct\s+|union\s+|enum\s+)?' + r'(?:\w|' + r'\s*<(?:<(?:<[^<>]*>|[^<>])*>|[^<>])*>|' + r'::)+') +# A call-by-reference parameter ends with '& identifier'. +_RE_PATTERN_REF_PARAM = re.compile( + r'(' + _RE_PATTERN_TYPE + r'(?:\s*(?:\bconst\b|[*]))*\s*' + r'&\s*' + _RE_PATTERN_IDENT + r')\s*(?:=[^,()]+)?[,)]') +# A call-by-const-reference parameter either ends with 'const& identifier' +# or looks like 'const type& identifier' when 'type' is atomic. +_RE_PATTERN_CONST_REF_PARAM = ( + r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT + + r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')') + + +def CheckLanguage(filename, clean_lines, linenum, file_extension, + include_state, nesting_state, error): + """Checks rules from the 'C++ language rules' section of cppguide.html. + + Some of these rules are hard to test (function overloading, using + uint32 inappropriately), but we do the best we can. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + file_extension: The extension (without the dot) of the filename. + include_state: An _IncludeState instance in which the headers are inserted. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # If the line is empty or consists of entirely a comment, no need to + # check it. 
+ line = clean_lines.elided[linenum] + if not line: + return + + match = _RE_PATTERN_INCLUDE.search(line) + if match: + CheckIncludeLine(filename, clean_lines, linenum, include_state, error) + return + + # Reset include state across preprocessor directives. This is meant + # to silence warnings for conditional includes. + match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line) + if match: + include_state.ResetSection(match.group(1)) + + # Make Windows paths like Unix. + fullname = os.path.abspath(filename).replace('\\', '/') + + # Perform other checks now that we are sure that this is not an include line + CheckCasts(filename, clean_lines, linenum, error) + CheckGlobalStatic(filename, clean_lines, linenum, error) + CheckPrintf(filename, clean_lines, linenum, error) + + if file_extension == 'h': + # TODO(unknown): check that 1-arg constructors are explicit. + # How to tell it's a constructor? + # (handled in CheckForNonStandardConstructs for now) + # TODO(unknown): check that classes declare or disable copy/assign + # (level 1 error) + pass + + # Check if people are using the verboten C basic types. The only exception + # we regularly allow is "unsigned short port" for port. + if Search(r'\bshort port\b', line): + if not Search(r'\bunsigned short port\b', line): + error(filename, linenum, 'runtime/int', 4, + 'Use "unsigned short" for ports, not "short"') + else: + match = Search(r'\b(short|long(?! +double)|long long)\b', line) + if match: + error(filename, linenum, 'runtime/int', 4, + 'Use int16/int64/etc, rather than the C type %s' % match.group(1)) + + # Check if some verboten operator overloading is going on + # TODO(unknown): catch out-of-line unary operator&: + # class X {}; + # int operator&(const X& x) { return 42; } // unary operator& + # The trick is it's hard to tell apart from binary operator&: + # class Y { int operator&(const Y& x) { return 23; } }; // binary operator& + if Search(r'\boperator\s*&\s*\(\s*\)', line): + error(filename, linenum, 'runtime/operator', 4, + 'Unary operator& is dangerous. Do not use it.') + + # Check for suspicious usage of "if" like + # } if (a == b) { + if Search(r'\}\s*if\s*\(', line): + error(filename, linenum, 'readability/braces', 4, + 'Did you mean "else if"? If not, start a new line for "if".') + + # Check for potential format string bugs like printf(foo). + # We constrain the pattern not to pick things like DocidForPrintf(foo). + # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str()) + # TODO(unknown): Catch the following case. Need to change the calling + # convention of the whole function to process multiple line to handle it. + # printf( + # boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line); + printf_args = _GetTextInside(line, r'(?i)\b(string)?printf\s*\(') + if printf_args: + match = Match(r'([\w.\->()]+)$', printf_args) + if match and match.group(1) != '__VA_ARGS__': + function_name = re.search(r'\b((?:string)?printf)\s*\(', + line, re.I).group(1) + error(filename, linenum, 'runtime/printf', 4, + 'Potential format string bug. Do %s("%%s", %s) instead.' + % (function_name, match.group(1))) + + # Check for potential memset bugs like memset(buf, sizeof(buf), 0). + match = Search(r'memset\s*\(([^,]*),\s*([^,]*),\s*0\s*\)', line) + if match and not Match(r"^''|-?[0-9]+|0x[0-9A-Fa-f]$", match.group(2)): + error(filename, linenum, 'runtime/memset', 4, + 'Did you mean "memset(%s, 0, %s)"?' 
+ % (match.group(1), match.group(2))) + + if Search(r'\busing namespace\b', line): + error(filename, linenum, 'build/namespaces', 5, + 'Do not use namespace using-directives. ' + 'Use using-declarations instead.') + + # Detect variable-length arrays. + match = Match(r'\s*(.+::)?(\w+) [a-z]\w*\[(.+)];', line) + if (match and match.group(2) != 'return' and match.group(2) != 'delete' and + match.group(3).find(']') == -1): + # Split the size using space and arithmetic operators as delimiters. + # If any of the resulting tokens are not compile time constants then + # report the error. + tokens = re.split(r'\s|\+|\-|\*|\/|<<|>>]', match.group(3)) + is_const = True + skip_next = False + for tok in tokens: + if skip_next: + skip_next = False + continue + + if Search(r'sizeof\(.+\)', tok): continue + if Search(r'arraysize\(\w+\)', tok): continue + + tok = tok.lstrip('(') + tok = tok.rstrip(')') + if not tok: continue + if Match(r'\d+', tok): continue + if Match(r'0[xX][0-9a-fA-F]+', tok): continue + if Match(r'k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?k[A-Z0-9]\w*', tok): continue + if Match(r'(.+::)?[A-Z][A-Z0-9_]*', tok): continue + # A catch all for tricky sizeof cases, including 'sizeof expression', + # 'sizeof(*type)', 'sizeof(const type)', 'sizeof(struct StructName)' + # requires skipping the next token because we split on ' ' and '*'. + if tok.startswith('sizeof'): + skip_next = True + continue + is_const = False + break + if not is_const: + error(filename, linenum, 'runtime/arrays', 1, + 'Do not use variable-length arrays. Use an appropriately named ' + "('k' followed by CamelCase) compile-time constant for the size.") + + # Check for use of unnamed namespaces in header files. Registration + # macros are typically OK, so we allow use of "namespace {" on lines + # that end with backslashes. + if (file_extension == 'h' + and Search(r'\bnamespace\s*{', line) + and line[-1] != '\\'): + error(filename, linenum, 'build/namespaces', 4, + 'Do not use unnamed namespaces in header files. See ' + 'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces' + ' for more information.') + + +def CheckGlobalStatic(filename, clean_lines, linenum, error): + """Check for unsafe global or static objects. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Match two lines at a time to support multiline declarations + if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line): + line += clean_lines.elided[linenum + 1].strip() + + # Check for people declaring static/global STL strings at the top level. + # This is dangerous because the C++ language does not guarantee that + # globals with constructors are initialized before the first access. + match = Match( + r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)', + line) + + # Remove false positives: + # - String pointers (as opposed to values). + # string *pointer + # const string *pointer + # string const *pointer + # string *const pointer + # + # - Functions and template specializations. + # string Function(... + # string Class::Method(... + # + # - Operators. These are matched separately because operator names + # cross non-word boundaries, and trying to match both operators + # and functions at the same time would decrease accuracy of + # matching identifiers. 
+ # string Class::operator*() + if (match and + not Search(r'\bstring\b(\s+const)?\s*\*\s*(const\s+)?\w', line) and + not Search(r'\boperator\W', line) and + not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(3))): + error(filename, linenum, 'runtime/string', 4, + 'For a static/global string constant, use a C style string instead: ' + '"%schar %s[]".' % + (match.group(1), match.group(2))) + + if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line): + error(filename, linenum, 'runtime/init', 4, + 'You seem to be initializing a member variable with itself.') + + +def CheckPrintf(filename, clean_lines, linenum, error): + """Check for printf related issues. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # When snprintf is used, the second argument shouldn't be a literal. + match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line) + if match and match.group(2) != '0': + # If 2nd arg is zero, snprintf is used to calculate size. + error(filename, linenum, 'runtime/printf', 3, + 'If you can, use sizeof(%s) instead of %s as the 2nd arg ' + 'to snprintf.' % (match.group(1), match.group(2))) + + # Check if some verboten C functions are being used. + if Search(r'\bsprintf\s*\(', line): + error(filename, linenum, 'runtime/printf', 5, + 'Never use sprintf. Use snprintf instead.') + match = Search(r'\b(strcpy|strcat)\s*\(', line) + if match: + error(filename, linenum, 'runtime/printf', 4, + 'Almost always, snprintf is better than %s' % match.group(1)) + + +def IsDerivedFunction(clean_lines, linenum): + """Check if current line contains an inherited function. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains a function with "override" + virt-specifier. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i]) + if match: + # Look for "override" after the matching closing parenthesis + line, _, closing_paren = CloseExpression( + clean_lines, i, len(match.group(1))) + return (closing_paren >= 0 and + Search(r'\boverride\b', line[closing_paren:])) + return False + + +def IsOutOfLineMethodDefinition(clean_lines, linenum): + """Check if current line contains an out-of-line method definition. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line contains an out-of-line method definition. + """ + # Scan back a few lines for start of current function + for i in xrange(linenum, max(-1, linenum - 10), -1): + if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]): + return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None + return False + + +def IsInitializerList(clean_lines, linenum): + """Check if current line is inside constructor initializer list. + + Args: + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + Returns: + True if current line appears to be inside constructor initializer + list, False otherwise. 
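+ 
+ For example, given
+ Point::Point(int x, int y)
+ : x_(x),
+ y_(y) {
+ the lines between the ':' and the opening '{' report True, because the
+ backward scan reaches ': x_(x),' before any line ending in '{', '}' or ';'.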
+ """ + for i in xrange(linenum, 1, -1): + line = clean_lines.elided[i] + if i == linenum: + remove_function_body = Match(r'^(.*)\{\s*$', line) + if remove_function_body: + line = remove_function_body.group(1) + + if Search(r'\s:\s*\w+[({]', line): + # A lone colon tend to indicate the start of a constructor + # initializer list. It could also be a ternary operator, which + # also tend to appear in constructor initializer lists as + # opposed to parameter lists. + return True + if Search(r'\}\s*,\s*$', line): + # A closing brace followed by a comma is probably the end of a + # brace-initialized member in constructor initializer list. + return True + if Search(r'[{};]\s*$', line): + # Found one of the following: + # - A closing brace or semicolon, probably the end of the previous + # function. + # - An opening brace, probably the start of current class or namespace. + # + # Current line is probably not inside an initializer list since + # we saw one of those things without seeing the starting colon. + return False + + # Got to the beginning of the file without seeing the start of + # constructor initializer list. + return False + + +def CheckForNonConstReference(filename, clean_lines, linenum, + nesting_state, error): + """Check for non-const references. + + Separate from CheckLanguage since it scans backwards from current + line, instead of scanning forward. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: The function to call with any errors found. + """ + # Do nothing if there is no '&' on current line. + line = clean_lines.elided[linenum] + if '&' not in line: + return + + # If a function is inherited, current function doesn't have much of + # a choice, so any non-const references should not be blamed on + # derived function. + if IsDerivedFunction(clean_lines, linenum): + return + + # Don't warn on out-of-line method definitions, as we would warn on the + # in-line declaration, if it isn't marked with 'override'. + if IsOutOfLineMethodDefinition(clean_lines, linenum): + return + + # Long type names may be broken across multiple lines, usually in one + # of these forms: + # LongType + # ::LongTypeContinued &identifier + # LongType:: + # LongTypeContinued &identifier + # LongType< + # ...>::LongTypeContinued &identifier + # + # If we detected a type split across two lines, join the previous + # line to current line so that we can match const references + # accordingly. + # + # Note that this only scans back one line, since scanning back + # arbitrary number of lines would be expensive. If you have a type + # that spans more than 2 lines, please use a typedef. 
+ if linenum > 1: + previous = None + if Match(r'\s*::(?:[\w<>]|::)+\s*&\s*\S', line): + # previous_line\n + ::current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+[\w<>])\s*$', + clean_lines.elided[linenum - 1]) + elif Match(r'\s*[a-zA-Z_]([\w<>]|::)+\s*&\s*\S', line): + # previous_line::\n + current_line + previous = Search(r'\b((?:const\s*)?(?:[\w<>]|::)+::)\s*$', + clean_lines.elided[linenum - 1]) + if previous: + line = previous.group(1) + line.lstrip() + else: + # Check for templated parameter that is split across multiple lines + endpos = line.rfind('>') + if endpos > -1: + (_, startline, startpos) = ReverseCloseExpression( + clean_lines, linenum, endpos) + if startpos > -1 and startline < linenum: + # Found the matching < on an earlier line, collect all + # pieces up to current line. + line = '' + for i in xrange(startline, linenum + 1): + line += clean_lines.elided[i].strip() + + # Check for non-const references in function parameters. A single '&' may + # found in the following places: + # inside expression: binary & for bitwise AND + # inside expression: unary & for taking the address of something + # inside declarators: reference parameter + # We will exclude the first two cases by checking that we are not inside a + # function body, including one that was just introduced by a trailing '{'. + # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare]. + if (nesting_state.previous_stack_top and + not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or + isinstance(nesting_state.previous_stack_top, _NamespaceInfo))): + # Not at toplevel, not within a class, and not within a namespace + return + + # Avoid initializer lists. We only need to scan back from the + # current line for something that starts with ':'. + # + # We don't need to check the current line, since the '&' would + # appear inside the second set of parentheses on the current line as + # opposed to the first set. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 10), -1): + previous_line = clean_lines.elided[i] + if not Search(r'[),]\s*$', previous_line): + break + if Match(r'^\s*:\s+\S', previous_line): + return + + # Avoid preprocessors + if Search(r'\\\s*$', line): + return + + # Avoid constructor initializer lists + if IsInitializerList(clean_lines, linenum): + return + + # We allow non-const references in a few standard places, like functions + # called "swap()" or iostream operators like "<<" or ">>". Do not check + # those function parameters. + # + # We also accept & in static_assert, which looks like a function but + # it's actually a declaration expression. + whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|' + r'operator\s*[<>][<>]|' + r'static_assert|COMPILE_ASSERT' + r')\s*\(') + if Search(whitelisted_functions, line): + return + elif not Search(r'\S+\([^)]*$', line): + # Don't see a whitelisted function on this line. Actually we + # didn't see any function name on this line, so this is likely a + # multi-line parameter list. Try a bit harder to catch this case. + for i in xrange(2): + if (linenum > i and + Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])): + return + + decls = ReplaceAll(r'{[^}]*}', ' ', line) # exclude function body + for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls): + if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter): + error(filename, linenum, 'runtime/references', 2, + 'Is this a non-const reference? 
'
+ 'If so, make const or use a pointer: ' +
+ ReplaceAll(' *<', '<', parameter))
+ 
+ 
+def CheckCasts(filename, clean_lines, linenum, error):
+ """Various cast related checks.
+ 
+ Args:
+ filename: The name of the current file.
+ clean_lines: A CleansedLines instance containing the file.
+ linenum: The number of the line to check.
+ error: The function to call with any errors found.
+ """
+ line = clean_lines.elided[linenum]
+ 
+ # Check to see if they're using a conversion function cast.
+ # I just try to capture the most common basic types, though there are more.
+ # Parameterless conversion functions, such as bool(), are allowed as they are
+ # probably a member operator declaration or default constructor.
+ match = Search(
+ r'(\bnew\s+|\S<\s*(?:const\s+)?)?\b'
+ r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+ r'(\([^)].*)', line)
+ expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+ if match and not expecting_function:
+ matched_type = match.group(2)
+ 
+ # matched_new_or_template is used to silence two false positives:
+ # - New operators
+ # - Template arguments with function types
+ #
+ # For template arguments, we match on types immediately following
+ # an opening bracket without any spaces. This is a fast way to
+ # silence the common case where the function type is the first
+ # template argument. False negative with less-than comparison is
+ # avoided because those operators are usually followed by a space.
+ #
+ # function<double(double)> // bracket + no space = false positive
+ # value < double(42) // bracket + space = true positive
+ matched_new_or_template = match.group(1)
+ 
+ # Avoid arrays by looking for brackets that come after the closing
+ # parenthesis.
+ if Match(r'\([^()]+\)\s*\[', match.group(3)):
+ return
+ 
+ # Other things to ignore:
+ # - Function pointers
+ # - Casts to pointer types
+ # - Placement new
+ # - Alias declarations
+ matched_funcptr = match.group(3)
+ if (matched_new_or_template is None and
+ not (matched_funcptr and
+ (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+ matched_funcptr) or
+ matched_funcptr.startswith('(*)'))) and
+ not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+ not Search(r'new\(\S+\)\s*' + matched_type, line)):
+ error(filename, linenum, 'readability/casting', 4,
+ 'Using deprecated casting style. '
+ 'Use static_cast<%s>(...) instead' %
+ matched_type)
+ 
+ if not expecting_function:
+ CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+ r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+ 
+ # This doesn't catch all cases. Consider (const char * const)"hello".
+ #
+ # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+ # compile).
+ if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+ r'\((char\s?\*+\s?)\)\s*"', error):
+ pass
+ else:
+ # Check pointer casts for other than string constants
+ CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+ r'\((\w+\s?\*+\s?)\)', error)
+ 
+ # In addition, we look for people taking the address of a cast. This
+ # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+ # point where you think.
+ #
+ # Some non-identifier character is required before the '&' for the
+ # expression to be recognized as a cast. These are casts:
These are casts: + # expression = &static_cast(temporary()); + # function(&(int*)(temporary())); + # + # This is not a cast: + # reference_type&(int* function_param); + match = Search( + r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|' + r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line) + if match: + # Try a better error message when the & is bound to something + # dereferenced by the casted pointer, as opposed to the casted + # pointer itself. + parenthesis_error = False + match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line) + if match: + _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1))) + if x1 >= 0 and clean_lines.elided[y1][x1] == '(': + _, y2, x2 = CloseExpression(clean_lines, y1, x1) + if x2 >= 0: + extended_line = clean_lines.elided[y2][x2:] + if y2 < clean_lines.NumLines() - 1: + extended_line += clean_lines.elided[y2 + 1] + if Match(r'\s*(?:->|\[)', extended_line): + parenthesis_error = True + + if parenthesis_error: + error(filename, linenum, 'readability/casting', 4, + ('Are you taking an address of something dereferenced ' + 'from a cast? Wrapping the dereferenced expression in ' + 'parentheses will make the binding more obvious')) + else: + error(filename, linenum, 'runtime/casting', 4, + ('Are you taking an address of a cast? ' + 'This is dangerous: could be a temp var. ' + 'Take the address before doing the cast, rather than after')) + + +def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error): + """Checks for a C-style cast by looking for the pattern. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + cast_type: The string for the C++ cast to recommend. This is either + reinterpret_cast, static_cast, or const_cast, depending. + pattern: The regular expression used to find C-style casts. + error: The function to call with any errors found. + + Returns: + True if an error was emitted. + False otherwise. + """ + line = clean_lines.elided[linenum] + match = Search(pattern, line) + if not match: + return False + + # Exclude lines with keywords that tend to look like casts + context = line[0:match.start(1) - 1] + if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context): + return False + + # Try expanding current context to see if we one level of + # parentheses inside a macro. + if linenum > 0: + for i in xrange(linenum - 1, max(0, linenum - 5), -1): + context = clean_lines.elided[i] + context + if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context): + return False + + # operator++(int) and operator--(int) + if context.endswith(' operator++') or context.endswith(' operator--'): + return False + + # A single unnamed argument for a function tends to look like old + # style cast. If we see those, don't issue warnings for deprecated + # casts, instead issue warnings for unnamed arguments where + # appropriate. + # + # These are things that we want warnings for, since the style guide + # explicitly require all parameters to be named: + # Function(int); + # Function(int) { + # ConstMember(int) const; + # ConstMember(int) const { + # ExceptionMember(int) throw (...); + # ExceptionMember(int) throw (...) 
{
+  #   PureVirtual(int) = 0;
+  #   [](int) -> bool {
+  #
+  # These are functions of some sort, where the compiler would be fine
+  # if they had named parameters, but people often omit those
+  # identifiers to reduce clutter:
+  #   (FunctionPointer)(int);
+  #   (FunctionPointer)(int) = value;
+  #   Function((function_pointer_arg)(int))
+  #   Function((function_pointer_arg)(int), int param)
+  #   <TemplateArgument(int)>;
+  #   <(FunctionPointerTemplateArgument)(int)>;
+  remainder = line[match.end(0):]
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
+    # Looks like an unnamed parameter.
+
+    # Don't warn on any kind of template arguments.
+    if Match(r'^\s*>', remainder):
+      return False
+
+    # Don't warn on assignments to function pointers, but keep warnings for
+    # unnamed parameters to pure virtual functions. Note that this pattern
+    # will also pass on assignments of "0" to function pointers, but the
+    # preferred values for those would be "nullptr" or "NULL".
+    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
+    if matched_zero and matched_zero.group(1) != '0':
+      return False
+
+    # Don't warn on function pointer declarations. For this we need
+    # to check what came before the "(type)" string.
+    if Match(r'.*\)\s*$', line[0:match.start(0)]):
+      return False
+
+    # Don't warn if the parameter is named with block comments, e.g.:
+    #  Function(int /*unused_param*/);
+    raw_line = clean_lines.raw_lines[linenum]
+    if '/*' in raw_line:
+      return False
+
+    # Passed all filters, issue warning here.
+    error(filename, linenum, 'readability/function', 3,
+          'All parameters should be named in a function')
+    return True
+
+  # At this point, all that should be left is actual casts.
+  error(filename, linenum, 'readability/casting', 4,
+        'Using C-style cast. Use %s<%s>(...) instead' %
+        (cast_type, match.group(1)))
+
+  return True
+
+
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
+_HEADERS_CONTAINING_TEMPLATES = (
+    ('<deque>', ('deque',)),
+    ('<functional>', ('unary_function', 'binary_function',
+                      'plus', 'minus', 'multiplies', 'divides', 'modulus',
+                      'negate',
+                      'equal_to', 'not_equal_to', 'greater', 'less',
+                      'greater_equal', 'less_equal',
+                      'logical_and', 'logical_or', 'logical_not',
+                      'unary_negate', 'not1', 'binary_negate', 'not2',
+                      'bind1st', 'bind2nd',
+                      'pointer_to_unary_function',
+                      'pointer_to_binary_function',
+                      'ptr_fun',
+                      'mem_fun_t', 'mem_fun', 'mem_fun1_t', 'mem_fun1_ref_t',
+                      'mem_fun_ref_t',
+                      'const_mem_fun_t', 'const_mem_fun1_t',
+                      'const_mem_fun_ref_t', 'const_mem_fun1_ref_t',
+                      'mem_fun_ref',
+                     )),
+    ('<limits>', ('numeric_limits',)),
+    ('<list>', ('list',)),
+    ('<map>', ('map', 'multimap',)),
+    ('<memory>', ('allocator',)),
+    ('<queue>', ('queue', 'priority_queue',)),
+    ('<set>', ('set', 'multiset',)),
+    ('<stack>', ('stack',)),
+    ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<utility>', ('pair',)),
+    ('<vector>', ('vector',)),
+
+    # gcc extensions.
+    # Note: std::hash is their hash, ::hash is our hash
+    ('<hash_map>', ('hash_map', 'hash_multimap',)),
+    ('<hash_set>', ('hash_set', 'hash_multiset',)),
+    ('<slist>', ('slist',)),
+    )
+
+_RE_PATTERN_STRING = re.compile(r'\bstring\b')
+
+_re_pattern_algorithm_header = []
+for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
+                  'transform'):
+  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+  # type::max().
+  _re_pattern_algorithm_header.append(
+      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+       _template,
+       '<algorithm>'))
+
+_re_pattern_templates = []
+for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
+  for _template in _templates:
+    _re_pattern_templates.append(
+        (re.compile(r'(\<|\b)' + _template + r'\s*\<'),
+         _template + '<>',
+         _header))
+
+
+def FilesBelongToSameModule(filename_cc, filename_h):
+  """Check if these two filenames belong to the same module.
+
+  The concept of a 'module' here is as follows:
+  foo.h, foo-inl.h, foo.cc, foo_test.cc and foo_unittest.cc belong to the
+  same 'module' if they are in the same directory.
+  some/path/public/xyzzy and some/path/internal/xyzzy are also considered
+  to belong to the same module here.
+
+  If the filename_cc contains a longer path than the filename_h, for example,
+  '/absolute/path/to/base/sysinfo.cc', and this file would include
+  'base/sysinfo.h', this function also produces the prefix needed to open the
+  header. This is used by the caller of this function to more robustly open the
+  header file. We don't have access to the real include paths in this context,
+  so we need this guesswork here.
+
+  Known bugs: tools/base/bar.cc and base/bar.h belong to the same module
+  according to this implementation. Because of this, this function gives
+  some false positives. This should be sufficiently rare in practice.
+
+  Args:
+    filename_cc: is the path for the .cc file
+    filename_h: is the path for the header path
+
+  Returns:
+    Tuple with a bool and a string:
+    bool: True if filename_cc and filename_h belong to the same module.
+    string: the additional prefix needed to open the header file.
+  """
+
+  if not filename_cc.endswith('.cc'):
+    return (False, '')
+  filename_cc = filename_cc[:-len('.cc')]
+  if filename_cc.endswith('_unittest'):
+    filename_cc = filename_cc[:-len('_unittest')]
+  elif filename_cc.endswith('_test'):
+    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc.replace('/public/', '/')
+  filename_cc = filename_cc.replace('/internal/', '/')
+
+  if not filename_h.endswith('.h'):
+    return (False, '')
+  filename_h = filename_h[:-len('.h')]
+  if filename_h.endswith('-inl'):
+    filename_h = filename_h[:-len('-inl')]
+  filename_h = filename_h.replace('/public/', '/')
+  filename_h = filename_h.replace('/internal/', '/')
+
+  files_belong_to_same_module = filename_cc.endswith(filename_h)
+  common_path = ''
+  if files_belong_to_same_module:
+    common_path = filename_cc[:-len(filename_h)]
+  return files_belong_to_same_module, common_path
+
+
+def UpdateIncludeState(filename, include_dict, io=codecs):
+  """Fill up the include_dict with new includes found from the file.
+
+  Args:
+    filename: the name of the header to read.
+    include_dict: a dictionary in which the headers are inserted.
+    io: The io factory to use to read the file. Provided for testability.
+
+  Returns:
+    True if a header was successfully added. False otherwise.
+  """
+  headerfile = None
+  try:
+    headerfile = io.open(filename, 'r', 'utf8', 'replace')
+  except IOError:
+    return False
+  linenum = 0
+  for line in headerfile:
+    linenum += 1
+    clean_line = CleanseComments(line)
+    match = _RE_PATTERN_INCLUDE.search(clean_line)
+    if match:
+      include = match.group(2)
+      include_dict.setdefault(include, linenum)
+  return True
+
+
+def CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error,
+                              io=codecs):
+  """Reports for missing stl includes.
+
+  This function will output warnings to make sure you are including the headers
+  necessary for the stl containers and functions that you use. We only give one
+  reason to include a header. For example, if you use both equal_to<> and
+  less<> in a .h file, only one (the latter in the file) of these will be
+  reported as a reason to include the <functional> header.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    include_state: An _IncludeState instance.
+    error: The function to call with any errors found.
+    io: The IO factory to use to read the header file. Provided for unittest
+        injection.
+  """
+  required = {}  # A map of header name to linenumber and the template entity.
+                 # Example of required: { '<functional>': (1219, 'less<>') }
+
+  for linenum in xrange(clean_lines.NumLines()):
+    line = clean_lines.elided[linenum]
+    if not line or line[0] == '#':
+      continue
+
+    # String is special -- it is a non-templatized type in STL.
+    matched = _RE_PATTERN_STRING.search(line)
+    if matched:
+      # Don't warn about strings in non-STL namespaces:
+      # (We check only the first match per line; good enough.)
+      prefix = line[:matched.start()]
+      if prefix.endswith('std::') or not prefix.endswith('::'):
+        required['<string>'] = (linenum, 'string')
+
+    for pattern, template, header in _re_pattern_algorithm_header:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+    # The following function is just a speed up, no semantics are changed.
+    if not '<' in line:  # Reduces the cpu time usage by skipping lines.
+      continue
+
+    for pattern, template, header in _re_pattern_templates:
+      if pattern.search(line):
+        required[header] = (linenum, template)
+
+  # The policy is that if you #include something in foo.h you don't need to
+  # include it again in foo.cc. Here, we will look at possible includes.
+  # Let's flatten the include_state include_list and copy it into a dictionary.
+  include_dict = dict([item for sublist in include_state.include_list
+                       for item in sublist])
+
+  # Did we find the header for this file (if any) and successfully load it?
+  header_found = False
+
+  # Use the absolute path so that matching works properly.
+  abs_filename = FileInfo(filename).FullName()
+
+  # For Emacs's flymake.
+  # If cpplint is invoked from Emacs's flymake, a temporary file is generated
+  # by flymake and that file name might end with '_flymake.cc'. In that case,
+  # restore original file name here so that the corresponding header file can be
+  # found.
+  # e.g. If the file name is 'foo_flymake.cc', we should search for 'foo.h'
+  # instead of 'foo_flymake.h'
+  abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
+
+  # include_dict is modified during iteration, so we iterate over a copy of
+  # the keys.
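+  # (Illustrative note: under Python 2, keys() returns a list snapshot, so
+  # the loop below stays safe even though UpdateIncludeState adds entries to
+  # include_dict while we iterate.)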
+ header_keys = include_dict.keys() + for header in header_keys: + (same_module, common_path) = FilesBelongToSameModule(abs_filename, header) + fullpath = common_path + header + if same_module and UpdateIncludeState(fullpath, include_dict, io): + header_found = True + + # If we can't find the header file for a .cc, assume it's because we don't + # know where to look. In that case we'll give up as we're not sure they + # didn't include it in the .h file. + # TODO(unknown): Do a better job of finding .h files so we are confident that + # not having the .h file means there isn't one. + if filename.endswith('.cc') and not header_found: + return + + # All the lines have been processed, report the errors found. + for required_header_unstripped in required: + template = required[required_header_unstripped][1] + if required_header_unstripped.strip('<>"') not in include_dict: + error(filename, required[required_header_unstripped][0], + 'build/include_what_you_use', 4, + 'Add #include ' + required_header_unstripped + ' for ' + template) + + +_RE_PATTERN_EXPLICIT_MAKEPAIR = re.compile(r'\bmake_pair\s*<') + + +def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error): + """Check that make_pair's template arguments are deduced. + + G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are + specified explicitly, and such use isn't intended in any case. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + match = _RE_PATTERN_EXPLICIT_MAKEPAIR.search(line) + if match: + error(filename, linenum, 'build/explicit_make_pair', + 4, # 4 = high confidence + 'For C++11-compatibility, omit template arguments from make_pair' + ' OR use pair directly OR if appropriate, construct a pair directly') + + +def CheckDefaultLambdaCaptures(filename, clean_lines, linenum, error): + """Check that default lambda captures are not used. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # A lambda introducer specifies a default capture if it starts with "[=" + # or if it starts with "[&" _not_ followed by an identifier. + match = Match(r'^(.*)\[\s*(?:=|&[^\w])', line) + if match: + # Found a potential error, check what comes after the lambda-introducer. + # If it's not open parenthesis (for lambda-declarator) or open brace + # (for compound-statement), it's not a lambda. + line, _, pos = CloseExpression(clean_lines, linenum, len(match.group(1))) + if pos >= 0 and Match(r'^\s*[{(]', line[pos:]): + error(filename, linenum, 'build/c++11', + 4, # 4 = high confidence + 'Default lambda captures are an unapproved C++ feature.') + + +def CheckRedundantVirtual(filename, clean_lines, linenum, error): + """Check if line contains a redundant "virtual" function-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for "virtual" on current line. + line = clean_lines.elided[linenum] + virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line) + if not virtual: return + + # Ignore "virtual" keywords that are near access-specifiers. 
These + # are only used in class base-specifier and do not apply to member + # functions. + if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or + Match(r'^\s+(public|protected|private)\b', virtual.group(3))): + return + + # Ignore the "virtual" keyword from virtual base classes. Usually + # there is a column on the same line in these cases (virtual base + # classes are rare in google3 because multiple inheritance is rare). + if Match(r'^.*[^:]:[^:].*$', line): return + + # Look for the next opening parenthesis. This is the start of the + # parameter list (possibly on the next line shortly after virtual). + # TODO(unknown): doesn't work if there are virtual functions with + # decltype() or other things that use parentheses, but csearch suggests + # that this is rare. + end_col = -1 + end_line = -1 + start_col = len(virtual.group(2)) + for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())): + line = clean_lines.elided[start_line][start_col:] + parameter_list = Match(r'^([^(]*)\(', line) + if parameter_list: + # Match parentheses to find the end of the parameter list + (_, end_line, end_col) = CloseExpression( + clean_lines, start_line, start_col + len(parameter_list.group(1))) + break + start_col = 0 + + if end_col < 0: + return # Couldn't find end of parameter list, give up + + # Look for "override" or "final" after the parameter list + # (possibly on the next few lines). + for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())): + line = clean_lines.elided[i][end_col:] + match = Search(r'\b(override|final)\b', line) + if match: + error(filename, linenum, 'readability/inheritance', 4, + ('"virtual" is redundant since function is ' + 'already declared as "%s"' % match.group(1))) + + # Set end_col to check whole lines after we are done with the + # first line. + end_col = 0 + if Search(r'[^\w]\s*$', line): + break + + +def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error): + """Check if line contains a redundant "override" or "final" virt-specifier. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + # Look for closing parenthesis nearby. We need one to confirm where + # the declarator ends and where the virt-specifier starts to avoid + # false positives. + line = clean_lines.elided[linenum] + declarator_end = line.rfind(')') + if declarator_end >= 0: + fragment = line[declarator_end:] + else: + if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0: + fragment = line + else: + return + + # Check that at most one of "override" or "final" is present, not both + if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment): + error(filename, linenum, 'readability/inheritance', 4, + ('"override" is redundant since function is ' + 'already declared as "final"')) + + + + +# Returns true if we are at a new block, and it is directly +# inside of a namespace. +def IsBlockInNameSpace(nesting_state, is_forward_declaration): + """Checks that the new block is directly in a namespace. + + Args: + nesting_state: The _NestingState object that contains info about our state. + is_forward_declaration: If the class is a forward declared class. + Returns: + Whether or not the new block is directly in a namespace. 
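+
+  Example (illustrative): after
+      namespace foo {
+      class Bar;
+  the forward declaration of Bar is directly inside namespace foo.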
+ """ + if is_forward_declaration: + if len(nesting_state.stack) >= 1 and ( + isinstance(nesting_state.stack[-1], _NamespaceInfo)): + return True + else: + return False + + return (len(nesting_state.stack) > 1 and + nesting_state.stack[-1].check_namespace_indentation and + isinstance(nesting_state.stack[-2], _NamespaceInfo)) + + +def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item, + raw_lines_no_comments, linenum): + """This method determines if we should apply our namespace indentation check. + + Args: + nesting_state: The current nesting state. + is_namespace_indent_item: If we just put a new class on the stack, True. + If the top of the stack is not a class, or we did not recently + add the class, False. + raw_lines_no_comments: The lines without the comments. + linenum: The current line number we are processing. + + Returns: + True if we should apply our namespace indentation check. Currently, it + only works for classes and namespaces inside of a namespace. + """ + + is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments, + linenum) + + if not (is_namespace_indent_item or is_forward_declaration): + return False + + # If we are in a macro, we do not want to check the namespace indentation. + if IsMacroDefinition(raw_lines_no_comments, linenum): + return False + + return IsBlockInNameSpace(nesting_state, is_forward_declaration) + + +# Call this method if the line is directly inside of a namespace. +# If the line above is blank (excluding comments) or the start of +# an inner namespace, it cannot be indented. +def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum, + error): + line = raw_lines_no_comments[linenum] + if Match(r'^\s+', line): + error(filename, linenum, 'runtime/indentation_namespace', 4, + 'Do not indent within a namespace') + + +def ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions=[]): + """Processes a single line in the file. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + clean_lines: An array of strings, each representing a line of the file, + with comments stripped. + line: Number of line being processed. + include_state: An _IncludeState instance in which the headers are inserted. + function_state: A _FunctionState instance which counts function lines, etc. + nesting_state: A NestingState instance which maintains information about + the current stack of nested blocks being parsed. + error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. 
Each function takes 4 + arguments: filename, clean_lines, line, error + """ + raw_lines = clean_lines.raw_lines + ParseNolintSuppressions(filename, raw_lines[line], line, error) + nesting_state.Update(filename, clean_lines, line, error) + CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line, + error) + if nesting_state.InAsmBlock(): return + CheckForFunctionLengths(filename, clean_lines, line, function_state, error) + CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error) + CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error) + CheckLanguage(filename, clean_lines, line, file_extension, include_state, + nesting_state, error) + CheckForNonConstReference(filename, clean_lines, line, nesting_state, error) + CheckForNonStandardConstructs(filename, clean_lines, line, + nesting_state, error) + CheckVlogArguments(filename, clean_lines, line, error) + CheckPosixThreading(filename, clean_lines, line, error) + CheckInvalidIncrement(filename, clean_lines, line, error) + CheckMakePairUsesDeduction(filename, clean_lines, line, error) + CheckDefaultLambdaCaptures(filename, clean_lines, line, error) + CheckRedundantVirtual(filename, clean_lines, line, error) + CheckRedundantOverrideOrFinal(filename, clean_lines, line, error) + for check_fn in extra_check_functions: + check_fn(filename, clean_lines, line, error) + +def FlagCxx11Features(filename, clean_lines, linenum, error): + """Flag those c++11 features that we only allow in certain places. + + Args: + filename: The name of the current file. + clean_lines: A CleansedLines instance containing the file. + linenum: The number of the line to check. + error: The function to call with any errors found. + """ + line = clean_lines.elided[linenum] + + # Flag unapproved C++11 headers. + include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line) + if include and include.group(1) in ('cfenv', + 'condition_variable', + 'fenv.h', + 'future', + 'mutex', + 'thread', + 'chrono', + 'ratio', + 'regex', + 'system_error', + ): + error(filename, linenum, 'build/c++11', 5, + ('<%s> is an unapproved C++11 header.') % include.group(1)) + + # The only place where we need to worry about C++11 keywords and library + # features in preprocessor directives is in macro definitions. + if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return + + # These are classes and free functions. The classes are always + # mentioned as std::*, but we only catch the free functions if + # they're not found by ADL. They're alphabetical by header. + for top_name in ( + # type_traits + 'alignment_of', + 'aligned_union', + ): + if Search(r'\bstd::%s\b' % top_name, line): + error(filename, linenum, 'build/c++11', 5, + ('std::%s is an unapproved C++11 class or function. Send c-style ' + 'an example of where it would make your code more readable, and ' + 'they may let you use it.') % top_name) + + +def ProcessFileData(filename, file_extension, lines, error, + extra_check_functions=[]): + """Performs lint checks and reports any errors to the given error function. + + Args: + filename: Filename of the file that is being processed. + file_extension: The extension (dot not included) of the file. + lines: An array of strings, each representing a line of the file, with the + last element being empty if the file is terminated with a newline. 
+ error: A callable to which errors are reported, which takes 4 arguments: + filename, line number, error level, and message + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + lines = (['// marker so line numbers and indices both start at 1'] + lines + + ['// marker so line numbers end in a known way']) + + include_state = _IncludeState() + function_state = _FunctionState() + nesting_state = NestingState() + + ResetNolintSuppressions() + + CheckForCopyright(filename, lines, error) + + RemoveMultiLineComments(filename, lines, error) + clean_lines = CleansedLines(lines) + + if file_extension == 'h': + CheckForHeaderGuard(filename, clean_lines, error) + + for line in xrange(clean_lines.NumLines()): + ProcessLine(filename, file_extension, clean_lines, line, + include_state, function_state, nesting_state, error, + extra_check_functions) + FlagCxx11Features(filename, clean_lines, line, error) + nesting_state.CheckCompletedBlocks(filename, error) + + CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error) + + # Check that the .cc file has included its header if it exists. + if file_extension == 'cc': + CheckHeaderFileIncluded(filename, include_state, error) + + # We check here rather than inside ProcessLine so that we see raw + # lines rather than "cleaned" lines. + CheckForBadCharacters(filename, lines, error) + + CheckForNewlineAtEOF(filename, lines, error) + +def ProcessConfigOverrides(filename): + """ Loads the configuration files and processes the config overrides. + + Args: + filename: The name of the file being processed by the linter. + + Returns: + False if the current |filename| should not be processed further. + """ + + abs_filename = os.path.abspath(filename) + cfg_filters = [] + keep_looking = True + while keep_looking: + abs_path, base_name = os.path.split(abs_filename) + if not base_name: + break # Reached the root directory. + + cfg_file = os.path.join(abs_path, "CPPLINT.cfg") + abs_filename = abs_path + if not os.path.isfile(cfg_file): + continue + + try: + with open(cfg_file) as file_handle: + for line in file_handle: + line, _, _ = line.partition('#') # Remove comments. + if not line.strip(): + continue + + name, _, val = line.partition('=') + name = name.strip() + val = val.strip() + if name == 'set noparent': + keep_looking = False + elif name == 'filter': + cfg_filters.append(val) + elif name == 'exclude_files': + # When matching exclude_files pattern, use the base_name of + # the current file name or the directory name we are processing. + # For example, if we are checking for lint errors in /foo/bar/baz.cc + # and we found the .cfg file at /foo/CPPLINT.cfg, then the config + # file's "exclude_files" filter is meant to be checked against "bar" + # and not "baz" nor "bar/baz.cc". + if base_name: + pattern = re.compile(val) + if pattern.match(base_name): + sys.stderr.write('Ignoring "%s": file excluded by "%s". 
' + 'File path component "%s" matches ' + 'pattern "%s"\n' % + (filename, cfg_file, base_name, val)) + return False + elif name == 'linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + sys.stderr.write('Line length must be numeric.') + else: + sys.stderr.write( + 'Invalid configuration option (%s) in file %s\n' % + (name, cfg_file)) + + except IOError: + sys.stderr.write( + "Skipping config file '%s': Can't open for reading\n" % cfg_file) + keep_looking = False + + # Apply all the accumulated filters in reverse order (top-level directory + # config options having the least priority). + for filter in reversed(cfg_filters): + _AddFilters(filter) + + return True + + +def ProcessFile(filename, vlevel, extra_check_functions=[]): + """Does google-lint on a single file. + + Args: + filename: The name of the file to parse. + + vlevel: The level of errors to report. Every error of confidence + >= verbose_level will be reported. 0 is a good default. + + extra_check_functions: An array of additional check functions that will be + run on each source line. Each function takes 4 + arguments: filename, clean_lines, line, error + """ + + _SetVerboseLevel(vlevel) + _BackupFilters() + + if not ProcessConfigOverrides(filename): + _RestoreFilters() + return + + lf_lines = [] + crlf_lines = [] + try: + # Support the UNIX convention of using "-" for stdin. Note that + # we are not opening the file with universal newline support + # (which codecs doesn't support anyway), so the resulting lines do + # contain trailing '\r' characters if we are reading a file that + # has CRLF endings. + # If after the split a trailing '\r' is present, it is removed + # below. + if filename == '-': + lines = codecs.StreamReaderWriter(sys.stdin, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace').read().split('\n') + else: + lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n') + + # Remove trailing '\r'. + # The -1 accounts for the extra trailing blank line we get from split() + for linenum in range(len(lines) - 1): + if lines[linenum].endswith('\r'): + lines[linenum] = lines[linenum].rstrip('\r') + crlf_lines.append(linenum + 1) + else: + lf_lines.append(linenum + 1) + + except IOError: + sys.stderr.write( + "Skipping input '%s': Can't open for reading\n" % filename) + _RestoreFilters() + return + + # Note, if no dot is found, this will give the entire filename as the ext. + file_extension = filename[filename.rfind('.') + 1:] + + # When reading from stdin, the extension is unknown, so no cpplint tests + # should rely on the extension. + if filename != '-' and file_extension not in _valid_extensions: + sys.stderr.write('Ignoring %s; not a valid file name ' + '(%s)\n' % (filename, ', '.join(_valid_extensions))) + else: + ProcessFileData(filename, file_extension, lines, Error, + extra_check_functions) + + # If end-of-line sequences are a mix of LF and CR-LF, issue + # warnings on the lines with CR. + # + # Don't issue any warnings if all lines are uniformly LF or CR-LF, + # since critique can handle these just fine, and the style guide + # doesn't dictate a particular end of line sequence. + # + # We can't depend on os.linesep to determine what the desired + # end-of-line sequence should be, since that will return the + # server-side end-of-line sequence. + if lf_lines and crlf_lines: + # Warn on every line with CR. 
An alternative approach might be to + # check whether the file is mostly CRLF or just LF, and warn on the + # minority, we bias toward LF here since most tools prefer LF. + for linenum in crlf_lines: + Error(filename, linenum, 'whitespace/newline', 1, + 'Unexpected \\r (^M) found; better to use only \\n') + + sys.stderr.write('Done processing %s\n' % filename) + _RestoreFilters() + + +def PrintUsage(message): + """Prints a brief usage string and exits, optionally with an error message. + + Args: + message: The optional error message. + """ + sys.stderr.write(_USAGE) + if message: + sys.exit('\nFATAL ERROR: ' + message) + else: + sys.exit(1) + + +def PrintCategories(): + """Prints a list of all the error-categories used by error messages. + + These are the categories used to filter messages via --filter. + """ + sys.stderr.write(''.join(' %s\n' % cat for cat in _ERROR_CATEGORIES)) + sys.exit(0) + + +def ParseArguments(args): + """Parses the command line arguments. + + This may set the output format and verbosity level as side-effects. + + Args: + args: The command line arguments: + + Returns: + The list of filenames to lint. + """ + try: + (opts, filenames) = getopt.getopt(args, '', ['help', 'output=', 'verbose=', + 'counting=', + 'filter=', + 'root=', + 'linelength=', + 'extensions=']) + except getopt.GetoptError: + PrintUsage('Invalid arguments.') + + verbosity = _VerboseLevel() + output_format = _OutputFormat() + filters = '' + counting_style = '' + + for (opt, val) in opts: + if opt == '--help': + PrintUsage(None) + elif opt == '--output': + if val not in ('emacs', 'vs7', 'eclipse'): + PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.') + output_format = val + elif opt == '--verbose': + verbosity = int(val) + elif opt == '--filter': + filters = val + if not filters: + PrintCategories() + elif opt == '--counting': + if val not in ('total', 'toplevel', 'detailed'): + PrintUsage('Valid counting options are total, toplevel, and detailed') + counting_style = val + elif opt == '--root': + global _root + _root = val + elif opt == '--linelength': + global _line_length + try: + _line_length = int(val) + except ValueError: + PrintUsage('Line length must be digits.') + elif opt == '--extensions': + global _valid_extensions + try: + _valid_extensions = set(val.split(',')) + except ValueError: + PrintUsage('Extensions must be comma seperated list.') + + if not filenames: + PrintUsage('No files were specified.') + + _SetOutputFormat(output_format) + _SetVerboseLevel(verbosity) + _SetFilters(filters) + _SetCountingStyle(counting_style) + + return filenames + + +def main(): + filenames = ParseArguments(sys.argv[1:]) + + # Change stderr to write with replacement characters so we don't die + # if we try to print something containing non-ASCII characters. + sys.stderr = codecs.StreamReaderWriter(sys.stderr, + codecs.getreader('utf8'), + codecs.getwriter('utf8'), + 'replace') + + _cpplint_state.ResetErrorCounts() + for filename in filenames: + ProcessFile(filename, _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() + + sys.exit(_cpplint_state.error_count > 0) + + +if __name__ == '__main__': + main() diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh new file mode 100755 index 00000000000..b2039134d55 --- /dev/null +++ b/cpp/build-support/run-test.sh @@ -0,0 +1,195 @@ +#!/bin/bash +# Copyright 2014 Cloudera, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Script which wraps running a test and redirects its output to a +# test log directory. +# +# If KUDU_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be +# gzip-compressed while they are written. +# +# If KUDU_FLAKY_TEST_ATTEMPTS is non-zero, and the test being run matches +# one of the lines in the file KUDU_FLAKY_TEST_LIST, then the test will +# be retried on failure up to the specified number of times. This can be +# used in the gerrit workflow to prevent annoying false -1s caused by +# tests that are known to be flaky in master. +# +# If KUDU_REPORT_TEST_RESULTS is non-zero, then tests are reported to the +# central test server. + +ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd) + +TEST_LOGDIR=$ROOT/build/test-logs +mkdir -p $TEST_LOGDIR + +TEST_DEBUGDIR=$ROOT/build/test-debug +mkdir -p $TEST_DEBUGDIR + +TEST_DIRNAME=$(cd $(dirname $1); pwd) +TEST_FILENAME=$(basename $1) +shift +TEST_EXECUTABLE="$TEST_DIRNAME/$TEST_FILENAME" +TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if any). + +# We run each test in its own subdir to avoid core file related races. +TEST_WORKDIR=$ROOT/build/test-work/$TEST_NAME +mkdir -p $TEST_WORKDIR +pushd $TEST_WORKDIR >/dev/null || exit 1 +rm -f * + +set -o pipefail + +LOGFILE=$TEST_LOGDIR/$TEST_NAME.txt +XMLFILE=$TEST_LOGDIR/$TEST_NAME.xml + +TEST_EXECUTION_ATTEMPTS=1 + +# Remove both the uncompressed output, so the developer doesn't accidentally get confused +# and read output from a prior test run. +rm -f $LOGFILE $LOGFILE.gz + +pipe_cmd=cat + +# Configure TSAN (ignored if this isn't a TSAN build). +# +# Deadlock detection (new in clang 3.5) is disabled because: +# 1. The clang 3.5 deadlock detector crashes in some unit tests. It +# needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others. +# 2. Many unit tests report lock-order-inversion warnings; they should be +# fixed before reenabling the detector. +TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0" +TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt" +TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" +export TSAN_OPTIONS + +# Enable leak detection even under LLVM 3.4, where it was disabled by default. +# This flag only takes effect when running an ASAN build. +ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" +export ASAN_OPTIONS + +# Set up suppressions for LeakSanitizer +LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" +export LSAN_OPTIONS + +# Suppressions require symbolization. We'll default to using the symbolizer in +# thirdparty. +if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then + export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer) +fi + +# Allow for collecting core dumps. +ARROW_TEST_ULIMIT_CORE=${ARROW_TEST_ULIMIT_CORE:-0} +ulimit -c $ARROW_TEST_ULIMIT_CORE + +# Run the actual test. 
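+# (TEST_EXECUTION_ATTEMPTS is 1 above, so the retry loop below runs the test
+# exactly once; a flaky-test workflow could raise it to, say, 3 attempts.)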
+for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
+  if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
+    # If the test fails, the test output may or may not be left behind,
+    # depending on whether the test cleaned up or exited immediately. Either
+    # way we need to clean it up. We do this by comparing the data directory
+    # contents before and after the test runs, and deleting anything new.
+    #
+    # The comm program requires that its two inputs be sorted.
+    TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)
+  fi
+
+  # gtest won't overwrite old junit test files, resulting in a build failure
+  # even when retries are successful.
+  rm -f $XMLFILE
+
+  echo "Running $TEST_NAME, redirecting output into $LOGFILE" \
+    "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)"
+  $TEST_EXECUTABLE "$@" 2>&1 \
+    | $ROOT/build-support/asan_symbolize.py \
+    | c++filt \
+    | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \
+    | $pipe_cmd > $LOGFILE
+  STATUS=$?
+
+  # TSAN doesn't always exit with a non-zero exit code due to a bug:
+  # mutex errors don't get reported through the normal error reporting infrastructure.
+  # So we make sure to detect this and exit 1.
+  #
+  # Additionally, certain types of failures won't show up in the standard JUnit
+  # XML output from gtest. We assume that gtest knows better than us and our
+  # regexes in most cases, but for certain errors we delete the resulting xml
+  # file and let our own post-processing step regenerate it.
+  export GREP=$(which egrep)
+  if zgrep --silent "ThreadSanitizer|Leak check.*detected leaks" $LOGFILE ; then
+    echo ThreadSanitizer or leak check failures in $LOGFILE
+    STATUS=1
+    rm -f $XMLFILE
+  fi
+
+  if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
+    # Now delete any new test output.
+    TEST_TMPDIR_AFTER=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)
+    DIFF=$(comm -13 <(echo "$TEST_TMPDIR_BEFORE") \
+                    <(echo "$TEST_TMPDIR_AFTER"))
+    for DIR in $DIFF; do
+      # Multiple tests may be running concurrently. To avoid deleting the
+      # wrong directories, constrain to only directories beginning with the
+      # test name.
+      #
+      # This may delete old test directories belonging to this test, but
+      # that's not typically a concern when rerunning flaky tests.
+      if [[ $DIR =~ ^$TEST_TMPDIR/$TEST_NAME ]]; then
+        echo Deleting leftover flaky test directory "$DIR"
+        rm -Rf "$DIR"
+      fi
+    done
+  fi
+
+  if [ "$STATUS" -eq "0" ]; then
+    break
+  elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then
+    echo Test failed attempt number $ATTEMPT_NUMBER
+    echo Will retry...
+  fi
+done
+
+# If we have a LeakSanitizer report, and XML reporting is configured, add a new test
+# case result to the XML file for the leak report. Otherwise Jenkins won't show
+# us which tests had LSAN errors.
+if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then
+  echo Test had memory leaks. Editing XML
+  perl -p -i -e '
+  if (m#</testsuite>#) {
+    print "<testcase name=\"LeakSanitizer\" status=\"run\" classname=\"LSAN\">\n";
+    print "  <failure message=\"LeakSanitizer failed\">\n";
+    print "    See txt log file for details\n";
+    print "  </failure>\n";
+    print "</testcase>\n";
+  }' $XMLFILE
+fi
+
+# Capture and compress core file and binary.
+COREFILES=$(ls | grep ^core)
+if [ -n "$COREFILES" ]; then
+  echo Found core dump. Saving executable and core files.
+  gzip < $TEST_EXECUTABLE > "$TEST_DEBUGDIR/$TEST_NAME.gz" || exit $?
+  for COREFILE in $COREFILES; do
+    gzip < $COREFILE > "$TEST_DEBUGDIR/$TEST_NAME.$COREFILE.gz" || exit $?
+  done
+  # Pull in any .so files as well.
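+  # (ldd emits lines like "libfoo.so => /path/to/libfoo.so (0x...)"; awk
+  # field 3 is the resolved path, and grep $ROOT keeps only libraries that
+  # were built inside this source tree.)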
+ for LIB in $(ldd $TEST_EXECUTABLE | grep $ROOT | awk '{print $3}'); do + LIB_NAME=$(basename $LIB) + gzip < $LIB > "$TEST_DEBUGDIR/$LIB_NAME.gz" || exit $? + done +fi + +popd +rm -Rf $TEST_WORKDIR + +exit $STATUS diff --git a/cpp/build-support/stacktrace_addr2line.pl b/cpp/build-support/stacktrace_addr2line.pl new file mode 100755 index 00000000000..7664bab5af6 --- /dev/null +++ b/cpp/build-support/stacktrace_addr2line.pl @@ -0,0 +1,92 @@ +#!/usr/bin/perl +# Copyright 2014 Cloudera, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +####################################################################### +# This script will convert a stack trace with addresses: +# @ 0x5fb015 kudu::master::Master::Init() +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() +# @ 0x5c31fa kudu::master::MiniMaster::Start() +# @ 0x58270a kudu::MiniCluster::Start() +# @ 0x57dc71 kudu::CreateTableStressTest::SetUp() +# To one with line numbers: +# @ 0x5fb015 kudu::master::Master::Init() at /home/mpercy/src/kudu/src/master/master.cc:54 +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() at /home/mpercy/src/kudu/src/master/mini_master.cc:52 +# @ 0x5c31fa kudu::master::MiniMaster::Start() at /home/mpercy/src/kudu/src/master/mini_master.cc:33 +# @ 0x58270a kudu::MiniCluster::Start() at /home/mpercy/src/kudu/src/integration-tests/mini_cluster.cc:48 +# @ 0x57dc71 kudu::CreateTableStressTest::SetUp() at /home/mpercy/src/kudu/src/integration-tests/create-table-stress-test.cc:61 +# +# If the script detects that the output is not symbolized, it will also attempt +# to determine the function names, i.e. it will convert: +# @ 0x5fb015 +# @ 0x5c2d38 +# @ 0x5c31fa +# To: +# @ 0x5fb015 kudu::master::Master::Init() at /home/mpercy/src/kudu/src/master/master.cc:54 +# @ 0x5c2d38 kudu::master::MiniMaster::StartOnPorts() at /home/mpercy/src/kudu/src/master/mini_master.cc:52 +# @ 0x5c31fa kudu::master::MiniMaster::Start() at /home/mpercy/src/kudu/src/master/mini_master.cc:33 +####################################################################### +use strict; +use warnings; + +if (!@ARGV) { + die < is magical in Perl. +while (defined(my $input = )) { + if ($input =~ /^\s+\@\s+(0x[[:xdigit:]]{6,})(?:\s+(\S+))?/) { + my $addr = $1; + my $lookup_func_name = (!defined $2); + if (!exists($addr2line_map{$addr})) { + $addr2line_map{$addr} = `addr2line -ifC -e $binary $addr`; + } + chomp $input; + $input .= parse_addr2line_output($addr2line_map{$addr}, $lookup_func_name) . "\n"; + } + print $input; +} + +exit 0; diff --git a/cpp/cmake_modules/CompilerInfo.cmake b/cpp/cmake_modules/CompilerInfo.cmake new file mode 100644 index 00000000000..07860682f9b --- /dev/null +++ b/cpp/cmake_modules/CompilerInfo.cmake @@ -0,0 +1,46 @@ +# Copyright 2013 Cloudera, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Sets COMPILER_FAMILY to 'clang' or 'gcc'
+# Sets COMPILER_VERSION to the version
+execute_process(COMMAND "${CMAKE_CXX_COMPILER}" -v
+                ERROR_VARIABLE COMPILER_VERSION_FULL)
+message(INFO " ${COMPILER_VERSION_FULL}")
+
+# clang on Linux and Mac OS X before 10.9
+if("${COMPILER_VERSION_FULL}" MATCHES ".*clang version.*")
+  set(COMPILER_FAMILY "clang")
+  string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1"
+    COMPILER_VERSION "${COMPILER_VERSION_FULL}")
+# clang on Mac OS X 10.9 and later
+elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*")
+  set(COMPILER_FAMILY "clang")
+  string(REGEX REPLACE ".*based on LLVM ([0-9]+\\.[0-9]+).*" "\\1"
+    COMPILER_VERSION "${COMPILER_VERSION_FULL}")
+
+# clang on Mac OS X, XCode 7. No version replacement is done
+# because Apple no longer advertises the upstream LLVM version.
+elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-700\\..*")
+  set(COMPILER_FAMILY "clang")
+
+# gcc
+elseif("${COMPILER_VERSION_FULL}" MATCHES ".*gcc version.*")
+  set(COMPILER_FAMILY "gcc")
+  string(REGEX REPLACE ".*gcc version ([0-9\\.]+).*" "\\1"
+    COMPILER_VERSION "${COMPILER_VERSION_FULL}")
+else()
+  message(FATAL_ERROR "Unknown compiler. Version info:\n${COMPILER_VERSION_FULL}")
+endif()
+message("Selected compiler ${COMPILER_FAMILY} ${COMPILER_VERSION}")
+
diff --git a/cpp/cmake_modules/FindGPerf.cmake b/cpp/cmake_modules/FindGPerf.cmake
new file mode 100644
index 00000000000..e8310799c36
--- /dev/null
+++ b/cpp/cmake_modules/FindGPerf.cmake
@@ -0,0 +1,69 @@
+# -*- cmake -*-
+
+# - Find Google perftools
+# Find the Google perftools includes and libraries
+# This module defines
+# GOOGLE_PERFTOOLS_INCLUDE_DIR, where to find heap-profiler.h, etc.
+# GOOGLE_PERFTOOLS_FOUND, If false, do not try to use Google perftools.
+# also defined for general use are
+# TCMALLOC_LIBS, where to find the tcmalloc libraries.
+# TCMALLOC_STATIC_LIB, path to libtcmalloc.a.
+# TCMALLOC_SHARED_LIB, path to libtcmalloc's shared library
+# PROFILER_LIBS, where to find the profiler libraries.
+# PROFILER_STATIC_LIB, path to libprofiler.a.
+# PROFILER_SHARED_LIB, path to libprofiler's shared library + +FIND_PATH(GOOGLE_PERFTOOLS_INCLUDE_DIR google/heap-profiler.h + $ENV{NATIVE_TOOLCHAIN}/gperftools-$ENV{GPERFTOOLS_VERSION}/include + NO_DEFAULT_PATH +) + +SET(GPERF_LIB_SEARCH $ENV{NATIVE_TOOLCHAIN}/gperftools-$ENV{GPERFTOOLS_VERSION}/lib) + +FIND_LIBRARY(TCMALLOC_LIB_PATH + NAMES libtcmalloc.a + PATHS ${GPERF_LIB_SEARCH} + NO_DEFAULT_PATH +) + +IF (TCMALLOC_LIB_PATH AND GOOGLE_PERFTOOLS_INCLUDE_DIR) + SET(TCMALLOC_LIBS ${GPERF_LIB_SEARCH}) + SET(TCMALLOC_LIB_NAME libtcmalloc) + SET(TCMALLOC_STATIC_LIB ${GPERF_LIB_SEARCH}/${TCMALLOC_LIB_NAME}.a) + SET(TCMALLOC_SHARED_LIB ${TCMALLOC_LIBS}/${TCMALLOC_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + SET(GOOGLE_PERFTOOLS_FOUND "YES") +ELSE (TCMALLOC_LIB_PATH AND GOOGLE_PERFTOOLS_INCLUDE_DIR) + SET(GOOGLE_PERFTOOLS_FOUND "NO") +ENDIF (TCMALLOC_LIB_PATH AND GOOGLE_PERFTOOLS_INCLUDE_DIR) + +FIND_LIBRARY(PROFILER_LIB_PATH + NAMES libprofiler.a + PATHS ${GPERF_LIB_SEARCH} +) + +IF (PROFILER_LIB_PATH AND GOOGLE_PERFTOOLS_INCLUDE_DIR) + SET(PROFILER_LIBS ${GPERF_LIB_SEARCH}) + SET(PROFILER_LIB_NAME libprofiler) + SET(PROFILER_STATIC_LIB ${GPERF_LIB_SEARCH}/${PROFILER_LIB_NAME}.a) + SET(PROFILER_SHARED_LIB ${PROFILER_LIBS}/${PROFILER_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +ENDIF (PROFILER_LIB_PATH AND GOOGLE_PERFTOOLS_INCLUDE_DIR) + +IF (GOOGLE_PERFTOOLS_FOUND) + IF (NOT GPerf_FIND_QUIETLY) + MESSAGE(STATUS "Found the Google perftools library: ${TCMALLOC_LIBS}") + ENDIF (NOT GPerf_FIND_QUIETLY) +ELSE (GOOGLE_PERFTOOLS_FOUND) + IF (GPerf_FIND_REQUIRED) + MESSAGE(FATAL_ERROR "Could not find the Google perftools library") + ENDIF (GPerf_FIND_REQUIRED) +ENDIF (GOOGLE_PERFTOOLS_FOUND) + +MARK_AS_ADVANCED( + TCMALLOC_LIBS + TCMALLOC_STATIC_LIB + TCMALLOC_SHARED_LIB + PROFILER_LIBS + PROFILER_STATIC_LIB + PROFILER_SHARED_LIB + GOOGLE_PERFTOOLS_INCLUDE_DIR +) diff --git a/cpp/cmake_modules/FindGTest.cmake b/cpp/cmake_modules/FindGTest.cmake new file mode 100644 index 00000000000..e47faf0dd89 --- /dev/null +++ b/cpp/cmake_modules/FindGTest.cmake @@ -0,0 +1,91 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find GTest headers and libraries. +# +# Usage of this module as follows: +# +# find_package(GTest) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# GTest_HOME - When set, this path is inspected instead of standard library +# locations as the root of the GTest installation. +# The environment variable GTEST_HOME overrides this veriable. 
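+#
+# Example (hypothetical layout):
+#   GTEST_HOME=/opt/gtest cmake ..
+# expects /opt/gtest/include/gtest/gtest.h and /opt/gtest/lib/libgtest.a.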
+# +# This module defines +# GTEST_INCLUDE_DIR, directory containing headers +# GTEST_LIBS, directory containing gtest libraries +# GTEST_STATIC_LIB, path to libgtest.a +# GTEST_SHARED_LIB, path to libgtest's shared library +# GTEST_FOUND, whether gtest has been found + +if( NOT "$ENV{GTEST_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{GTEST_HOME}" _native_path ) + list( APPEND _gtest_roots ${_native_path} ) +elseif ( GTest_HOME ) + list( APPEND _gtest_roots ${GTest_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _gtest_roots ) + find_path( GTEST_INCLUDE_DIR NAMES gtest/gtest.h + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( GTEST_LIBRARIES NAMES gtest + PATHS ${_gtest_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( GTEST_INCLUDE_DIR NAMES gtest/gtest.h ) + find_library( GTEST_LIBRARIES NAMES gtest ) +endif () + + +if (GTEST_INCLUDE_DIR AND GTEST_LIBRARIES) + set(GTEST_FOUND TRUE) + get_filename_component( GTEST_LIBS ${GTEST_LIBRARIES} DIRECTORY ) + set(GTEST_LIB_NAME libgtest) + set(GTEST_STATIC_LIB ${GTEST_LIBS}/${GTEST_LIB_NAME}.a) + set(GTEST_SHARED_LIB ${GTEST_LIBS}/${GTEST_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(GTEST_FOUND FALSE) +endif () + +if (GTEST_FOUND) + if (NOT GTest_FIND_QUIETLY) + message(STATUS "Found the GTest library: ${GTEST_LIBRARIES}") + endif () +else () + if (NOT GTest_FIND_QUIETLY) + set(GTEST_ERR_MSG "Could not find the GTest library. Looked in ") + if ( _gtest_roots ) + set(GTEST_ERR_MSG "${GTEST_ERR_MSG} in ${_gtest_roots}.") + else () + set(GTEST_ERR_MSG "${GTEST_ERR_MSG} system search paths.") + endif () + if (GTest_FIND_REQUIRED) + message(FATAL_ERROR "${GTEST_ERR_MSG}") + else (GTest_FIND_REQUIRED) + message(STATUS "${GTEST_ERR_MSG}") + endif (GTest_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + GTEST_INCLUDE_DIR + GTEST_LIBS + GTEST_LIBRARIES + GTEST_STATIC_LIB + GTEST_SHARED_LIB +) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake new file mode 100644 index 00000000000..76c2d1dbee9 --- /dev/null +++ b/cpp/cmake_modules/FindParquet.cmake @@ -0,0 +1,80 @@ +# Copyright 2012 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
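+
+# Usage sketch (hypothetical install prefix): set PARQUET_HOME=/opt/parquet,
+# or Parquet_HOME in CMake, so that include/parquet/parquet.h and
+# lib/libparquet.a are found under that prefix.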
+ +# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) +# This module defines +# PARQUET_INCLUDE_DIR, directory containing headers +# PARQUET_LIBS, directory containing parquet libraries +# PARQUET_STATIC_LIB, path to libparquet.a +# PARQUET_SHARED_LIB, path to libparquet's shared library +# PARQUET_FOUND, whether parquet has been found + +if( NOT "$ENV{PARQUET_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{PARQUET_HOME}" _native_path ) + list( APPEND _parquet_roots ${_native_path} ) +elseif ( Parquet_HOME ) + list( APPEND _parquet_roots ${Parquet_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _parquet_roots ) + find_path( PARQUET_INCLUDE_DIR NAMES parquet/parquet.h + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( PARQUET_LIBRARIES NAMES parquet + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( PARQUET_INCLUDE_DIR NAMES parquet/parquet.h ) + find_library( PARQUET_LIBRARIES NAMES parquet ) +endif () + + +if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) + set(PARQUET_FOUND TRUE) + get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} DIRECTORY ) + set(PARQUET_LIB_NAME libparquet) + set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) + set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(PARQUET_FOUND FALSE) +endif () + +if (PARQUET_FOUND) + if (NOT Parquet_FIND_QUIETLY) + message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") + endif () +else () + if (NOT Parquet_FIND_QUIETLY) + set(PARQUET_ERR_MSG "Could not find the Parquet library. Looked in ") + if ( _parquet_roots ) + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.") + else () + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.") + endif () + if (Parquet_FIND_REQUIRED) + message(FATAL_ERROR "${PARQUET_ERR_MSG}") + else (Parquet_FIND_REQUIRED) + message(STATUS "${PARQUET_ERR_MSG}") + endif (Parquet_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + PARQUET_INCLUDE_DIR + PARQUET_LIBS + PARQUET_LIBRARIES + PARQUET_STATIC_LIB + PARQUET_SHARED_LIB +) diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake new file mode 100644 index 00000000000..b847c96657a --- /dev/null +++ b/cpp/cmake_modules/san-config.cmake @@ -0,0 +1,92 @@ +# Clang does not support using ASAN and TSAN simultaneously. +if ("${ARROW_USE_ASAN}" AND "${ARROW_USE_TSAN}") + message(SEND_ERROR "Can only enable one of ASAN or TSAN at a time") +endif() + +# Flag to enable clang address sanitizer +# This will only build if clang or a recent enough gcc is the chosen compiler +if (${ARROW_USE_ASAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) + message(SEND_ERROR "Cannot use ASAN without clang or gcc >= 4.8") + endif() + + # If UBSAN is also enabled, and we're on clang < 3.5, ensure static linking is + # enabled. 
Otherwise, we run into https://llvm.org/bugs/show_bug.cgi?id=18211 + if("${ARROW_USE_UBSAN}" AND + "${COMPILER_FAMILY}" STREQUAL "clang" AND + "${COMPILER_VERSION}" VERSION_LESS "3.5") + if("${ARROW_LINK}" STREQUAL "a") + message("Using static linking for ASAN+UBSAN build") + set(ARROW_LINK "s") + elseif("${ARROW_LINK}" STREQUAL "d") + message(SEND_ERROR "Cannot use dynamic linking when ASAN and UBSAN are both enabled") + endif() + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -DADDRESS_SANITIZER") +endif() + + +# Flag to enable clang undefined behavior sanitizer +# We explicitly don't enable all of the sanitizer flags: +# - disable 'vptr' because it currently crashes somewhere in boost::intrusive::list code +# - disable 'alignment' because unaligned access is really OK on Nehalem and we do it +# all over the place. +if (${ARROW_USE_UBSAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.9"))) + message(SEND_ERROR "Cannot use UBSAN without clang or gcc >= 4.9") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined -fno-sanitize=alignment,vptr -fno-sanitize-recover") +endif () + +# Flag to enable thread sanitizer (clang or gcc 4.8) +if (${ARROW_USE_TSAN}) + if(NOT (("${COMPILER_FAMILY}" STREQUAL "clang") OR + ("${COMPILER_FAMILY}" STREQUAL "gcc" AND "${COMPILER_VERSION}" VERSION_GREATER "4.8"))) + message(SEND_ERROR "Cannot use TSAN without clang or gcc >= 4.8") + endif() + + add_definitions("-fsanitize=thread") + + # Enables dynamic_annotations.h to actually generate code + add_definitions("-DDYNAMIC_ANNOTATIONS_ENABLED") + + # changes atomicops to use the tsan implementations + add_definitions("-DTHREAD_SANITIZER") + + # Disables using the precompiled template specializations for std::string, shared_ptr, etc + # so that the annotations in the header actually take effect. + add_definitions("-D_GLIBCXX_EXTERN_TEMPLATE=0") + + # Some of the above also need to be passed to the linker. + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pie -fsanitize=thread") + + # Strictly speaking, TSAN doesn't require dynamic linking. But it does + # require all code to be position independent, and the easiest way to + # guarantee that is via dynamic linking (not all 3rd party archives are + # compiled with -fPIC e.g. boost). + if("${ARROW_LINK}" STREQUAL "a") + message("Using dynamic linking for TSAN") + set(ARROW_LINK "d") + elseif("${ARROW_LINK}" STREQUAL "s") + message(SEND_ERROR "Cannot use TSAN with static linking") + endif() +endif() + + +if ("${ARROW_USE_UBSAN}" OR "${ARROW_USE_ASAN}" OR "${ARROW_USE_TSAN}") + # GCC 4.8 and 4.9 (latest as of this writing) don't allow you to specify a + # sanitizer blacklist. + if("${COMPILER_FAMILY}" STREQUAL "clang") + # Require clang 3.4 or newer; clang 3.3 has issues with TSAN and pthread + # symbol interception. + if("${COMPILER_VERSION}" VERSION_LESS "3.4") + message(SEND_ERROR "Must use clang 3.4 or newer to run a sanitizer build." + " Try using clang from $NATIVE_TOOLCHAIN/") + endif() + add_definitions("-fsanitize-blacklist=${BUILD_SUPPORT_DIR}/sanitize-blacklist.txt") + else() + message(WARNING "GCC does not support specifying a sanitizer blacklist. 
Known sanitizer check failures will not be suppressed.") + endif() +endif() diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh new file mode 100755 index 00000000000..457b9717ebe --- /dev/null +++ b/cpp/setup_build_env.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -e + +SOURCE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) + +./thirdparty/download_thirdparty.sh +./thirdparty/build_thirdparty.sh + +export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR + +echo "Build env initialized" diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt new file mode 100644 index 00000000000..eeea2dbc517 --- /dev/null +++ b/cpp/src/arrow/CMakeLists.txt @@ -0,0 +1,33 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Headers: top level +install(FILES + api.h + array.h + builder.h + type.h + DESTINATION include/arrow) + +####################################### +# Unit tests +####################################### + +set(ARROW_TEST_LINK_LIBS arrow_test_util ${ARROW_MIN_TEST_LIBS}) + +ADD_ARROW_TEST(array-test) +ADD_ARROW_TEST(field-test) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h new file mode 100644 index 00000000000..899e8aae19c --- /dev/null +++ b/cpp/src/arrow/api.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_API_H +#define ARROW_API_H + +#endif // ARROW_API_H diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc new file mode 100644 index 00000000000..5ecf91624fe --- /dev/null +++ b/cpp/src/arrow/array-test.cc @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include <gtest/gtest.h>

+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>

+#include "arrow/array.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/primitive.h"
+#include "arrow/util/buffer.h"

+using std::string;
+using std::vector;

+namespace arrow {

+static TypePtr int32 = TypePtr(new Int32Type());
+static TypePtr int32_nn = TypePtr(new Int32Type(false));


+class TestArray : public ::testing::Test {
+ public:
+ void SetUp() {
+ auto data = std::make_shared<PoolBuffer>();
+ auto nulls = std::make_shared<PoolBuffer>();

+ ASSERT_OK(data->Resize(400));
+ ASSERT_OK(nulls->Resize(128));

+ arr_.reset(new Int32Array(100, data, nulls));
+ }

+ protected:
+ std::unique_ptr<Int32Array> arr_;
+};


+TEST_F(TestArray, TestNullable) {
+ std::shared_ptr<Buffer> tmp = arr_->data();
+ std::unique_ptr<Int32Array> arr_nn(new Int32Array(100, tmp));

+ ASSERT_TRUE(arr_->nullable());
+ ASSERT_FALSE(arr_nn->nullable());
+}


+TEST_F(TestArray, TestLength) {
+ ASSERT_EQ(arr_->length(), 100);
+}

+TEST_F(TestArray, TestIsNull) {
+ vector<uint8_t> nulls = {1, 0, 1, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1, 0, 0,
+ 1, 0, 1, 1, 0, 1, 0, 0,
+ 1, 0, 0, 1};

+ std::shared_ptr<Buffer> null_buf = bytes_to_null_buffer(nulls.data(), nulls.size());
+ std::unique_ptr<Array> arr;
+ arr.reset(new Array(int32, nulls.size(), null_buf));

+ ASSERT_EQ(null_buf->size(), 5);
+ for (size_t i = 0; i < nulls.size(); ++i) {
+ ASSERT_EQ(static_cast<bool>(nulls[i]), arr->IsNull(i));
+ }
+}


+TEST_F(TestArray, TestCopy) {
+}

+} // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
new file mode 100644
index 00000000000..1726a2f27d8
--- /dev/null
+++ b/cpp/src/arrow/array.cc
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
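[Editor note on the test above: TestIsNull relies on the byte-to-bit packing behind bytes_to_null_buffer. One flag byte per slot collapses to one bit, so the 36 flags need ceil(36/8) = 5 bytes, which is exactly what ASSERT_EQ(null_buf->size(), 5) checks. A minimal sketch of that packing, assuming LSB-first bit order; the real helper is util::bytes_to_bits in arrow/util/bit-util.h, which is not part of this hunk and may differ in detail:

    #include <cstdint>
    #include <vector>

    // Pack one-byte-per-slot flags into a bitmap, least-significant bit first.
    std::vector<uint8_t> bytes_to_bits_sketch(const std::vector<uint8_t>& bytes) {
      std::vector<uint8_t> bits((bytes.size() + 7) / 8, 0);  // ceil(n / 8) bytes
      for (size_t i = 0; i < bytes.size(); ++i) {
        if (bytes[i]) {
          bits[i / 8] |= static_cast<uint8_t>(1 << (i % 8));  // set bit i
        }
      }
      return bits;
    }
]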
+
+#include "arrow/array.h"

+#include "arrow/util/buffer.h"

+namespace arrow {

+// ----------------------------------------------------------------------
+// Base array class

+Array::Array(const TypePtr& type, int64_t length,
+ const std::shared_ptr<Buffer>& nulls) {
+ Init(type, length, nulls);
+}

+void Array::Init(const TypePtr& type, int64_t length,
+ const std::shared_ptr<Buffer>& nulls) {
+ type_ = type;
+ length_ = length;
+ nulls_ = nulls;

+ nullable_ = type->nullable;
+ if (nulls_) {
+ null_bits_ = nulls_->data();
+ }
+}

+} // namespace arrow
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
new file mode 100644
index 00000000000..c95450d12a4
--- /dev/null
+++ b/cpp/src/arrow/array.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_ARRAY_H
+#define ARROW_ARRAY_H

+#include <cstdint>
+#include <cstdlib>
+#include <memory>

+#include "arrow/type.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/macros.h"

+namespace arrow {

+class Buffer;

+// Immutable data array with some logical type and some length. Any memory is
+// owned by the respective Buffer instance (or its parents). May or may not be
+// nullable.
+//
+// The base class only has a null array (if the data type is nullable)
+//
+// Any buffers used to initialize the array have their references "stolen". If
+// you wish to use the buffer beyond the lifetime of the array, you need to
+// explicitly increment its reference count
+class Array {
+ public:
+ Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {}
+ Array(const TypePtr& type, int64_t length,
+ const std::shared_ptr<Buffer>& nulls = nullptr);

+ virtual ~Array() {}

+ void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& nulls);

+ // Determine if a slot is null. For inner loops. Does *not* boundscheck
+ bool IsNull(int64_t i) const {
+ return nullable_ && util::get_bit(null_bits_, i);
+ }

+ int64_t length() const { return length_;}
+ bool nullable() const { return nullable_;}
+ const TypePtr& type() const { return type_;}
+ TypeEnum type_enum() const { return type_->type;}

+ protected:
+ TypePtr type_;
+ bool nullable_;
+ int64_t length_;

+ std::shared_ptr<Buffer> nulls_;
+ const uint8_t* null_bits_;

+ private:
+ DISALLOW_COPY_AND_ASSIGN(Array);
+};


+typedef std::shared_ptr<Array> ArrayPtr;

+} // namespace arrow

+#endif // ARROW_ARRAY_H
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
new file mode 100644
index 00000000000..1fd74719283
--- /dev/null
+++ b/cpp/src/arrow/builder.cc
@@ -0,0 +1,63 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include "arrow/builder.h"

+#include <cstring>

+#include "arrow/util/bit-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"

+namespace arrow {

+Status ArrayBuilder::Init(int64_t capacity) {
+ capacity_ = capacity;

+ if (nullable_) {
+ int64_t to_alloc = util::ceil_byte(capacity) / 8;
+ nulls_ = std::make_shared<PoolBuffer>();
+ RETURN_NOT_OK(nulls_->Resize(to_alloc));
+ null_bits_ = nulls_->mutable_data();
+ memset(null_bits_, 0, to_alloc);
+ }
+ return Status::OK();
+}

+Status ArrayBuilder::Resize(int64_t new_bits) {
+ if (nullable_) {
+ int64_t new_bytes = util::ceil_byte(new_bits) / 8;
+ int64_t old_bytes = nulls_->size();
+ RETURN_NOT_OK(nulls_->Resize(new_bytes));
+ null_bits_ = nulls_->mutable_data();
+ if (old_bytes < new_bytes) {
+ memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes);
+ }
+ }
+ return Status::OK();
+}

+Status ArrayBuilder::Advance(int64_t elements) {
+ if (nullable_ && length_ + elements > capacity_) {
+ return Status::Invalid("Builder must be expanded");
+ }
+ length_ += elements;
+ return Status::OK();
+}


+} // namespace arrow
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
new file mode 100644
index 00000000000..b43668af77c
--- /dev/null
+++ b/cpp/src/arrow/builder.h
@@ -0,0 +1,101 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_BUILDER_H
+#define ARROW_BUILDER_H

+#include <cstdint>
+#include <memory>
+#include <vector>

+#include "arrow/type.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/status.h"

+namespace arrow {

+class Array;

+static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8;

+// Base class for all data array builders
+class ArrayBuilder {
+ public:
+ explicit ArrayBuilder(const TypePtr& type)
+ : type_(type),
+ nullable_(type_->nullable),
+ nulls_(nullptr), null_bits_(nullptr),
+ length_(0),
+ capacity_(0) {}

+ virtual ~ArrayBuilder() {}

+ // For nested types.
Since the objects are owned by this class instance, we
+ // skip shared pointers and just return a raw pointer
+ ArrayBuilder* child(int i) {
+ return children_[i].get();
+ }

+ int num_children() const {
+ return children_.size();
+ }

+ int64_t length() const { return length_;}
+ int64_t capacity() const { return capacity_;}
+ bool nullable() const { return nullable_;}

+ // Allocates required memory at this level, but children need to be
+ // initialized independently
+ Status Init(int64_t capacity);

+ // Resizes the nulls array (if nullable)
+ Status Resize(int64_t new_bits);

+ // For cases where raw data was memcpy'd into the internal buffers, allows us
+ // to advance the length of the builder. It is the caller's responsibility to
+ // keep the length consistent with the data actually written.
+ Status Advance(int64_t elements);

+ const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;}

+ // Creates new array object to hold the contents of the builder and transfers
+ // ownership of the data
+ virtual Status ToArray(Array** out) = 0;

+ protected:
+ TypePtr type_;
+ bool nullable_;

+ // If the type is not nullable, then nulls_ is nullptr after initialization
+ std::shared_ptr<PoolBuffer> nulls_;
+ uint8_t* null_bits_;

+ // Array length, so far. Also, the index of the next element to be added
+ int64_t length_;
+ int64_t capacity_;

+ // Child value array builders. These are owned by this class
+ std::vector<std::unique_ptr<ArrayBuilder> > children_;

+ private:
+ DISALLOW_COPY_AND_ASSIGN(ArrayBuilder);
+};

+} // namespace arrow

+#endif // ARROW_BUILDER_H
diff --git a/cpp/src/arrow/field-test.cc b/cpp/src/arrow/field-test.cc
new file mode 100644
index 00000000000..2bb8bad4054
--- /dev/null
+++ b/cpp/src/arrow/field-test.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include <gtest/gtest.h>
+#include <string>
+#include <vector>

+#include "arrow/field.h"
+#include "arrow/type.h"
+#include "arrow/types/integer.h"

+using std::string;

+namespace arrow {

+TEST(TestField, Basics) {
+ TypePtr ftype = TypePtr(new Int32Type());
+ Field f0("f0", ftype);

+ ASSERT_EQ(f0.name, "f0");
+ ASSERT_EQ(f0.type->ToString(), ftype->ToString());
+}

+} // namespace arrow
diff --git a/cpp/src/arrow/field.h b/cpp/src/arrow/field.h
new file mode 100644
index 00000000000..664cae61a77
--- /dev/null
+++ b/cpp/src/arrow/field.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_FIELD_H
+#define ARROW_FIELD_H

+#include <string>

+#include "arrow/type.h"

+namespace arrow {

+// A field is a piece of metadata that includes (for now) a name and a data
+// type

+struct Field {
+ // Field name
+ std::string name;

+ // The field's data type
+ TypePtr type;

+ Field(const std::string& name, const TypePtr& type) :
+ name(name), type(type) {}

+ bool Equals(const Field& other) const {
+ return (this == &other) || (this->name == other.name &&
+ this->type->Equals(other.type.get()));
+ }
+};

+} // namespace arrow

+#endif // ARROW_FIELD_H
diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt
new file mode 100644
index 00000000000..7b449affab0
--- /dev/null
+++ b/cpp/src/arrow/parquet/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.

+# ----------------------------------------------------------------------
+# arrow_parquet : Arrow <-> Parquet adapter

+set(PARQUET_SRCS
+)

+set(PARQUET_LIBS
+)

+add_library(arrow_parquet STATIC
+ ${PARQUET_SRCS}
+)
+target_link_libraries(arrow_parquet ${PARQUET_LIBS})
+SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX)

+# Headers: top level
+install(FILES
+ DESTINATION include/arrow/parquet)
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
new file mode 100644
index 00000000000..2233a4f832a
--- /dev/null
+++ b/cpp/src/arrow/test-util.h
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
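[Editor note on field.h above: Field::Equals compares by value, with the (this == &other) test as a fast path for self-comparison; two distinct Field objects with the same name and an equal type compare equal. A small usage sketch, using Int32Type from types/integer.h later in this patch:

    #include "arrow/field.h"
    #include "arrow/types/integer.h"

    void field_equals_example() {
      arrow::TypePtr ty(new arrow::Int32Type());
      arrow::Field f0("f0", ty);
      arrow::Field f1("f0", ty);
      bool same = f0.Equals(f1);  // true: same name, equal type
      (void)same;
    }
]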
+
+#ifndef ARROW_TEST_UTIL_H_
+#define ARROW_TEST_UTIL_H_

+#include <gtest/gtest.h>
+#include <memory>
+#include <string>
+#include <vector>

+#include "arrow/util/bit-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/random.h"
+#include "arrow/util/status.h"

+#define ASSERT_RAISES(ENUM, expr) \
+ do { \
+ Status s = (expr); \
+ ASSERT_TRUE(s.Is##ENUM()); \
+ } while (0)


+#define ASSERT_OK(expr) \
+ do { \
+ Status s = (expr); \
+ ASSERT_TRUE(s.ok()); \
+ } while (0)


+#define EXPECT_OK(expr) \
+ do { \
+ Status s = (expr); \
+ EXPECT_TRUE(s.ok()); \
+ } while (0)


+namespace arrow {

+template <typename T>
+void randint(int64_t N, T lower, T upper, std::vector<T>* out) {
+ Random rng(random_seed());
+ uint64_t draw;
+ uint64_t span = upper - lower;
+ T val;
+ for (int64_t i = 0; i < N; ++i) {
+ draw = rng.Uniform64(span);
+ val = lower + static_cast<T>(draw);
+ out->push_back(val);
+ }
+}


+template <typename T>
+std::shared_ptr<Buffer> to_buffer(const std::vector<T>& values) {
+ return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(values.data()),
+ values.size() * sizeof(T));
+}

+void random_nulls(int64_t n, double pct_null, std::vector<uint8_t>* nulls) {
+ Random rng(random_seed());
+ for (int i = 0; i < n; ++i) {
+ nulls->push_back(static_cast<uint8_t>(rng.NextDoubleFraction() > pct_null));
+ }
+}

+void random_nulls(int64_t n, double pct_null, std::vector<bool>* nulls) {
+ Random rng(random_seed());
+ for (int i = 0; i < n; ++i) {
+ nulls->push_back(rng.NextDoubleFraction() > pct_null);
+ }
+}

+std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) {
+ std::shared_ptr<Buffer> out;

+ // TODO(wesm): error checking
+ util::bytes_to_bits(bytes, length, &out);
+ return out;
+}

+} // namespace arrow

+#endif // ARROW_TEST_UTIL_H_
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
new file mode 100644
index 00000000000..492eee52b04
--- /dev/null
+++ b/cpp/src/arrow/type.cc
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include "arrow/type.h"

+namespace arrow {

+} // namespace arrow
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
new file mode 100644
index 00000000000..220f99f4e88
--- /dev/null
+++ b/cpp/src/arrow/type.h
@@ -0,0 +1,180 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPE_H
+#define ARROW_TYPE_H

+#include <memory>
+#include <string>

+namespace arrow {

+// Physical data type that describes the memory layout of values. See details
+// for each type
+enum class LayoutEnum: char {
+ // A physical type consisting of some non-negative number of bytes
+ BYTE = 0,

+ // A physical type consisting of some non-negative number of bits
+ BIT = 1,

+ // A parametric variable-length value type. Full specification requires a
+ // child logical type
+ LIST = 2,

+ // A collection of multiple equal-length child arrays. Parametric type taking
+ // 1 or more child logical types
+ STRUCT = 3,

+ // An array with heterogeneous value types. Parametric types taking 1 or more
+ // child logical types
+ DENSE_UNION = 4,
+ SPARSE_UNION = 5
+};


+struct LayoutType {
+ LayoutEnum type;
+ explicit LayoutType(LayoutEnum type) : type(type) {}
+};


+// Data types in this library are all *logical*. They can be expressed as
+// either a primitive physical type (bytes or bits of some fixed size), a
+// nested type consisting of other data types, or another data type (e.g. a
+// timestamp encoded as an int64)
+//
+// Any data type can be nullable

+enum class TypeEnum: char {
+ // A degenerate NULL type represented as 0 bytes/bits
+ NA = 0,

+ // Little-endian integer types
+ UINT8 = 1,
+ INT8 = 2,
+ UINT16 = 3,
+ INT16 = 4,
+ UINT32 = 5,
+ INT32 = 6,
+ UINT64 = 7,
+ INT64 = 8,

+ // A boolean value represented as 1 byte
+ BOOL = 9,

+ // A boolean value represented as 1 bit
+ BIT = 10,

+ // 4-byte floating point value
+ FLOAT = 11,

+ // 8-byte floating point value
+ DOUBLE = 12,

+ // CHAR(N): fixed-length UTF8 string with length N
+ CHAR = 13,

+ // UTF8 variable-length string as List<Char>
+ STRING = 14,

+ // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1)
+ VARCHAR = 15,

+ // Variable-length bytes (no guarantee of UTF8-ness)
+ BINARY = 16,

+ // By default, int32 days since the UNIX epoch
+ DATE = 17,

+ // Exact timestamp encoded with int64 since UNIX epoch
+ // Default unit millisecond
+ TIMESTAMP = 18,

+ // Timestamp as double seconds since the UNIX epoch
+ TIMESTAMP_DOUBLE = 19,

+ // Exact time encoded with int64, default unit millisecond
+ TIME = 20,

+ // Precision- and scale-based decimal type. Storage type depends on the
+ // parameters.
+ DECIMAL = 21,

+ // Decimal value encoded as a text string
+ DECIMAL_TEXT = 22,

+ // A list of some logical data type
+ LIST = 30,

+ // Struct of logical types
+ STRUCT = 31,

+ // Unions of logical types
+ DENSE_UNION = 32,
+ SPARSE_UNION = 33,

+ // Union<Null, Int32, String, Double, Boolean>
+ JSON_SCALAR = 50,

+ // User-defined type
+ USER = 60
+};


+struct DataType {
+ TypeEnum type;
+ bool nullable;

+ explicit DataType(TypeEnum type, bool nullable = true)
+ : type(type), nullable(nullable) {}

+ virtual bool Equals(const DataType* other) {
+ return (this == other) || (this->type == other->type &&
+ this->nullable == other->nullable);
+ }

+ virtual std::string ToString() const = 0;
+};


+typedef std::shared_ptr<LayoutType> LayoutPtr;
+typedef std::shared_ptr<DataType> TypePtr;


+struct BytesType : public LayoutType {
+ int size;

+ explicit BytesType(int size)
+ : LayoutType(LayoutEnum::BYTE),
+ size(size) {}

+ BytesType(const BytesType& other)
+ : BytesType(other.size) {}
+};

+struct ListLayoutType : public LayoutType {
+ LayoutPtr value_type;

+ explicit ListLayoutType(const LayoutPtr& value_type)
+ : LayoutType(LayoutEnum::LIST),
+ value_type(value_type) {}
+};

+} // namespace arrow

+#endif // ARROW_TYPE_H
diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt
new file mode 100644
index 00000000000..e090aead1f8
--- /dev/null
+++ b/cpp/src/arrow/types/CMakeLists.txt
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.

+#######################################
+# arrow_types
+#######################################

+set(TYPES_SRCS
+ construct.cc
+ floating.cc
+ integer.cc
+ json.cc
+ list.cc
+ primitive.cc
+ string.cc
+ struct.cc
+ union.cc
+)

+set(TYPES_LIBS
+)

+add_library(arrow_types STATIC
+ ${TYPES_SRCS}
+)
+target_link_libraries(arrow_types ${TYPES_LIBS})
+SET_TARGET_PROPERTIES(arrow_types PROPERTIES LINKER_LANGUAGE CXX)

+# Headers: top level
+install(FILES
+ boolean.h
+ collection.h
+ datetime.h
+ decimal.h
+ floating.h
+ integer.h
+ json.h
+ list.h
+ primitive.h
+ string.h
+ struct.h
+ union.h
+ DESTINATION include/arrow/types)


+ADD_ARROW_TEST(list-test)
+ADD_ARROW_TEST(primitive-test)
+ADD_ARROW_TEST(string-test)
+ADD_ARROW_TEST(struct-test)
diff --git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h
new file mode 100644
index 00000000000..a9f20046b58
--- /dev/null
+++ b/cpp/src/arrow/types/binary.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_BINARY_H
+#define ARROW_TYPES_BINARY_H

+#include <string>
+#include <vector>

+#include "arrow/type.h"

+namespace arrow {

+struct StringType : public DataType {
+};

+} // namespace arrow

+#endif // ARROW_TYPES_BINARY_H
diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h
new file mode 100644
index 00000000000..31388c8152d
--- /dev/null
+++ b/cpp/src/arrow/types/boolean.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_BOOLEAN_H
+#define ARROW_TYPES_BOOLEAN_H

+#include "arrow/types/primitive.h"

+namespace arrow {

+struct BooleanType : public PrimitiveType {
+ PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool");
+};

+typedef PrimitiveArrayImpl<BooleanType> BooleanArray;

+// typedef PrimitiveBuilder<BooleanType> BooleanBuilder;

+} // namespace arrow

+#endif // ARROW_TYPES_BOOLEAN_H
diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h
new file mode 100644
index 00000000000..59ba6141941
--- /dev/null
+++ b/cpp/src/arrow/types/collection.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
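[Editor note: boolean.h above, and floating.h and integer.h below, all define their concrete types through the PRIMITIVE_DECL macro from types/primitive.h, which is not part of this hunk. Judging purely from the call sites (struct name, C value type, TypeEnum member, byte width, display name), a plausible expansion is roughly the following; this is a hypothetical reconstruction, not the macro actually shipped:

    // Hypothetical sketch of PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME)
    #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \
      typedef C_TYPE c_type;                                   \
                                                               \
      explicit TYPENAME(bool nullable = true)                  \
          : PrimitiveType(TypeEnum::ENUM, nullable) {}         \
                                                               \
      static constexpr int size = SIZE;                        \
                                                               \
      static const char* name() {                              \
        return NAME;                                           \
      }
]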
+
+#ifndef ARROW_TYPES_COLLECTION_H
+#define ARROW_TYPES_COLLECTION_H

+#include <string>
+#include <vector>

+#include "arrow/type.h"

+namespace arrow {

+template <TypeEnum T>
+struct CollectionType : public DataType {
+ std::vector<TypePtr> child_types_;

+ explicit CollectionType(bool nullable = true) : DataType(T, nullable) {}

+ const TypePtr& child(int i) const {
+ return child_types_[i];
+ }

+ int num_children() const {
+ return child_types_.size();
+ }
+};

+} // namespace arrow

+#endif // ARROW_TYPES_COLLECTION_H
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
new file mode 100644
index 00000000000..5176cafd3ba
--- /dev/null
+++ b/cpp/src/arrow/types/construct.cc
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include "arrow/types/construct.h"

+#include <memory>

+#include "arrow/types/floating.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/list.h"
+#include "arrow/types/string.h"
+#include "arrow/util/status.h"

+namespace arrow {

+class ArrayBuilder;

+// Initially looked at doing this with vtables, but shared pointers make it
+// difficult

+#define BUILDER_CASE(ENUM, BuilderType) \
+ case TypeEnum::ENUM: \
+ *out = static_cast<ArrayBuilder*>(new BuilderType(type)); \
+ return Status::OK();

+Status make_builder(const TypePtr& type, ArrayBuilder** out) {
+ switch (type->type) {
+ BUILDER_CASE(UINT8, UInt8Builder);
+ BUILDER_CASE(INT8, Int8Builder);
+ BUILDER_CASE(UINT16, UInt16Builder);
+ BUILDER_CASE(INT16, Int16Builder);
+ BUILDER_CASE(UINT32, UInt32Builder);
+ BUILDER_CASE(INT32, Int32Builder);
+ BUILDER_CASE(UINT64, UInt64Builder);
+ BUILDER_CASE(INT64, Int64Builder);

+ // BUILDER_CASE(BOOL, BooleanBuilder);

+ BUILDER_CASE(FLOAT, FloatBuilder);
+ BUILDER_CASE(DOUBLE, DoubleBuilder);

+ BUILDER_CASE(STRING, StringBuilder);

+ case TypeEnum::LIST:
+ {
+ ListType* list_type = static_cast<ListType*>(type.get());
+ ArrayBuilder* value_builder;
+ RETURN_NOT_OK(make_builder(list_type->value_type, &value_builder));

+ // The ListBuilder takes ownership of the value_builder
+ ListBuilder* builder = new ListBuilder(type, value_builder);
+ *out = static_cast<ArrayBuilder*>(builder);
+ return Status::OK();
+ }
+ // BUILDER_CASE(CHAR, CharBuilder);

+ // BUILDER_CASE(VARCHAR, VarcharBuilder);
+ // BUILDER_CASE(BINARY, BinaryBuilder);

+ // BUILDER_CASE(DATE, DateBuilder);
+ // BUILDER_CASE(TIMESTAMP, TimestampBuilder);
+ // BUILDER_CASE(TIME, TimeBuilder);

+ // BUILDER_CASE(LIST, ListBuilder);
+ // BUILDER_CASE(STRUCT, StructBuilder);
+ // BUILDER_CASE(DENSE_UNION, DenseUnionBuilder);
+ // BUILDER_CASE(SPARSE_UNION, SparseUnionBuilder);

+ default:
+ return Status::NotImplemented(type->ToString());
+ }
+}

+} // namespace arrow
diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h
new file mode 100644
index 00000000000..c0bfedd27d6
--- /dev/null
+++ b/cpp/src/arrow/types/construct.h
@@ -0,0 +1,32 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_CONSTRUCT_H
+#define ARROW_TYPES_CONSTRUCT_H

+#include "arrow/type.h"

+namespace arrow {

+class ArrayBuilder;
+class Status;

+Status make_builder(const TypePtr& type, ArrayBuilder** out);

+} // namespace arrow

+#endif // ARROW_TYPES_CONSTRUCT_H
diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h
new file mode 100644
index 00000000000..b4d62523c41
--- /dev/null
+++ b/cpp/src/arrow/types/datetime.h
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
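[Editor note on construct.h above: make_builder is the single factory entry point for builders, handing back an owning raw pointer through an out-parameter in the Status-returning style used across this patch. A usage sketch; the per-value Append on Int32Builder is inferred from its use in list-test.cc later in this patch, not from a definition visible here:

    #include <cstdint>
    #include <memory>

    #include "arrow/builder.h"
    #include "arrow/types/construct.h"
    #include "arrow/types/integer.h"
    #include "arrow/util/status.h"

    arrow::Status build_int32_array(arrow::Array** out) {
      arrow::TypePtr ty(new arrow::Int32Type());

      arrow::ArrayBuilder* tmp;
      RETURN_NOT_OK(arrow::make_builder(ty, &tmp));  // caller owns the result
      std::unique_ptr<arrow::Int32Builder> builder(
          static_cast<arrow::Int32Builder*>(tmp));

      RETURN_NOT_OK(builder->Init(arrow::MIN_BUILDER_CAPACITY));
      for (int32_t i = 0; i < 100; ++i) {
        RETURN_NOT_OK(builder->Append(i));  // assumed single-value Append
      }
      return builder->ToArray(out);  // transfers buffer ownership to *out
    }
]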
+ +#ifndef ARROW_TYPES_DATETIME_H +#define ARROW_TYPES_DATETIME_H + +#include "arrow/type.h" + +namespace arrow { + +struct DateType : public DataType { + enum class Unit: char { + DAY = 0, + MONTH = 1, + YEAR = 2 + }; + + Unit unit; + + explicit DateType(Unit unit = Unit::DAY, bool nullable = true) + : DataType(TypeEnum::DATE, nullable), + unit(unit) {} + + DateType(const DateType& other) + : DateType(other.unit, other.nullable) {} + + static char const *name() { + return "date"; + } + + // virtual std::string ToString() { + // return name(); + // } +}; + + +struct TimestampType : public DataType { + enum class Unit: char { + SECOND = 0, + MILLI = 1, + MICRO = 2, + NANO = 3 + }; + + Unit unit; + + explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) + : DataType(TypeEnum::TIMESTAMP, nullable), + unit(unit) {} + + TimestampType(const TimestampType& other) + : TimestampType(other.unit, other.nullable) {} + + static char const *name() { + return "timestamp"; + } + + // virtual std::string ToString() { + // return name(); + // } +}; + +} // namespace arrow + +#endif // ARROW_TYPES_DATETIME_H diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h new file mode 100644 index 00000000000..464c3ff8da9 --- /dev/null +++ b/cpp/src/arrow/types/decimal.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPES_DECIMAL_H +#define ARROW_TYPES_DECIMAL_H + +#include "arrow/type.h" + +namespace arrow { + +struct DecimalType : public DataType { + int precision; + int scale; +}; + +} // namespace arrow + +#endif // ARROW_TYPES_DECIMAL_H diff --git a/cpp/src/arrow/types/floating.cc b/cpp/src/arrow/types/floating.cc new file mode 100644 index 00000000000..bde28266e63 --- /dev/null +++ b/cpp/src/arrow/types/floating.cc @@ -0,0 +1,22 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
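[Editor note on datetime.h above: DateType and TimestampType are parametric logical types; the resolution lives on the instance while TypeEnum stays DATE or TIMESTAMP. A short sketch using only the members shown above:

    #include "arrow/types/datetime.h"

    void timestamp_units_example() {
      // Millisecond resolution is the default...
      arrow::TimestampType ts_default;
      // ...and other resolutions (or non-nullable variants) are explicit.
      arrow::TimestampType ts_micro(arrow::TimestampType::Unit::MICRO, false);

      bool same_enum = (ts_default.type == ts_micro.type);  // both TIMESTAMP
      bool diff_unit = (ts_default.unit != ts_micro.unit);  // MILLI vs MICRO
      (void)same_enum;
      (void)diff_unit;
    }
]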
+
+#include "arrow/types/floating.h"

+namespace arrow {

+} // namespace arrow
diff --git a/cpp/src/arrow/types/floating.h b/cpp/src/arrow/types/floating.h
new file mode 100644
index 00000000000..7551ce665a2
--- /dev/null
+++ b/cpp/src/arrow/types/floating.h
@@ -0,0 +1,43 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_FLOATING_H
+#define ARROW_TYPES_FLOATING_H

+#include <string>

+#include "arrow/types/primitive.h"

+namespace arrow {

+struct FloatType : public PrimitiveType {
+ PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float");
+};

+struct DoubleType : public PrimitiveType {
+ PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double");
+};

+typedef PrimitiveArrayImpl<FloatType> FloatArray;
+typedef PrimitiveArrayImpl<DoubleType> DoubleArray;

+typedef PrimitiveBuilder<FloatType> FloatBuilder;
+typedef PrimitiveBuilder<DoubleType> DoubleBuilder;

+} // namespace arrow

+#endif // ARROW_TYPES_FLOATING_H
diff --git a/cpp/src/arrow/types/integer.cc b/cpp/src/arrow/types/integer.cc
new file mode 100644
index 00000000000..46965366169
--- /dev/null
+++ b/cpp/src/arrow/types/integer.cc
@@ -0,0 +1,22 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#include "arrow/types/integer.h"

+namespace arrow {

+} // namespace arrow
diff --git a/cpp/src/arrow/types/integer.h b/cpp/src/arrow/types/integer.h
new file mode 100644
index 00000000000..7e5eab55be0
--- /dev/null
+++ b/cpp/src/arrow/types/integer.h
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_INTEGER_H
+#define ARROW_TYPES_INTEGER_H

+#include <cstdint>
+#include <string>

+#include "arrow/types/primitive.h"

+namespace arrow {

+struct UInt8Type : public PrimitiveType {
+ PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8");
+};

+struct Int8Type : public PrimitiveType {
+ PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8");
+};

+struct UInt16Type : public PrimitiveType {
+ PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16");
+};

+struct Int16Type : public PrimitiveType {
+ PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16");
+};

+struct UInt32Type : public PrimitiveType {
+ PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32");
+};

+struct Int32Type : public PrimitiveType {
+ PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32");
+};

+struct UInt64Type : public PrimitiveType {
+ PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64");
+};

+struct Int64Type : public PrimitiveType {
+ PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64");
+};

+// Array containers

+typedef PrimitiveArrayImpl<UInt8Type> UInt8Array;
+typedef PrimitiveArrayImpl<Int8Type> Int8Array;

+typedef PrimitiveArrayImpl<UInt16Type> UInt16Array;
+typedef PrimitiveArrayImpl<Int16Type> Int16Array;

+typedef PrimitiveArrayImpl<UInt32Type> UInt32Array;
+typedef PrimitiveArrayImpl<Int32Type> Int32Array;

+typedef PrimitiveArrayImpl<UInt64Type> UInt64Array;
+typedef PrimitiveArrayImpl<Int64Type> Int64Array;

+// Builders

+typedef PrimitiveBuilder<UInt8Type> UInt8Builder;
+typedef PrimitiveBuilder<UInt16Type> UInt16Builder;
+typedef PrimitiveBuilder<UInt32Type> UInt32Builder;
+typedef PrimitiveBuilder<UInt64Type> UInt64Builder;

+typedef PrimitiveBuilder<Int8Type> Int8Builder;
+typedef PrimitiveBuilder<Int16Type> Int16Builder;
+typedef PrimitiveBuilder<Int32Type> Int32Builder;
+typedef PrimitiveBuilder<Int64Type> Int64Builder;

+} // namespace arrow

+#endif // ARROW_TYPES_INTEGER_H
diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc
new file mode 100644
index 00000000000..b29b95715fe
--- /dev/null
+++ b/cpp/src/arrow/types/json.cc
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
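[Editor note on integer.h above: the array typedefs are the public face of the integer types; constructing one follows the pattern of TestArray::SetUp in array-test.cc, with the caveat from array.h that the buffers' references are "stolen" by the array. A sketch, assuming the resizable PoolBuffer from arrow/util/buffer.h (not shown in this patch):

    #include <memory>

    #include "arrow/types/integer.h"
    #include "arrow/util/buffer.h"
    #include "arrow/util/status.h"

    // Build a 100-element Int32Array over freshly allocated buffers.
    arrow::Status make_int32_array(std::unique_ptr<arrow::Int32Array>* out) {
      auto data = std::make_shared<arrow::PoolBuffer>();
      auto nulls = std::make_shared<arrow::PoolBuffer>();

      RETURN_NOT_OK(data->Resize(400));   // 100 * sizeof(int32_t) value bytes
      RETURN_NOT_OK(nulls->Resize(128));  // >= ceil(100 / 8) bitmap bytes

      out->reset(new arrow::Int32Array(100, data, nulls));
      return arrow::Status::OK();
    }
]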
+
+#include "arrow/types/json.h"

+#include <vector>

+#include "arrow/types/boolean.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/floating.h"
+#include "arrow/types/null.h"
+#include "arrow/types/string.h"
+#include "arrow/types/union.h"

+namespace arrow {

+static const TypePtr Null(new NullType());
+static const TypePtr Int32(new Int32Type());
+static const TypePtr String(new StringType());
+static const TypePtr Double(new DoubleType());
+static const TypePtr Bool(new BooleanType());

+static const std::vector<TypePtr> json_types = {Null, Int32, String,
+ Double, Bool};
+TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(json_types));
+TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(json_types));

+} // namespace arrow
diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h
new file mode 100644
index 00000000000..91fd132408f
--- /dev/null
+++ b/cpp/src/arrow/types/json.h
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.

+#ifndef ARROW_TYPES_JSON_H
+#define ARROW_TYPES_JSON_H

+#include "arrow/type.h"

+namespace arrow {

+struct JSONScalar : public DataType {
+ bool dense;

+ static TypePtr dense_type;
+ static TypePtr sparse_type;

+ explicit JSONScalar(bool dense = true, bool nullable = true)
+ : DataType(TypeEnum::JSON_SCALAR, nullable),
+ dense(dense) {}
+};

+} // namespace arrow

+#endif // ARROW_TYPES_JSON_H
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
new file mode 100644
index 00000000000..47673ff898b
--- /dev/null
+++ b/cpp/src/arrow/types/list-test.cc
@@ -0,0 +1,166 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>

+#include "arrow/array.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+#include "arrow/types/construct.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/list.h"
+#include "arrow/types/string.h"
+#include "arrow/types/test-common.h"
+#include "arrow/util/status.h"

+using std::string;
+using std::unique_ptr;
+using std::vector;

+namespace arrow {

+class ArrayBuilder;

+TEST(TypesTest, TestListType) {
+ std::shared_ptr<DataType> vt = std::make_shared<Int32Type>();

+ ListType list_type(vt);
+ ListType list_type_nn(vt, false);

+ ASSERT_EQ(list_type.type, TypeEnum::LIST);
+ ASSERT_TRUE(list_type.nullable);
+ ASSERT_FALSE(list_type_nn.nullable);

+ ASSERT_EQ(list_type.name(), string("list"));
+ ASSERT_EQ(list_type.ToString(), string("list<int32>"));

+ ASSERT_EQ(list_type.value_type->type, vt->type);

+ std::shared_ptr<DataType> st = std::make_shared<StringType>();
+ std::shared_ptr<ListType> lt = std::make_shared<ListType>(st);
+ ASSERT_EQ(lt->ToString(), string("list<string>"));

+ ListType lt2(lt);
+ ASSERT_EQ(lt2.ToString(), string("list<list<string>>"));
+}

+// ----------------------------------------------------------------------
+// List tests

+class TestListBuilder : public TestBuilder {
+ public:
+ void SetUp() {
+ TestBuilder::SetUp();

+ value_type_ = TypePtr(new Int32Type());
+ type_ = TypePtr(new ListType(value_type_));

+ ArrayBuilder* tmp;
+ ASSERT_OK(make_builder(type_, &tmp));
+ builder_.reset(static_cast<ListBuilder*>(tmp));
+ }

+ void Done() {
+ Array* out;
+ ASSERT_OK(builder_->ToArray(&out));
+ result_.reset(static_cast<ListArray*>(out));
+ }

+ protected:
+ TypePtr value_type_;
+ TypePtr type_;

+ unique_ptr<ListBuilder> builder_;
+ unique_ptr<ListArray> result_;
+};


+TEST_F(TestListBuilder, TestResize) {
+}

+TEST_F(TestListBuilder, TestAppendNull) {
+ ASSERT_OK(builder_->AppendNull());
+ ASSERT_OK(builder_->AppendNull());

+ Done();

+ ASSERT_TRUE(result_->IsNull(0));
+ ASSERT_TRUE(result_->IsNull(1));

+ ASSERT_EQ(0, result_->offsets()[0]);
+ ASSERT_EQ(0, result_->offset(1));
+ ASSERT_EQ(0, result_->offset(2));

+ Int32Array* values = static_cast<Int32Array*>(result_->values().get());
+ ASSERT_EQ(0, values->length());
+}

+TEST_F(TestListBuilder, TestBasics) {
+ vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
+ vector<int> lengths = {3, 0, 4};
+ vector<uint8_t> is_null = {0, 1, 0};

+ Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder());

+ int pos = 0;
+ for (size_t i = 0; i < lengths.size(); ++i) {
+ ASSERT_OK(builder_->Append(is_null[i] > 0));
+ for (int j = 0; j < lengths[i]; ++j) {
+ ASSERT_OK(vb->Append(values[pos++]));
+ }
+ }

+ Done();

+ ASSERT_TRUE(result_->nullable());
+ ASSERT_TRUE(result_->values()->nullable());

+ ASSERT_EQ(3, result_->length());
+ vector<int32_t> ex_offsets = {0, 3, 3, 7};
+ for (size_t i = 0; i < ex_offsets.size(); ++i) {
+ ASSERT_EQ(ex_offsets[i], result_->offset(i));
+ }

+ for (int i = 0; i < result_->length(); ++i) {
+ ASSERT_EQ(static_cast<bool>(is_null[i]), result_->IsNull(i));
+ }

+ ASSERT_EQ(7, result_->values()->length());
+ Int32Array* varr = static_cast<Int32Array*>(result_->values().get());

+ for (size_t i = 0; i < values.size(); ++i) {
+ ASSERT_EQ(values[i], varr->Value(i));
+ }
+}

+TEST_F(TestListBuilder, TestBasicsNonNullable) {
+}


+TEST_F(TestListBuilder, TestZeroLength) {
+ // All buffers are null
+ Done();
+}


+} // namespace arrow
diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc
new file mode 100644
index 00000000000..f0ff5bf928a
--- /dev/null
+++ b/cpp/src/arrow/types/list.cc
@@ -0,0 +1,31 @@
+// Licensed
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/types/list.h"
+
+#include <sstream>
+#include <string>
+
+namespace arrow {
+
+std::string ListType::ToString() const {
+  std::stringstream s;
+  s << "list<" << value_type->ToString() << ">";
+  return s.str();
+}
+
+} // namespace arrow
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
new file mode 100644
index 00000000000..0f1116257c5
--- /dev/null
+++ b/cpp/src/arrow/types/list.h
@@ -0,0 +1,206 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_TYPES_LIST_H
+#define ARROW_TYPES_LIST_H
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/type.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/primitive.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+
+struct ListType : public DataType {
+  // List can contain any other logical value type
+  TypePtr value_type;
+
+  explicit ListType(const TypePtr& value_type, bool nullable = true)
+      : DataType(TypeEnum::LIST, nullable),
+        value_type(value_type) {}
+
+  static char const *name() {
+    return "list";
+  }
+
+  virtual std::string ToString() const;
+};
+
+
+class ListArray : public Array {
+ public:
+  ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {}
+
+  ListArray(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets,
+      const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) {
+    Init(type, length, offsets, values, nulls);
+  }
+
+  virtual ~ListArray() {}
+
+  void Init(const TypePtr& type, int64_t length, std::shared_ptr<Buffer> offsets,
+      const ArrayPtr& values, std::shared_ptr<Buffer> nulls = nullptr) {
+    offset_buf_ = offsets;
+    offsets_ = offsets == nullptr ? nullptr :
+      reinterpret_cast<const int32_t*>(offset_buf_->data());
+
+    values_ = values;
+    Array::Init(type, length, nulls);
+  }
+
+  // Return a shared pointer in case the requestor desires to share ownership
+  // with this array.
+  const ArrayPtr& values() const { return values_;}
+
+  const int32_t* offsets() const { return offsets_;}
+
+  int32_t offset(int i) const { return offsets_[i];}
+
+  // Neither of these functions performs bounds checking
+  int32_t value_offset(int i) { return offsets_[i];}
+  int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i];}
+
+ protected:
+  std::shared_ptr<Buffer> offset_buf_;
+  const int32_t* offsets_;
+  ArrayPtr values_;
+};
+
+// ----------------------------------------------------------------------
+// Array builder
+
+
+// Builder class for variable-length list array value types
+//
+// To use this class, you must append values to the child array builder and use
+// the Append function to delimit each distinct list value (once the values
+// have been appended to the child array)
+class ListBuilder : public Int32Builder {
+ public:
+  ListBuilder(const TypePtr& type, ArrayBuilder* value_builder)
+      : Int32Builder(type) {
+    value_builder_.reset(value_builder);
+  }
+
+  Status Init(int64_t elements) {
+    // One more than requested.
+    //
+    // XXX: This is slightly imprecise, because we might trigger null mask
+    // resizes that are unnecessary when creating arrays with power-of-two size
+    return Int32Builder::Init(elements + 1);
+  }
+
+  Status Resize(int64_t capacity) {
+    // Need space for the end offset
+    RETURN_NOT_OK(Int32Builder::Resize(capacity + 1));
+
+    // Slight hack, as the "real" capacity is one less
+    --capacity_;
+    return Status::OK();
+  }
+
+  // Vector append
+  //
+  // If passed, null_bytes is of equal length to values, and any nonzero byte
+  // will be considered a null for that slot
+  Status Append(T* values, int64_t length, uint8_t* null_bytes = nullptr) {
+    if (length_ + length > capacity_) {
+      int64_t new_capacity = util::next_power2(length_ + length);
+      RETURN_NOT_OK(Resize(new_capacity));
+    }
+    memcpy(raw_buffer() + length_, values, length * elsize_);
+
+    if (nullable_ && null_bytes != nullptr) {
+      // If null_bytes is all not null, then none of the values are null
+      for (int i = 0; i < length; ++i) {
+        util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i]));
+      }
+    }
+
+    length_ += length;
+    return Status::OK();
+  }
+
+  // Initialize an array type instance with the results of this builder
+  // Transfers ownership of all buffers
+  template <typename Container>
+  Status Transfer(Container* out) {
+    Array* child_values;
+    RETURN_NOT_OK(value_builder_->ToArray(&child_values));
+
+    // Add final offset if the length is non-zero
+    if (length_) {
+      raw_buffer()[length_] = child_values->length();
+    }
+
+    out->Init(type_, length_, values_, ArrayPtr(child_values), nulls_);
+    values_ = nulls_ = nullptr;
+    capacity_ = length_ = 0;
+    return Status::OK();
+  }
+
+  virtual Status ToArray(Array** out) {
+    ListArray* result = new ListArray();
+    RETURN_NOT_OK(Transfer(result));
+    *out = static_cast<Array*>(result);
+    return Status::OK();
+  }
+
+  // Start a new variable-length list slot
+  //
+  // This function should be called before beginning to append elements to the
+  // value builder
+  Status Append(bool is_null = false) {
+    if (length_ == capacity_) {
+      // If the capacity was not already a power of 2, round it up to one here
+      RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
+    }
+    if (nullable_) {
+      util::set_bit(null_bits_, length_, is_null);
+    }
+
+    raw_buffer()[length_++] = value_builder_->length();
+    return Status::OK();
+  }
+
+  // Status Append(int32_t* offsets, int length, uint8_t* null_bytes) {
+  //   return Int32Builder::Append(offsets, length, null_bytes);
+  // }
+
+  Status AppendNull()
{ + return Append(true); + } + + ArrayBuilder* value_builder() const { return value_builder_.get();} + + protected: + std::unique_ptr value_builder_; +}; + + +} // namespace arrow + +#endif // ARROW_TYPES_LIST_H diff --git a/cpp/src/arrow/types/null.h b/cpp/src/arrow/types/null.h new file mode 100644 index 00000000000..c67f752d409 --- /dev/null +++ b/cpp/src/arrow/types/null.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPES_NULL_H +#define ARROW_TYPES_NULL_H + +#include +#include + +#include "arrow/type.h" + +namespace arrow { + +struct NullType : public PrimitiveType { + PRIMITIVE_DECL(NullType, void, NA, 0, "null"); +}; + +} // namespace arrow + +#endif // ARROW_TYPES_NULL_H diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc new file mode 100644 index 00000000000..12968608094 --- /dev/null +++ b/cpp/src/arrow/types/primitive-test.cc @@ -0,0 +1,345 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
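Before the primitive-type tests below, a note on the list layout exercised above: a ListArray stores n+1 int32 offsets for n slots, so slot i spans [offsets[i], offsets[i+1]) within the child values array, and ListBuilder::Append records the child builder's current length as the next offset. A self-contained sketch with hypothetical values; illustrative only, no Arrow headers required:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Lengths {3, 0, 4} over seven child values yield these n+1 offsets,
  // matching the ex_offsets checked in list-test.cc.
  std::vector<int32_t> offsets = {0, 3, 3, 7};
  for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
    std::cout << "slot " << i << ": offset=" << offsets[i]
              << ", length=" << (offsets[i + 1] - offsets[i]) << std::endl;
  }
  return 0;
}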
+ +#include + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/types/boolean.h" +#include "arrow/types/construct.h" +#include "arrow/types/floating.h" +#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" +#include "arrow/types/test-common.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +using std::string; +using std::unique_ptr; +using std::vector; + +namespace arrow { + +TEST(TypesTest, TestBytesType) { + BytesType t1(3); + + ASSERT_EQ(t1.type, LayoutEnum::BYTE); + ASSERT_EQ(t1.size, 3); +} + + +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, TestPrimitive_##ENUM) { \ + KLASS tp; \ + KLASS tp_nn(false); \ + \ + ASSERT_EQ(tp.type, TypeEnum::ENUM); \ + ASSERT_EQ(tp.name(), string(NAME)); \ + ASSERT_TRUE(tp.nullable); \ + ASSERT_FALSE(tp_nn.nullable); \ + \ + KLASS tp_copy = tp_nn; \ + ASSERT_FALSE(tp_copy.nullable); \ + } + +PRIMITIVE_TEST(Int8Type, INT8, "int8"); +PRIMITIVE_TEST(Int16Type, INT16, "int16"); +PRIMITIVE_TEST(Int32Type, INT32, "int32"); +PRIMITIVE_TEST(Int64Type, INT64, "int64"); +PRIMITIVE_TEST(UInt8Type, UINT8, "uint8"); +PRIMITIVE_TEST(UInt16Type, UINT16, "uint16"); +PRIMITIVE_TEST(UInt32Type, UINT32, "uint32"); +PRIMITIVE_TEST(UInt64Type, UINT64, "uint64"); + +PRIMITIVE_TEST(FloatType, FLOAT, "float"); +PRIMITIVE_TEST(DoubleType, DOUBLE, "double"); + +PRIMITIVE_TEST(BooleanType, BOOL, "bool"); + +// ---------------------------------------------------------------------- +// Primitive type tests + +TEST_F(TestBuilder, TestResize) { + builder_->Init(10); + ASSERT_EQ(2, builder_->nulls()->size()); + + builder_->Resize(30); + ASSERT_EQ(4, builder_->nulls()->size()); +} + +template +class TestPrimitiveBuilder : public TestBuilder { + public: + typedef typename Attrs::ArrayType ArrayType; + typedef typename Attrs::BuilderType BuilderType; + typedef typename Attrs::T T; + + void SetUp() { + TestBuilder::SetUp(); + + type_ = Attrs::type(); + type_nn_ = Attrs::type(false); + + ArrayBuilder* tmp; + ASSERT_OK(make_builder(type_, &tmp)); + builder_.reset(static_cast(tmp)); + + ASSERT_OK(make_builder(type_nn_, &tmp)); + builder_nn_.reset(static_cast(tmp)); + } + + void RandomData(int64_t N, double pct_null = 0.1) { + Attrs::draw(N, &draws_); + random_nulls(N, pct_null, &nulls_); + } + + void CheckNullable() { + ArrayType result; + ArrayType expected; + int64_t size = builder_->length(); + + auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), + size * sizeof(T)); + + auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); + + expected.Init(size, ex_data, ex_nulls); + ASSERT_OK(builder_->Transfer(&result)); + + // Builder is now reset + ASSERT_EQ(0, builder_->length()); + ASSERT_EQ(0, builder_->capacity()); + ASSERT_EQ(nullptr, builder_->buffer()); + + ASSERT_TRUE(result.Equals(expected)); + } + + void CheckNonNullable() { + ArrayType result; + ArrayType expected; + int64_t size = builder_nn_->length(); + + auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), + size * sizeof(T)); + + expected.Init(size, ex_data); + ASSERT_OK(builder_nn_->Transfer(&result)); + + // Builder is now reset + ASSERT_EQ(0, builder_nn_->length()); + ASSERT_EQ(0, builder_nn_->capacity()); + ASSERT_EQ(nullptr, builder_nn_->buffer()); + + ASSERT_TRUE(result.Equals(expected)); + } + + protected: + TypePtr type_; + TypePtr type_nn_; + unique_ptr builder_; + unique_ptr 
builder_nn_; + + vector draws_; + vector nulls_; +}; + +#define PTYPE_DECL(CapType, c_type) \ + typedef CapType##Array ArrayType; \ + typedef CapType##Builder BuilderType; \ + typedef CapType##Type Type; \ + typedef c_type T; \ + \ + static TypePtr type(bool nullable = true) { \ + return TypePtr(new Type(nullable)); \ + } + +#define PINT_DECL(CapType, c_type, LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int64_t N, vector* draws) { \ + randint(N, LOWER, UPPER, draws); \ + } \ + } + +PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX); +PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX); +PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX); +PINT_DECL(UInt64, uint64_t, 0, UINT64_MAX); + +PINT_DECL(Int8, int8_t, INT8_MIN, INT8_MAX); +PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX); +PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX); +PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX); + +typedef ::testing::Types Primitives; + +TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); + +#define DECL_T() \ + typedef typename TestFixture::T T; + +#define DECL_ARRAYTYPE() \ + typedef typename TestFixture::ArrayType ArrayType; + + +TYPED_TEST(TestPrimitiveBuilder, TestInit) { + DECL_T(); + + int64_t n = 1000; + ASSERT_OK(this->builder_->Init(n)); + ASSERT_EQ(n, this->builder_->capacity()); + ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size()); + + // unsure if this should go in all builder classes + ASSERT_EQ(0, this->builder_->num_children()); +} + +TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { + int size = 10000; + for (int i = 0; i < size; ++i) { + ASSERT_OK(this->builder_->AppendNull()); + } + + Array* result; + ASSERT_OK(this->builder_->ToArray(&result)); + unique_ptr holder(result); + + for (int i = 0; i < size; ++i) { + ASSERT_TRUE(result->IsNull(i)); + } +} + + +TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { + DECL_T(); + + int size = 10000; + + vector& draws = this->draws_; + vector& nulls = this->nulls_; + + this->RandomData(size); + + int i; + // Append the first 1000 + for (i = 0; i < 1000; ++i) { + ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + ASSERT_OK(this->builder_nn_->Append(draws[i])); + } + + ASSERT_EQ(1000, this->builder_->length()); + ASSERT_EQ(1024, this->builder_->capacity()); + + ASSERT_EQ(1000, this->builder_nn_->length()); + ASSERT_EQ(1024, this->builder_nn_->capacity()); + + // Append the next 9000 + for (i = 1000; i < size; ++i) { + ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0)); + ASSERT_OK(this->builder_nn_->Append(draws[i])); + } + + ASSERT_EQ(size, this->builder_->length()); + ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + + ASSERT_EQ(size, this->builder_nn_->length()); + ASSERT_EQ(util::next_power2(size), this->builder_nn_->capacity()); + + this->CheckNullable(); + this->CheckNonNullable(); +} + + +TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { + DECL_T(); + + int size = 10000; + this->RandomData(size); + + vector& draws = this->draws_; + vector& nulls = this->nulls_; + + // first slug + int K = 1000; + + ASSERT_OK(this->builder_->Append(draws.data(), K, nulls.data())); + ASSERT_OK(this->builder_nn_->Append(draws.data(), K)); + + ASSERT_EQ(1000, this->builder_->length()); + ASSERT_EQ(1024, this->builder_->capacity()); + + ASSERT_EQ(1000, this->builder_nn_->length()); + ASSERT_EQ(1024, this->builder_nn_->capacity()); + + // Append the next 9000 + ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, nulls.data() + K)); + ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - 
K)); + + ASSERT_EQ(size, this->builder_->length()); + ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + + this->CheckNullable(); + this->CheckNonNullable(); +} + +TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { + int n = 1000; + ASSERT_OK(this->builder_->Init(n)); + + ASSERT_OK(this->builder_->Advance(100)); + ASSERT_EQ(100, this->builder_->length()); + + ASSERT_OK(this->builder_->Advance(900)); + ASSERT_RAISES(Invalid, this->builder_->Advance(1)); +} + +TYPED_TEST(TestPrimitiveBuilder, TestResize) { + DECL_T(); + + int cap = MIN_BUILDER_CAPACITY * 2; + + ASSERT_OK(this->builder_->Resize(cap)); + ASSERT_EQ(cap, this->builder_->capacity()); + + ASSERT_EQ(cap * sizeof(T), this->builder_->buffer()->size()); + ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->nulls()->size()); +} + +TYPED_TEST(TestPrimitiveBuilder, TestReserve) { + int n = 100; + ASSERT_OK(this->builder_->Reserve(n)); + ASSERT_EQ(0, this->builder_->length()); + ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity()); + + ASSERT_OK(this->builder_->Advance(100)); + ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); + + ASSERT_EQ(util::next_power2(MIN_BUILDER_CAPACITY + 100), + this->builder_->capacity()); +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc new file mode 100644 index 00000000000..2612e8ca7fd --- /dev/null +++ b/cpp/src/arrow/types/primitive.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/primitive.h" + +#include + +#include "arrow/util/buffer.h" + +namespace arrow { + +// ---------------------------------------------------------------------- +// Primitive array base + +void PrimitiveArray::Init(const TypePtr& type, int64_t length, + const std::shared_ptr& data, + const std::shared_ptr& nulls) { + Array::Init(type, length, nulls); + data_ = data; + raw_data_ = data == nullptr? nullptr : data_->data(); +} + +bool PrimitiveArray::Equals(const PrimitiveArray& other) const { + if (this == &other) return true; + if (type_->nullable != other.type_->nullable) return false; + + bool equal_data = data_->Equals(*other.data_, length_); + if (type_->nullable) { + return equal_data && + nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8); + } else { + return equal_data; + } +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h new file mode 100644 index 00000000000..a41911224e0 --- /dev/null +++ b/cpp/src/arrow/types/primitive.h @@ -0,0 +1,240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_TYPES_PRIMITIVE_H
+#define ARROW_TYPES_PRIMITIVE_H
+
+#include <cstdint>
+#include <cstring>
+#include <string>
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/type.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+
+template <typename Derived>
+struct PrimitiveType : public DataType {
+  explicit PrimitiveType(bool nullable = true)
+      : DataType(Derived::type_enum, nullable) {}
+
+  virtual std::string ToString() const {
+    return std::string(static_cast<const Derived*>(this)->name());
+  }
+};
+
+#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME)  \
+  typedef C_TYPE c_type;                                    \
+  static constexpr TypeEnum type_enum = TypeEnum::ENUM;     \
+  static constexpr int size = SIZE;                         \
+                                                            \
+  explicit TYPENAME(bool nullable = true)                   \
+      : PrimitiveType<TYPENAME>(nullable) {}                \
+                                                            \
+  static const char* name() {                               \
+    return NAME;                                            \
+  }
+
+
+// Base class for fixed-size logical types
+class PrimitiveArray : public Array {
+ public:
+  PrimitiveArray() : Array(), data_(nullptr), raw_data_(nullptr) {}
+
+  virtual ~PrimitiveArray() {}
+
+  void Init(const TypePtr& type, int64_t length, const std::shared_ptr<Buffer>& data,
+      const std::shared_ptr<Buffer>& nulls = nullptr);
+
+  const std::shared_ptr<Buffer>& data() const { return data_;}
+
+  bool Equals(const PrimitiveArray& other) const;
+
+ protected:
+  std::shared_ptr<Buffer> data_;
+  const uint8_t* raw_data_;
+};
+
+
+template <typename TypeClass>
+class PrimitiveArrayImpl : public PrimitiveArray {
+ public:
+  typedef typename TypeClass::c_type T;
+
+  PrimitiveArrayImpl() : PrimitiveArray() {}
+
+  PrimitiveArrayImpl(int64_t length, const std::shared_ptr<Buffer>& data,
+      const std::shared_ptr<Buffer>& nulls = nullptr) {
+    Init(length, data, nulls);
+  }
+
+  void Init(int64_t length, const std::shared_ptr<Buffer>& data,
+      const std::shared_ptr<Buffer>& nulls = nullptr) {
+    TypePtr type(new TypeClass(nulls != nullptr));
+    PrimitiveArray::Init(type, length, data, nulls);
+  }
+
+  bool Equals(const PrimitiveArrayImpl& other) const {
+    return PrimitiveArray::Equals(*static_cast<const PrimitiveArray*>(&other));
+  }
+
+  const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_);}
+
+  T Value(int64_t i) const {
+    return raw_data()[i];
+  }
+
+  TypeClass* exact_type() const {
+    return static_cast<TypeClass*>(type_.get());
+  }
+};
+
+
+template <typename Type, typename ArrayType>
+class PrimitiveBuilder : public ArrayBuilder {
+ public:
+  typedef typename Type::c_type T;
+
+  explicit PrimitiveBuilder(const TypePtr& type)
+      : ArrayBuilder(type), values_(nullptr) {
+    elsize_ = sizeof(T);
+  }
+
+  virtual ~PrimitiveBuilder() {}
+
+  Status Resize(int64_t capacity) {
+    // XXX: Set floor size for now
+    if (capacity < MIN_BUILDER_CAPACITY) {
+      capacity = MIN_BUILDER_CAPACITY;
+    }
+
+    if (capacity_ == 0) {
+      RETURN_NOT_OK(Init(capacity));
+    } else {
+      RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
+      RETURN_NOT_OK(values_->Resize(capacity * elsize_));
+      capacity_ = capacity;
+    }
+    return Status::OK();
+  }
+
+  Status Init(int64_t capacity) {
+    RETURN_NOT_OK(ArrayBuilder::Init(capacity));
+
+    values_ = std::make_shared<OwnedMutableBuffer>();
+    return values_->Resize(capacity * elsize_);
+  }
+
+  Status Reserve(int64_t elements) {
+    if (length_ + elements > capacity_) {
+      int64_t new_capacity = util::next_power2(length_ + elements);
+      return Resize(new_capacity);
+    }
+    return Status::OK();
+  }
+
+  Status Advance(int64_t elements) {
+    return ArrayBuilder::Advance(elements);
+  }
+
+  // Scalar append
+  Status Append(T val, bool is_null = false) {
+    if (length_ == capacity_) {
+      // If the capacity was not already a power of 2, round it up to one here
+      RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
+    }
+    if (nullable_) {
+      util::set_bit(null_bits_, length_, is_null);
+    }
+    raw_buffer()[length_++] = val;
+    return Status::OK();
+  }
+
+  // Vector append
+  //
+  // If passed, null_bytes is of equal length to values, and any nonzero byte
+  // will be considered a null for that slot
+  Status Append(const T* values, int64_t length, uint8_t* null_bytes = nullptr) {
+    if (length_ + length > capacity_) {
+      int64_t new_capacity = util::next_power2(length_ + length);
+      RETURN_NOT_OK(Resize(new_capacity));
+    }
+    memcpy(raw_buffer() + length_, values, length * elsize_);
+
+    if (nullable_ && null_bytes != nullptr) {
+      // If null_bytes is all not null, then none of the values are null
+      for (int64_t i = 0; i < length; ++i) {
+        util::set_bit(null_bits_, length_ + i, static_cast<bool>(null_bytes[i]));
+      }
+    }
+
+    length_ += length;
+    return Status::OK();
+  }
+
+  Status AppendNull() {
+    if (!nullable_) {
+      return Status::Invalid("not nullable");
+    }
+    if (length_ == capacity_) {
+      // If the capacity was not already a power of 2, round it up to one here
+      RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
+    }
+    util::set_bit(null_bits_, length_++, true);
+    return Status::OK();
+  }
+
+  // Initialize an array type instance with the results of this builder
+  // Transfers ownership of all buffers
+  Status Transfer(PrimitiveArray* out) {
+    out->Init(type_, length_, values_, nulls_);
+    values_ = nulls_ = nullptr;
+    capacity_ = length_ = 0;
+    return Status::OK();
+  }
+
+  Status Transfer(ArrayType* out) {
+    return Transfer(static_cast<PrimitiveArray*>(out));
+  }
+
+  virtual Status ToArray(Array** out) {
+    ArrayType* result = new ArrayType();
+    RETURN_NOT_OK(Transfer(result));
+    *out = static_cast<Array*>(result);
+    return Status::OK();
+  }
+
+  T* raw_buffer() {
+    return reinterpret_cast<T*>(values_->mutable_data());
+  }
+
+  std::shared_ptr<Buffer> buffer() const {
+    return values_;
+  }
+
+ protected:
+  std::shared_ptr<OwnedMutableBuffer> values_;
+  int64_t elsize_;
+};
+
+} // namespace arrow
+
+#endif  // ARROW_TYPES_PRIMITIVE_H
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
new file mode 100644
index 00000000000..6dba3fdcbb6
--- /dev/null
+++ b/cpp/src/arrow/types/string-test.cc
@@ -0,0 +1,242 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/types/construct.h" +#include "arrow/types/integer.h" +#include "arrow/types/string.h" +#include "arrow/types/test-common.h" +#include "arrow/util/status.h" + +using std::string; +using std::unique_ptr; +using std::vector; + +namespace arrow { + + +TEST(TypesTest, TestCharType) { + CharType t1(5); + + ASSERT_EQ(t1.type, TypeEnum::CHAR); + ASSERT_TRUE(t1.nullable); + ASSERT_EQ(t1.size, 5); + + ASSERT_EQ(t1.ToString(), string("char(5)")); + + // Test copy constructor + CharType t2 = t1; + ASSERT_EQ(t2.type, TypeEnum::CHAR); + ASSERT_TRUE(t2.nullable); + ASSERT_EQ(t2.size, 5); +} + + +TEST(TypesTest, TestVarcharType) { + VarcharType t1(5); + + ASSERT_EQ(t1.type, TypeEnum::VARCHAR); + ASSERT_TRUE(t1.nullable); + ASSERT_EQ(t1.size, 5); + ASSERT_EQ(t1.physical_type.size, 6); + + ASSERT_EQ(t1.ToString(), string("varchar(5)")); + + // Test copy constructor + VarcharType t2 = t1; + ASSERT_EQ(t2.type, TypeEnum::VARCHAR); + ASSERT_TRUE(t2.nullable); + ASSERT_EQ(t2.size, 5); + ASSERT_EQ(t2.physical_type.size, 6); +} + +TEST(TypesTest, TestStringType) { + StringType str; + StringType str_nn(false); + + ASSERT_EQ(str.type, TypeEnum::STRING); + ASSERT_EQ(str.name(), string("string")); + ASSERT_TRUE(str.nullable); + ASSERT_FALSE(str_nn.nullable); +} + +// ---------------------------------------------------------------------- +// String container + +class TestStringContainer : public ::testing::Test { + public: + void SetUp() { + chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; + offsets_ = {0, 1, 1, 1, 3, 6}; + nulls_ = {0, 0, 1, 0, 0}; + expected_ = {"a", "", "", "bb", "ccc"}; + + MakeArray(); + } + + void MakeArray() { + length_ = offsets_.size() - 1; + int64_t nchars = chars_.size(); + + value_buf_ = to_buffer(chars_); + values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); + + offsets_buf_ = to_buffer(offsets_); + + nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); + strings_.Init(length_, offsets_buf_, values_, nulls_buf_); + } + + protected: + vector offsets_; + vector chars_; + vector nulls_; + + vector expected_; + + std::shared_ptr value_buf_; + std::shared_ptr offsets_buf_; + std::shared_ptr nulls_buf_; + + int64_t length_; + + ArrayPtr values_; + StringArray strings_; +}; + + +TEST_F(TestStringContainer, TestArrayBasics) { + ASSERT_EQ(length_, strings_.length()); + ASSERT_TRUE(strings_.nullable()); +} + +TEST_F(TestStringContainer, TestType) { + TypePtr type = strings_.type(); + + ASSERT_EQ(TypeEnum::STRING, type->type); + ASSERT_EQ(TypeEnum::STRING, strings_.type_enum()); +} + + +TEST_F(TestStringContainer, TestListFunctions) { + int pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_.value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_.value_length(i)); + pos += expected_[i].size(); + } +} + + +TEST_F(TestStringContainer, TestDestructor) { + auto arr = std::make_shared(length_, offsets_buf_, values_, nulls_buf_); +} + +TEST_F(TestStringContainer, TestGetString) { + for (size_t i = 0; i < expected_.size(); ++i) { + if (nulls_[i]) { + ASSERT_TRUE(strings_.IsNull(i)); + } else { + ASSERT_EQ(expected_[i], strings_.GetString(i)); + } + } +} + +// ---------------------------------------------------------------------- +// String builder tests + +class TestStringBuilder : 
public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + type_ = TypePtr(new StringType()); + + ArrayBuilder* tmp; + ASSERT_OK(make_builder(type_, &tmp)); + builder_.reset(static_cast(tmp)); + } + + void Done() { + Array* out; + ASSERT_OK(builder_->ToArray(&out)); + result_.reset(static_cast(out)); + } + + protected: + TypePtr type_; + + unique_ptr builder_; + unique_ptr result_; +}; + +TEST_F(TestStringBuilder, TestAttrs) { + ASSERT_FALSE(builder_->value_builder()->nullable()); +} + +TEST_F(TestStringBuilder, TestScalarAppend) { + vector strings = {"a", "bb", "", "", "ccc"}; + vector is_null = {0, 0, 0, 1, 0}; + + int N = strings.size(); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder_->AppendNull(); + } else { + builder_->Append(strings[i]); + } + } + } + Done(); + + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * 6, result_->values()->length()); + + int64_t length; + int64_t pos = 0; + for (int i = 0; i < N * reps; ++i) { + if (is_null[i % N]) { + ASSERT_TRUE(result_->IsNull(i)); + } else { + ASSERT_FALSE(result_->IsNull(i)); + result_->GetValue(i, &length); + ASSERT_EQ(pos, result_->offset(i)); + ASSERT_EQ(strings[i % N].size(), length); + ASSERT_EQ(strings[i % N], result_->GetString(i)); + + pos += length; + } + } +} + +TEST_F(TestStringBuilder, TestZeroLength) { + // All buffers are null + Done(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc new file mode 100644 index 00000000000..f3dfbdc50f7 --- /dev/null +++ b/cpp/src/arrow/types/string.cc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/string.h" + +#include +#include + +namespace arrow { + +std::string CharType::ToString() const { + std::stringstream s; + s << "char(" << size << ")"; + return s.str(); +} + + +std::string VarcharType::ToString() const { + std::stringstream s; + s << "varchar(" << size << ")"; + return s.str(); +} + +TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type(false)); + +} // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h new file mode 100644 index 00000000000..30d6e247db1 --- /dev/null +++ b/cpp/src/arrow/types/string.h @@ -0,0 +1,181 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPES_STRING_H +#define ARROW_TYPES_STRING_H + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/types/integer.h" +#include "arrow/types/list.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +class ArrayBuilder; + +struct CharType : public DataType { + int size; + + BytesType physical_type; + + explicit CharType(int size, bool nullable = true) + : DataType(TypeEnum::CHAR, nullable), + size(size), + physical_type(BytesType(size)) {} + + CharType(const CharType& other) + : CharType(other.size, other.nullable) {} + + virtual std::string ToString() const; +}; + + +// Variable-length, null-terminated strings, up to a certain length +struct VarcharType : public DataType { + int size; + + BytesType physical_type; + + explicit VarcharType(int size, bool nullable = true) + : DataType(TypeEnum::VARCHAR, nullable), + size(size), + physical_type(BytesType(size + 1)) {} + VarcharType(const VarcharType& other) + : VarcharType(other.size, other.nullable) {} + + virtual std::string ToString() const; +}; + +static const LayoutPtr byte1(new BytesType(1)); +static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); + +// String is a logical type consisting of a physical list of 1-byte values +struct StringType : public DataType { + explicit StringType(bool nullable = true) + : DataType(TypeEnum::STRING, nullable) {} + + StringType(const StringType& other) + : StringType(other.nullable) {} + + const LayoutPtr& physical_type() { + return physical_string; + } + + static char const *name() { + return "string"; + } + + virtual std::string ToString() const { + return name(); + } +}; + + +// TODO: add a BinaryArray layer in between +class StringArray : public ListArray { + public: + StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} + + StringArray(int64_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, + const std::shared_ptr& nulls = nullptr) { + Init(length, offsets, values, nulls); + } + + void Init(const TypePtr& type, int64_t length, + const std::shared_ptr& offsets, + const ArrayPtr& values, + const std::shared_ptr& nulls = nullptr) { + ListArray::Init(type, length, offsets, values, nulls); + + // TODO: type validation for values array + + // For convenience + bytes_ = static_cast(values.get()); + raw_bytes_ = bytes_->raw_data(); + } + + void Init(int64_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, + const std::shared_ptr& nulls = nullptr) { + TypePtr type(new StringType(nulls != nullptr)); + Init(type, length, offsets, values, nulls); + } + + // Compute the pointer t + const uint8_t* GetValue(int64_t i, int64_t* out_length) const { + int32_t pos = offsets_[i]; + *out_length = offsets_[i + 1] - pos; + return raw_bytes_ + pos; + } + + // Construct a std::string + std::string GetString(int64_t i) const { + int64_t nchars; + const uint8_t* str = GetValue(i, &nchars); + return std::string(reinterpret_cast(str), nchars); + } + + private: + UInt8Array* bytes_; + const uint8_t* raw_bytes_; +}; + +// Array 
builder + + + +class StringBuilder : public ListBuilder { + public: + explicit StringBuilder(const TypePtr& type) : + ListBuilder(type, static_cast(new UInt8Builder(value_type_))) { + byte_builder_ = static_cast(value_builder_.get()); + } + + Status Append(const std::string& value) { + RETURN_NOT_OK(ListBuilder::Append()); + return byte_builder_->Append(reinterpret_cast(value.c_str()), + value.size()); + } + + Status Append(const uint8_t* value, int64_t length); + Status Append(const std::vector& values, + uint8_t* null_bytes); + + virtual Status ToArray(Array** out) { + StringArray* result = new StringArray(); + RETURN_NOT_OK(ListBuilder::Transfer(result)); + *out = static_cast(result); + return Status::OK(); + } + + protected: + UInt8Builder* byte_builder_; + + static TypePtr value_type_; +}; + +} // namespace arrow + +#endif // ARROW_TYPES_STRING_H diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc new file mode 100644 index 00000000000..644b5457d58 --- /dev/null +++ b/cpp/src/arrow/types/struct-test.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include +#include + +#include "arrow/field.h" +#include "arrow/type.h" +#include "arrow/types/integer.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" + +using std::string; +using std::vector; + +namespace arrow { + +TEST(TestStructType, Basics) { + TypePtr f0_type = TypePtr(new Int32Type()); + Field f0("f0", f0_type); + + TypePtr f1_type = TypePtr(new StringType()); + Field f1("f1", f1_type); + + TypePtr f2_type = TypePtr(new UInt8Type()); + Field f2("f2", f2_type); + + vector fields = {f0, f1, f2}; + + StructType struct_type(fields, true); + StructType struct_type_nn(fields, false); + + ASSERT_TRUE(struct_type.nullable); + ASSERT_FALSE(struct_type_nn.nullable); + + ASSERT_TRUE(struct_type.field(0).Equals(f0)); + ASSERT_TRUE(struct_type.field(1).Equals(f1)); + ASSERT_TRUE(struct_type.field(2).Equals(f2)); + + ASSERT_EQ(struct_type.ToString(), "struct"); + + // TODO: out of bounds for field(...) +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc new file mode 100644 index 00000000000..b7be5d8245f --- /dev/null +++ b/cpp/src/arrow/types/struct.cc @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/struct.h" + +#include +#include +#include + +namespace arrow { + +std::string StructType::ToString() const { + std::stringstream s; + s << "struct<"; + for (size_t i = 0; i < fields_.size(); ++i) { + if (i > 0) s << ", "; + const Field& field = fields_[i]; + s << field.name << ": " << field.type->ToString(); + } + s << ">"; + return s.str(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h new file mode 100644 index 00000000000..7d8885b830d --- /dev/null +++ b/cpp/src/arrow/types/struct.h @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPES_STRUCT_H +#define ARROW_TYPES_STRUCT_H + +#include +#include + +#include "arrow/field.h" +#include "arrow/type.h" + +namespace arrow { + +struct StructType : public DataType { + std::vector fields_; + + StructType(const std::vector& fields, + bool nullable = true) + : DataType(TypeEnum::STRUCT, nullable) { + fields_ = fields; + } + + const Field& field(int i) const { + return fields_[i]; + } + + int num_children() const { + return fields_.size(); + } + + virtual std::string ToString() const; +}; + +} // namespace arrow + +#endif // ARROW_TYPES_STRUCT_H diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h new file mode 100644 index 00000000000..267e48a7f25 --- /dev/null +++ b/cpp/src/arrow/types/test-common.h @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
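For context on StructType::ToString above, a short sketch of the rendering it produces; illustrative only, reusing the two-argument Field constructor seen in struct-test.cc:

#include <iostream>
#include <vector>

#include "arrow/field.h"
#include "arrow/type.h"
#include "arrow/types/integer.h"
#include "arrow/types/string.h"
#include "arrow/types/struct.h"

int main() {
  using namespace arrow;
  std::vector<Field> fields = {Field("f0", TypePtr(new Int32Type())),
                               Field("f1", TypePtr(new StringType()))};
  StructType type(fields);
  // Expected: struct<f0: int32, f1: string>
  std::cout << type.ToString() << std::endl;
  return 0;
}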
+ +#ifndef ARROW_TYPES_TEST_COMMON_H +#define ARROW_TYPES_TEST_COMMON_H + +#include +#include +#include +#include + +#include "arrow/test-util.h" +#include "arrow/type.h" + +using std::unique_ptr; + +namespace arrow { + +class TestBuilder : public ::testing::Test { + public: + void SetUp() { + type_ = TypePtr(new UInt8Type()); + type_nn_ = TypePtr(new UInt8Type(false)); + builder_.reset(new UInt8Builder(type_)); + builder_nn_.reset(new UInt8Builder(type_nn_)); + } + protected: + TypePtr type_; + TypePtr type_nn_; + unique_ptr builder_; + unique_ptr builder_nn_; +}; + +} // namespace arrow + +#endif // ARROW_TYPES_TEST_COMMON_H diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc new file mode 100644 index 00000000000..54f41a7eef6 --- /dev/null +++ b/cpp/src/arrow/types/union.cc @@ -0,0 +1,49 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/union.h" + +#include +#include +#include + +#include "arrow/type.h" + +namespace arrow { + +static inline std::string format_union(const std::vector& child_types) { + std::stringstream s; + s << "union<"; + for (size_t i = 0; i < child_types.size(); ++i) { + if (i) s << ", "; + s << child_types[i]->ToString(); + } + s << ">"; + return s.str(); +} + +std::string DenseUnionType::ToString() const { + return format_union(child_types_); +} + + +std::string SparseUnionType::ToString() const { + return format_union(child_types_); +} + + +} // namespace arrow diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h new file mode 100644 index 00000000000..7b66c3b88bf --- /dev/null +++ b/cpp/src/arrow/types/union.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
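The format_union helper above drives the rendering for both union flavors; a minimal usage sketch, illustrative only and assuming the constructors declared in union.h below:

#include <iostream>
#include <vector>

#include "arrow/type.h"
#include "arrow/types/integer.h"
#include "arrow/types/string.h"
#include "arrow/types/union.h"

int main() {
  using namespace arrow;
  std::vector<TypePtr> children = {TypePtr(new Int32Type()),
                                   TypePtr(new StringType())};
  DenseUnionType dense(children);
  SparseUnionType sparse(children);
  // Both are expected to print: union<int32, string>
  std::cout << dense.ToString() << "\n" << sparse.ToString() << std::endl;
  return 0;
}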
+ +#ifndef ARROW_TYPES_UNION_H +#define ARROW_TYPES_UNION_H + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/type.h" +#include "arrow/types/collection.h" + +namespace arrow { + +class Buffer; + +struct DenseUnionType : public CollectionType { + typedef CollectionType Base; + + DenseUnionType(const std::vector& child_types, + bool nullable = true) + : Base(nullable) { + child_types_ = child_types; + } + + virtual std::string ToString() const; +}; + + +struct SparseUnionType : public CollectionType { + typedef CollectionType Base; + + SparseUnionType(const std::vector& child_types, + bool nullable = true) + : Base(nullable) { + child_types_ = child_types; + } + + virtual std::string ToString() const; +}; + + +class UnionArray : public Array { + public: + UnionArray() : Array() {} + + protected: + // The data are types encoded as int16 + Buffer* types_; + std::vector > children_; +}; + + +class DenseUnionArray : public UnionArray { + public: + DenseUnionArray() : UnionArray() {} + + protected: + Buffer* offset_buf_; +}; + + +class SparseUnionArray : public UnionArray { + public: + SparseUnionArray() : UnionArray() {} +}; + +} // namespace arrow + +#endif // ARROW_TYPES_UNION_H diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt new file mode 100644 index 00000000000..88e3f7a656d --- /dev/null +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -0,0 +1,81 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
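An aside on the two union layouts sketched above: a dense union carries one int16 type id per slot plus an int32 offset into the selected child (the offset_buf_ of DenseUnionArray), while a sparse union keeps every child the same length as the parent and needs no offsets. The following is a schematic with hypothetical values, not Arrow API; illustrative only:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Slots drawn alternately from child 0 (e.g. int32) and child 1 (e.g. string).
  std::vector<int16_t> type_ids = {0, 1, 1, 0};
  // Dense form only: position of each slot within its chosen child.
  std::vector<int32_t> dense_offsets = {0, 0, 1, 1};
  for (std::size_t i = 0; i < type_ids.size(); ++i) {
    std::cout << "slot " << i << " -> child " << type_ids[i]
              << ", offset " << dense_offsets[i] << std::endl;
  }
  return 0;
}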
+ +####################################### +# arrow_util +####################################### + +set(UTIL_SRCS + bit-util.cc + buffer.cc + status.cc +) + +set(UTIL_LIBS + rt) + +add_library(arrow_util STATIC + ${UTIL_SRCS} +) +target_link_libraries(arrow_util ${UTIL_LIBS}) +SET_TARGET_PROPERTIES(arrow_util PROPERTIES LINKER_LANGUAGE CXX) + +# Headers: top level +install(FILES + bit-util.h + buffer.h + macros.h + status.h + DESTINATION include/arrow/util) + +####################################### +# arrow_test_util +####################################### + +add_library(arrow_test_util) +target_link_libraries(arrow_test_util + arrow_util) + +SET_TARGET_PROPERTIES(arrow_test_util PROPERTIES LINKER_LANGUAGE CXX) + +####################################### +# arrow_test_main +####################################### + +add_library(arrow_test_main + test_main.cc) + +if (APPLE) + target_link_libraries(arrow_test_main + gtest + arrow_util + arrow_test_util + dl) + set_target_properties(arrow_test_main + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +else() + target_link_libraries(arrow_test_main + gtest + arrow_util + arrow_test_util + pthread + dl + ) +endif() + +ADD_ARROW_TEST(bit-util-test) +ADD_ARROW_TEST(buffer-test) diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc new file mode 100644 index 00000000000..7506ca5b553 --- /dev/null +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/util/bit-util.h" + +namespace arrow { + +TEST(UtilTests, TestNextPower2) { + using util::next_power2; + + ASSERT_EQ(8, next_power2(6)); + ASSERT_EQ(8, next_power2(8)); + + ASSERT_EQ(1, next_power2(1)); + ASSERT_EQ(256, next_power2(131)); + + ASSERT_EQ(1024, next_power2(1000)); + + ASSERT_EQ(4096, next_power2(4000)); + + ASSERT_EQ(65536, next_power2(64000)); + + ASSERT_EQ(1LL << 32, next_power2((1LL << 32) - 1)); + ASSERT_EQ(1LL << 31, next_power2((1LL << 31) - 1)); + ASSERT_EQ(1LL << 62, next_power2((1LL << 62) - 1)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc new file mode 100644 index 00000000000..d2ddd6584a8 --- /dev/null +++ b/cpp/src/arrow/util/bit-util.cc @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { + for (int i = 0; i < length; ++i) { + set_bit(bits, i, static_cast(bytes[i])); + } +} + +Status util::bytes_to_bits(uint8_t* bytes, int length, + std::shared_ptr* out) { + int bit_length = ceil_byte(length) / 8; + + auto buffer = std::make_shared(); + RETURN_NOT_OK(buffer->Resize(bit_length)); + memset(buffer->mutable_data(), 0, bit_length); + bytes_to_bits(bytes, length, buffer->mutable_data()); + + *out = buffer; + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h new file mode 100644 index 00000000000..61dffa30423 --- /dev/null +++ b/cpp/src/arrow/util/bit-util.h @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_BIT_UTIL_H +#define ARROW_UTIL_BIT_UTIL_H + +#include +#include +#include + +#include "arrow/util/buffer.h" + +namespace arrow { + +class Status; + +namespace util { + +static inline int64_t ceil_byte(int64_t size) { + return (size + 7) & ~7; +} + +static inline int64_t ceil_2bytes(int64_t size) { + return (size + 15) & ~15; +} + +static inline bool get_bit(const uint8_t* bits, int i) { + return bits[i / 8] & (1 << (i % 8)); +} + +static inline void set_bit(uint8_t* bits, int i, bool is_set) { + bits[i / 8] |= (1 << (i % 8)) * is_set; +} + +static inline int64_t next_power2(int64_t n) { + n--; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + n |= n >> 32; + n++; + return n; +} + +void bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits); +Status bytes_to_bits(uint8_t*, int, std::shared_ptr*); + +} // namespace util + +} // namespace arrow + +#endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc new file mode 100644 index 00000000000..edfd08e850b --- /dev/null +++ b/cpp/src/arrow/util/buffer-test.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "arrow/test-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +using std::string; + +namespace arrow { + +class TestBuffer : public ::testing::Test { +}; + +TEST_F(TestBuffer, Resize) { + OwnedMutableBuffer buf; + + ASSERT_EQ(0, buf.size()); + ASSERT_OK(buf.Resize(100)); + ASSERT_EQ(100, buf.size()); + ASSERT_OK(buf.Resize(200)); + ASSERT_EQ(200, buf.size()); + + // Make it smaller, too + ASSERT_OK(buf.Resize(50)); + ASSERT_EQ(50, buf.size()); +} + +TEST_F(TestBuffer, ResizeOOM) { + // realloc fails, even though there may be no explicit limit + OwnedMutableBuffer buf; + ASSERT_OK(buf.Resize(100)); + int64_t to_alloc = std::numeric_limits::max(); + ASSERT_RAISES(OutOfMemory, buf.Resize(to_alloc)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc new file mode 100644 index 00000000000..2fb34d59e0b --- /dev/null +++ b/cpp/src/arrow/util/buffer.cc @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/util/buffer.h" + +#include + +#include "arrow/util/status.h" + +namespace arrow { + +Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, + int64_t size) { + data_ = parent->data() + offset; + size_ = size; + parent_ = parent; +} + +std::shared_ptr MutableBuffer::GetImmutableView() { + return std::make_shared(this->get_shared_ptr(), 0, size()); +} + +OwnedMutableBuffer::OwnedMutableBuffer() : + MutableBuffer(nullptr, 0) {} + +Status OwnedMutableBuffer::Resize(int64_t new_size) { + size_ = new_size; + try { + buffer_owner_.resize(new_size); + } catch (const std::bad_alloc& e) { + return Status::OutOfMemory("resize failed"); + } + data_ = buffer_owner_.data(); + mutable_data_ = buffer_owner_.data(); + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h new file mode 100644 index 00000000000..3e4183936b3 --- /dev/null +++ b/cpp/src/arrow/util/buffer.h @@ -0,0 +1,133 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_BUFFER_H +#define ARROW_UTIL_BUFFER_H + +#include +#include +#include +#include +#include + +#include "arrow/util/macros.h" + +namespace arrow { + +class Status; + +// ---------------------------------------------------------------------- +// Buffer classes + +// Immutable API for a chunk of bytes which may or may not be owned by the +// class instance +class Buffer : public std::enable_shared_from_this { + public: + Buffer(const uint8_t* data, int64_t size) : + data_(data), + size_(size) {} + + // An offset into data that is owned by another buffer, but we want to be + // able to retain a valid pointer to it even after other shared_ptr's to the + // parent buffer have been destroyed + Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size); + + std::shared_ptr get_shared_ptr() { + return shared_from_this(); + } + + // Return true if both buffers are the same size and contain the same bytes + // up to the number of compared bytes + bool Equals(const Buffer& other, int64_t nbytes) const { + return this == &other || + (size_ >= nbytes && other.size_ >= nbytes && + !memcmp(data_, other.data_, nbytes)); + } + + bool Equals(const Buffer& other) const { + return this == &other || + (size_ == other.size_ && !memcmp(data_, other.data_, size_)); + } + + const uint8_t* data() const { + return data_; + } + + int64_t size() const { + return size_; + } + + // Returns true if this Buffer is referencing memory (possibly) owned by some + // other buffer + bool is_shared() const { + return static_cast(parent_); + } + + const std::shared_ptr parent() const { + return parent_; + } + + protected: + const uint8_t* data_; + int64_t size_; + + // nullptr by default, but may be set + std::shared_ptr parent_; + + private: + DISALLOW_COPY_AND_ASSIGN(Buffer); +}; + +// A Buffer whose contents can be mutated. May or may not own its data. +class MutableBuffer : public Buffer { + public: + MutableBuffer(uint8_t* data, int64_t size) : + Buffer(data, size) { + mutable_data_ = data; + } + + uint8_t* mutable_data() { + return mutable_data_; + } + + // Get a read-only view of this buffer + std::shared_ptr GetImmutableView(); + + protected: + MutableBuffer() : + Buffer(nullptr, 0), + mutable_data_(nullptr) {} + + uint8_t* mutable_data_; +}; + +// A MutableBuffer whose memory is owned by the class instance. 
For example, +// for reading data out of files that you want to deallocate when this class is +// garbage-collected +class OwnedMutableBuffer : public MutableBuffer { + public: + OwnedMutableBuffer(); + Status Resize(int64_t new_size); + + private: + // TODO: aligned allocations + std::vector buffer_owner_; +}; + +} // namespace arrow + +#endif // ARROW_UTIL_BUFFER_H diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h new file mode 100644 index 00000000000..069e627c90e --- /dev/null +++ b/cpp/src/arrow/util/macros.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_MACROS_H +#define ARROW_UTIL_MACROS_H + +// From Google gutil +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ + void operator=(const TypeName&) = delete + +#endif // ARROW_UTIL_MACROS_H diff --git a/cpp/src/arrow/util/random.h b/cpp/src/arrow/util/random.h new file mode 100644 index 00000000000..64c197ef080 --- /dev/null +++ b/cpp/src/arrow/util/random.h @@ -0,0 +1,128 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// Moved from Kudu http://github.com/cloudera/kudu + +#ifndef ARROW_UTIL_RANDOM_H_ +#define ARROW_UTIL_RANDOM_H_ + +#include + +#include + +namespace arrow { + +namespace random_internal { + +static const uint32_t M = 2147483647L; // 2^31-1 +const double kTwoPi = 6.283185307179586476925286; + +} // namespace random_internal + +// A very simple random number generator. Not especially good at +// generating truly random bits, but good enough for our needs in this +// package. This implementation is not thread-safe. +class Random { + public: + explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { + // Avoid bad seeds. + if (seed_ == 0 || seed_ == random_internal::M) { + seed_ = 1; + } + } + + // Next pseudo-random 32-bit unsigned integer. + // FIXME: This currently only generates 31 bits of randomness. + // The MSB will always be zero. + uint32_t Next() { + static const uint64_t A = 16807; // bits 14, 8, 7, 5, 2, 1, 0 + // We are computing + // seed_ = (seed_ * A) % M, where M = 2^31-1 + // + // seed_ must not be zero or M, or else all subsequent computed values + // will be zero or M respectively. For all other values, seed_ will end + // up cycling through every number in [1,M-1] + uint64_t product = seed_ * A; + + // Compute (product % M) using the fact that ((x << 31) % M) == x. + seed_ = static_cast((product >> 31) + (product & random_internal::M)); + // The first reduction may overflow by 1 bit, so we may need to + // repeat. 
mod == M is not possible; using > allows the faster + // sign-bit-based test. + if (seed_ > random_internal::M) { + seed_ -= random_internal::M; + } + return seed_; + } + + // Alias for consistency with Next64 + uint32_t Next32() { return Next(); } + + // Next pseudo-random 64-bit unsigned integer. + // FIXME: This currently only generates 62 bits of randomness due to Next() + // only giving 31 bits of randomness. The 2 most significant bits will always + // be zero. + uint64_t Next64() { + uint64_t large = Next(); + // Only shift by 31 bits so we end up with zeros in MSB and not scattered + // throughout the 64-bit word. This is due to the weakness in Next() noted + // above. + large <<= 31; + large |= Next(); + return large; + } + + // Returns a uniformly distributed value in the range [0..n-1] + // REQUIRES: n > 0 + uint32_t Uniform(uint32_t n) { return Next() % n; } + + // Alias for consistency with Uniform64 + uint32_t Uniform32(uint32_t n) { return Uniform(n); } + + // Returns a uniformly distributed 64-bit value in the range [0..n-1] + // REQUIRES: n > 0 + uint64_t Uniform64(uint64_t n) { return Next64() % n; } + + // Randomly returns true ~"1/n" of the time, and false otherwise. + // REQUIRES: n > 0 + bool OneIn(int n) { return (Next() % n) == 0; } + + // Skewed: pick "base" uniformly from range [0,max_log] and then + // return "base" random bits. The effect is to pick a number in the + // range [0,2^max_log-1] with exponential bias towards smaller numbers. + uint32_t Skewed(int max_log) { + return Uniform(1 << Uniform(max_log + 1)); + } + + // Creates a normal distribution variable using the + // Box-Muller transform. See: + // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform + // Adapted from WebRTC source code at: + // webrtc/trunk/modules/video_coding/main/test/test_util.cc + double Normal(double mean, double std_dev) { + double uniform1 = (Next() + 1.0) / (random_internal::M + 1.0); + double uniform2 = (Next() + 1.0) / (random_internal::M + 1.0); + return (mean + std_dev * sqrt(-2 * ::log(uniform1)) * + cos(random_internal::kTwoPi * uniform2)); + } + + // Return a random number between 0.0 and 1.0 inclusive. + double NextDoubleFraction() { + return Next() / static_cast(random_internal::M + 1.0); + } + + private: + uint32_t seed_; +}; + + +uint32_t random_seed() { + // TODO: use system time to get a reasonably random seed + return 0; +} + + +} // namespace arrow + +#endif // ARROW_UTIL_RANDOM_H_ diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc new file mode 100644 index 00000000000..c64b8a3d5f8 --- /dev/null +++ b/cpp/src/arrow/util/status.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. 
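
(Editor's aside: the thread-safety contract described above is inherited from LevelDB's Status. Below is a minimal usage sketch of the error-propagation idiom this class supports — it is not part of the patch; `ReadBlock` and `Caller` are hypothetical names, and only factory methods and the ARROW_RETURN_NOT_OK macro declared in status.h further below are used.)

```
// Hypothetical illustration only; not part of this commit.
#include <cstdint>

#include "arrow/util/status.h"

arrow::Status ReadBlock(int64_t size) {
  if (size < 0) {
    return arrow::Status::Invalid("negative block size");
  }
  return arrow::Status::OK();
}

arrow::Status Caller() {
  // ARROW_RETURN_NOT_OK (defined in status.h) returns early on any non-OK status.
  ARROW_RETURN_NOT_OK(ReadBlock(128));
  return arrow::Status::OK();
}
```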
+ +#include "arrow/util/status.h" + +#include + +namespace arrow { + +Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) { + assert(code != StatusCode::OK); + const uint32_t size = msg.size(); + char* result = new char[size + 7]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, &posix_code, sizeof(posix_code)); + memcpy(result + 7, msg.c_str(), msg.size()); + state_ = result; +} + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 7]; + memcpy(result, state, size + 7); + return result; +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h new file mode 100644 index 00000000000..47fda40db25 --- /dev/null +++ b/cpp/src/arrow/util/status.h @@ -0,0 +1,152 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +// Adapted from Kudu github.com/cloudera/kudu + +#ifndef ARROW_STATUS_H_ +#define ARROW_STATUS_H_ + +#include +#include +#include + +// Return the given status if it is not OK. +#define ARROW_RETURN_NOT_OK(s) do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) return _s; \ + } while (0); + +// Return the given status if it is not OK, but first clone it and +// prepend the given message. +#define ARROW_RETURN_NOT_OK_PREPEND(s, msg) do { \ + ::arrow::Status _s = (s); \ + if (::gutil::PREDICT_FALSE(!_s.ok())) return _s.CloneAndPrepend(msg); \ + } while (0); + +// Return 'to_return' if 'to_call' returns a bad status. +// The substitution for 'to_return' may reference the variable +// 's' for the bad status. +#define ARROW_RETURN_NOT_OK_RET(to_call, to_return) do { \ + ::arrow::Status s = (to_call); \ + if (::gutil::PREDICT_FALSE(!s.ok())) return (to_return); \ + } while (0); + +// If 'to_call' returns a bad status, CHECK immediately with a logged message +// of 'msg' followed by the status. +#define ARROW_CHECK_OK_PREPEND(to_call, msg) do { \ +::arrow::Status _s = (to_call); \ +ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ +} while (0); + +// If the status is bad, CHECK immediately, appending the status to the +// logged message. +#define ARROW_CHECK_OK(s) ARROW_CHECK_OK_PREPEND(s, "Bad status") + +namespace arrow { + +#define RETURN_NOT_OK(s) do { \ + Status _s = (s); \ + if (!_s.ok()) return _s; \ + } while (0); + +enum class StatusCode: char { + OK = 0, + OutOfMemory = 1, + KeyError = 2, + Invalid = 3, + + NotImplemented = 10, +}; + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+ static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) { + return Status(StatusCode::OutOfMemory, msg, posix_code); + } + + static Status KeyError(const std::string& msg) { + return Status(StatusCode::KeyError, msg, -1); + } + + static Status NotImplemented(const std::string& msg) { + return Status(StatusCode::NotImplemented, msg, -1); + } + + static Status Invalid(const std::string& msg) { + return Status(StatusCode::Invalid, msg, -1); + } + + // Returns true iff the status indicates success. + bool ok() const { return (state_ == NULL); } + + bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } + bool IsKeyError() const { return code() == StatusCode::KeyError; } + bool IsInvalid() const { return code() == StatusCode::Invalid; } + + // Return a string representation of this status suitable for printing. + // Returns the string "OK" for success. + std::string ToString() const; + + // Return a string representation of the status code, without the message + // text or posix code information. + std::string CodeAsString() const; + + // Get the POSIX code associated with this Status, or -1 if there is none. + int16_t posix_code() const; + + private: + // OK status has a NULL state_. Otherwise, state_ is a new[] array + // of the following form: + // state_[0..3] == length of message + // state_[4] == code + // state_[5..6] == posix_code + // state_[7..] == message + const char* state_; + + StatusCode code() const { + return ((state_ == NULL) ? + StatusCode::OK : static_cast(state_[4])); + } + + Status(StatusCode code, const std::string& msg, int16_t posix_code); + static const char* CopyState(const char* s); +}; + +inline Status::Status(const Status& s) { + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); +} + +inline void Status::operator=(const Status& s) { + // The following condition catches both aliasing (when this == &s), + // and the common case where both s and *this are ok. + if (state_ != s.state_) { + delete[] state_; + state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); + } +} + +} // namespace arrow + + +#endif // ARROW_STATUS_H_ diff --git a/cpp/src/arrow/util/test_main.cc b/cpp/src/arrow/util/test_main.cc new file mode 100644 index 00000000000..00139f36742 --- /dev/null +++ b/cpp/src/arrow/util/test_main.cc @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
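
(Editor's aside: test_main.cc below supplies the single gtest entry point that every ADD_ARROW_TEST executable links in through arrow_test_main, so individual test files only declare test cases. A minimal hypothetical test file follows — `ExampleTests` is illustrative, not part of the patch.)

```
// Hypothetical test translation unit; main() is supplied by arrow_test_main.
#include <gtest/gtest.h>

TEST(ExampleTests, SanityCheck) {
  ASSERT_EQ(8, 1 << 3);
}
```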
+ +#include + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + + int ret = RUN_ALL_TESTS(); + + return ret; +} diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh new file mode 100755 index 00000000000..46794def400 --- /dev/null +++ b/cpp/thirdparty/build_thirdparty.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +set -x +set -e +TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) + +source $TP_DIR/versions.sh +PREFIX=$TP_DIR/installed + +################################################################################ + +if [ "$#" = "0" ]; then + F_ALL=1 +else + # Allow passing specific libs to build on the command line + for arg in "$*"; do + case $arg in + "gtest") F_GTEST=1 ;; + *) echo "Unknown module: $arg"; exit 1 ;; + esac + done +fi + +################################################################################ + +# Determine how many parallel jobs to use for make based on the number of cores +if [[ "$OSTYPE" =~ ^linux ]]; then + PARALLEL=$(grep -c processor /proc/cpuinfo) +elif [[ "$OSTYPE" == "darwin"* ]]; then + PARALLEL=$(sysctl -n hw.ncpu) +else + echo Unsupported platform $OSTYPE + exit 1 +fi + +mkdir -p "$PREFIX/include" +mkdir -p "$PREFIX/lib" + +# On some systems, autotools installs libraries to lib64 rather than lib. Fix +# this by setting up lib64 as a symlink to lib. We have to do this step first +# to handle cases where one third-party library depends on another. +ln -sf lib "$PREFIX/lib64" + +# use the compiled tools +export PATH=$PREFIX/bin:$PATH + + +# build googletest +if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then + cd $TP_DIR/$GTEST_BASEDIR + + if [[ "$OSTYPE" == "darwin"* ]]; then + CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="-std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" + else + CXXFLAGS=-fPIC cmake . + fi + + make VERBOSE=1 +fi + +echo "---------------------" +echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh new file mode 100755 index 00000000000..8ffb22a93f7 --- /dev/null +++ b/cpp/thirdparty/download_thirdparty.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -x +set -e + +TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) + +source $TP_DIR/versions.sh + +download_extract_and_cleanup() { + filename=$TP_DIR/$(basename "$1") + curl -#LC - "$1" -o $filename + tar xzf $filename -C $TP_DIR + rm $filename +} + +if [ ! -d ${GTEST_BASEDIR} ]; then + echo "Fetching gtest" + download_extract_and_cleanup $GTEST_URL +fi diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh new file mode 100755 index 00000000000..12ad56ef001 --- /dev/null +++ b/cpp/thirdparty/versions.sh @@ -0,0 +1,3 @@ +GTEST_VERSION=1.7.0 +GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" +GTEST_BASEDIR=googletest-release-$GTEST_VERSION From 7e76e3aee92122f39702241db2d0eaea86fd3e8c Mon Sep 17 00:00:00 2001 From: proflin Date: Fri, 19 Feb 2016 23:07:17 +0800 Subject: [PATCH 006/210] ARROW-5: Update drill-fmpp-maven-plugin to 1.5.0 This closes #1. 
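
(Editor's aside: after a plugin version bump like this commit's, forcing Maven to re-check remote repositories confirms the new artifact actually resolves instead of being served from a stale local cache. A hypothetical verification, run from the repository root:)

```
# -U forces a check for missing releases and updated snapshots.
cd java
mvn -U clean install
```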
--- java/pom.xml | 2 -- java/vector/pom.xml | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/java/pom.xml b/java/pom.xml index 8a3b192e13e..4ee4ff4f760 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -36,8 +36,6 @@ 2 2.7.1 2.7.1 - 0.9.15 - 2.3.21 diff --git a/java/vector/pom.xml b/java/vector/pom.xml index e693344221b..1fef81b7eba 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -106,7 +106,7 @@ org.apache.drill.tools drill-fmpp-maven-plugin - 1.4.0 + 1.5.0 generate-fmpp From e9cc8ce390a1ab28bf71ce6eeb66c915140e2cb9 Mon Sep 17 00:00:00 2001 From: Jacques Nadeau Date: Fri, 19 Feb 2016 18:42:35 -0800 Subject: [PATCH 007/210] ARROW-5: Correct Apache Maven repo for maven plugin use --- java/vector/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 1fef81b7eba..df5389261ba 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -60,7 +60,7 @@ apache apache - https://repo.maven.apache.org/ + https://repo.maven.apache.org/maven2/ true From e6905effbb9383afd2423a4f86cf9a33ca680b9d Mon Sep 17 00:00:00 2001 From: proflin Date: Sat, 20 Feb 2016 15:50:45 +0800 Subject: [PATCH 008/210] ARROW-9: Replace straggler references to Drill - Renaming drill to arrow for TestBaseAllocator - Fix ArrowBuffer as ArrowBuf - Replace Drill with Arrow for ValueHolder This closes #2. --- .../main/java/io/netty/buffer/ArrowBuf.java | 36 +-- .../io/netty/buffer/ExpandableByteBuf.java | 2 +- .../netty/buffer/PooledByteBufAllocatorL.java | 6 +- .../buffer/UnsafeDirectLittleEndian.java | 4 +- .../arrow/memory/AllocationManager.java | 34 +-- ...ocator.java => ArrowByteBufAllocator.java} | 10 +- .../apache/arrow/memory/BaseAllocator.java | 22 +- .../apache/arrow/memory/BufferAllocator.java | 4 +- .../apache/arrow/memory/BufferManager.java | 2 +- .../org/apache/arrow/memory/package-info.java | 2 +- .../arrow/memory/TestBaseAllocator.java | 232 +++++++++--------- .../main/codegen/templates/ListWriters.java | 2 +- .../complex/AbstractContainerVector.java | 2 +- .../vector/complex/AbstractMapVector.java | 2 +- .../arrow/vector/holders/ValueHolder.java | 4 +- .../vector/util/ByteFunctionHelpers.java | 16 +- .../arrow/vector/util/DecimalUtility.java | 16 +- 17 files changed, 198 insertions(+), 198 deletions(-) rename java/memory/src/main/java/org/apache/arrow/memory/{DrillByteBufAllocator.java => ArrowByteBufAllocator.java} (92%) diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index f033ba6538e..bbec26aa85c 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -56,7 +56,7 @@ public final class ArrowBuf extends AbstractByteBuf implements AutoCloseable { private final boolean isEmpty; private volatile int length; private final HistoricalLog historicalLog = BaseAllocator.DEBUG ? - new HistoricalLog(BaseAllocator.DEBUG_LOG_LENGTH, "DrillBuf[%d]", id) : null; + new HistoricalLog(BaseAllocator.DEBUG_LOG_LENGTH, "ArrowBuf[%d]", id) : null; public ArrowBuf( final AtomicInteger refCnt, @@ -155,18 +155,18 @@ private void ensure(int width) { } /** - * Create a new DrillBuf that is associated with an alternative allocator for the purposes of memory ownership and - * accounting. 
This has no impact on the reference counting for the current DrillBuf except in the situation where the + * Create a new ArrowBuf that is associated with an alternative allocator for the purposes of memory ownership and + * accounting. This has no impact on the reference counting for the current ArrowBuf except in the situation where the * passed in Allocator is the same as the current buffer. * - * This operation has no impact on the reference count of this DrillBuf. The newly created DrillBuf with either have a + * This operation has no impact on the reference count of this ArrowBuf. The newly created ArrowBuf with either have a * reference count of 1 (in the case that this is the first time this memory is being associated with the new * allocator) or the current value of the reference count + 1 for the other AllocationManager/BufferLedger combination * in the case that the provided allocator already had an association to this underlying memory. * * @param target * The target allocator to create an association with. - * @return A new DrillBuf which shares the same underlying memory as this DrillBuf. + * @return A new ArrowBuf which shares the same underlying memory as this ArrowBuf. */ public ArrowBuf retain(BufferAllocator target) { @@ -178,17 +178,17 @@ public ArrowBuf retain(BufferAllocator target) { historicalLog.recordEvent("retain(%s)", target.getName()); } final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target); - return otherLedger.newDrillBuf(offset, length, null); + return otherLedger.newArrowBuf(offset, length, null); } /** - * Transfer the memory accounting ownership of this DrillBuf to another allocator. This will generate a new DrillBuf - * that carries an association with the underlying memory of this DrillBuf. If this DrillBuf is connected to the + * Transfer the memory accounting ownership of this ArrowBuf to another allocator. This will generate a new ArrowBuf + * that carries an association with the underlying memory of this ArrowBuf. If this ArrowBuf is connected to the * owning BufferLedger of this memory, that memory ownership/accounting will be transferred to the taret allocator. If - * this DrillBuf does not currently own the memory underlying it (and is only associated with it), this does not - * transfer any ownership to the newly created DrillBuf. + * this ArrowBuf does not currently own the memory underlying it (and is only associated with it), this does not + * transfer any ownership to the newly created ArrowBuf. * - * This operation has no impact on the reference count of this DrillBuf. The newly created DrillBuf with either have a + * This operation has no impact on the reference count of this ArrowBuf. The newly created ArrowBuf with either have a * reference count of 1 (in the case that this is the first time this memory is being associated with the new * allocator) or the current value of the reference count for the other AllocationManager/BufferLedger combination in * the case that the provided allocator already had an association to this underlying memory. @@ -203,7 +203,7 @@ public ArrowBuf retain(BufferAllocator target) { * @param target * The allocator to transfer ownership to. * @return A new transfer result with the impact of the transfer (whether it was overlimit) as well as the newly - * created DrillBuf. + * created ArrowBuf. 
*/ public TransferResult transferOwnership(BufferAllocator target) { @@ -212,7 +212,7 @@ public TransferResult transferOwnership(BufferAllocator target) { } final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target); - final ArrowBuf newBuf = otherLedger.newDrillBuf(offset, length, null); + final ArrowBuf newBuf = otherLedger.newArrowBuf(offset, length, null); final boolean allocationFit = this.ledger.transferBalance(otherLedger); return new TransferResult(allocationFit, newBuf); } @@ -267,7 +267,7 @@ public boolean release(int decrement) { if (refCnt < 0) { throw new IllegalStateException( - String.format("DrillBuf[%d] refCnt has gone negative. Buffer Info: %s", id, toVerboseString())); + String.format("ArrowBuf[%d] refCnt has gone negative. Buffer Info: %s", id, toVerboseString())); } return refCnt == 0; @@ -370,7 +370,7 @@ public ArrowBuf slice(int index, int length) { * Re the behavior of reference counting, see http://netty.io/wiki/reference-counted-objects.html#wiki-h3-5, which * explains that derived buffers share their reference count with their parent */ - final ArrowBuf newBuf = ledger.newDrillBuf(offset + index, length); + final ArrowBuf newBuf = ledger.newArrowBuf(offset + index, length); newBuf.writerIndex(length); return newBuf; } @@ -437,7 +437,7 @@ public long memoryAddress() { @Override public String toString() { - return String.format("DrillBuf[%d], udle: [%d %d..%d]", id, udle.id, offset, offset + capacity()); + return String.format("ArrowBuf[%d], udle: [%d %d..%d]", id, udle.id, offset, offset + capacity()); } @Override @@ -782,7 +782,7 @@ public void close() { } /** - * Returns the possible memory consumed by this DrillBuf in the worse case scenario. (not shared, connected to larger + * Returns the possible memory consumed by this ArrowBuf in the worse case scenario. (not shared, connected to larger * underlying buffer of allocated memory) * * @return Size in bytes. @@ -833,7 +833,7 @@ public String toHexString(final int start, final int length) { } /** - * Get the integer id assigned to this DrillBuf for debugging purposes. + * Get the integer id assigned to this ArrowBuf for debugging purposes. * * @return integer id */ diff --git a/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java b/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java index 59886474923..7fb884daa39 100644 --- a/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ExpandableByteBuf.java @@ -20,7 +20,7 @@ import org.apache.arrow.memory.BufferAllocator; /** - * Allows us to decorate DrillBuf to make it expandable so that we can use them in the context of the Netty framework + * Allows us to decorate ArrowBuf to make it expandable so that we can use them in the context of the Netty framework * (thus supporting RPC level memory accounting). */ public class ExpandableByteBuf extends MutableWrappedByteBuf { diff --git a/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 1610028df9d..0b6e3f7f839 100644 --- a/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -32,7 +32,7 @@ import com.codahale.metrics.MetricRegistry; /** - * The base allocator that we use for all of Drill's memory management. Returns UnsafeDirectLittleEndian buffers. + * The base allocator that we use for all of Arrow's memory management. 
Returns UnsafeDirectLittleEndian buffers. */ public class PooledByteBufAllocatorL { private static final org.slf4j.Logger memoryLogger = org.slf4j.LoggerFactory.getLogger("drill.allocator"); @@ -184,7 +184,7 @@ private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCa private UnsupportedOperationException fail() { return new UnsupportedOperationException( - "Drill requries that the JVM used supports access sun.misc.Unsafe. This platform didn't provide that functionality."); + "Arrow requries that the JVM used supports access sun.misc.Unsafe. This platform didn't provide that functionality."); } public UnsafeDirectLittleEndian directBuffer(int initialCapacity, int maxCapacity) { @@ -197,7 +197,7 @@ public UnsafeDirectLittleEndian directBuffer(int initialCapacity, int maxCapacit @Override public ByteBuf heapBuffer(int initialCapacity, int maxCapacity) { - throw new UnsupportedOperationException("Drill doesn't support using heap buffers."); + throw new UnsupportedOperationException("Arrow doesn't support using heap buffers."); } diff --git a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java index 6495d5d371e..a94c6d19883 100644 --- a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java +++ b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java @@ -24,7 +24,7 @@ import java.util.concurrent.atomic.AtomicLong; /** - * The underlying class we use for little-endian access to memory. Is used underneath DrillBufs to abstract away the + * The underlying class we use for little-endian access to memory. Is used underneath ArrowBufs to abstract away the * Netty classes and underlying Netty memory management. */ public final class UnsafeDirectLittleEndian extends WrappedByteBuf { @@ -55,7 +55,7 @@ public final class UnsafeDirectLittleEndian extends WrappedByteBuf { private UnsafeDirectLittleEndian(AbstractByteBuf buf, boolean fake, AtomicLong bufferCount, AtomicLong bufferSize) { super(buf); if (!NATIVE_ORDER || buf.order() != ByteOrder.BIG_ENDIAN) { - throw new IllegalStateException("Drill only runs on LittleEndian systems."); + throw new IllegalStateException("Arrow only runs on LittleEndian systems."); } this.bufferCount = bufferCount; diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 0db61443266..37d1d34a620 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -41,7 +41,7 @@ * This class is also responsible for managing when memory is allocated and returned to the Netty-based * PooledByteBufAllocatorL. * - * The only reason that this isn't package private is we're forced to put DrillBuf in Netty's package which need access + * The only reason that this isn't package private is we're forced to put ArrowBuf in Netty's package which need access * to these objects or methods. * * Threading: AllocationManager manages thread-safety internally. Operations within the context of a single BufferLedger @@ -185,8 +185,8 @@ public void release() { /** * The reference manager that binds an allocator manager to a particular BaseAllocator. Also responsible for creating - * a set of DrillBufs that share a common fate and set of reference counts. 
- * As with AllocationManager, the only reason this is public is due to DrillBuf being in io.netty.buffer package. + * a set of ArrowBufs that share a common fate and set of reference counts. + * As with AllocationManager, the only reason this is public is due to ArrowBuf being in io.netty.buffer package. */ public class BufferLedger { @@ -322,7 +322,7 @@ public int decrement(int decrement) { /** * Returns the ledger associated with a particular BufferAllocator. If the BufferAllocator doesn't currently have a * ledger associated with this AllocationManager, a new one is created. This is placed on BufferLedger rather than - * AllocationManager directly because DrillBufs don't have access to AllocationManager and they are the ones + * AllocationManager directly because ArrowBufs don't have access to AllocationManager and they are the ones * responsible for exposing the ability to associate multiple allocators with a particular piece of underlying * memory. Note that this will increment the reference count of this ledger by one to ensure the ledger isn't * destroyed before use. @@ -335,32 +335,32 @@ public BufferLedger getLedgerForAllocator(BufferAllocator allocator) { } /** - * Create a new DrillBuf associated with this AllocationManager and memory. Does not impact reference count. + * Create a new ArrowBuf associated with this AllocationManager and memory. Does not impact reference count. * Typically used for slicing. * @param offset - * The offset in bytes to start this new DrillBuf. + * The offset in bytes to start this new ArrowBuf. * @param length - * The length in bytes that this DrillBuf will provide access to. - * @return A new DrillBuf that shares references with all DrillBufs associated with this BufferLedger + * The length in bytes that this ArrowBuf will provide access to. + * @return A new ArrowBuf that shares references with all ArrowBufs associated with this BufferLedger */ - public ArrowBuf newDrillBuf(int offset, int length) { + public ArrowBuf newArrowBuf(int offset, int length) { allocator.assertOpen(); - return newDrillBuf(offset, length, null); + return newArrowBuf(offset, length, null); } /** - * Create a new DrillBuf associated with this AllocationManager and memory. + * Create a new ArrowBuf associated with this AllocationManager and memory. * @param offset - * The offset in bytes to start this new DrillBuf. + * The offset in bytes to start this new ArrowBuf. * @param length - * The length in bytes that this DrillBuf will provide access to. + * The length in bytes that this ArrowBuf will provide access to. * @param manager - * An optional BufferManager argument that can be used to manage expansion of this DrillBuf + * An optional BufferManager argument that can be used to manage expansion of this ArrowBuf * @param retain * Whether or not the newly created buffer should get an additional reference count added to it. 
- * @return A new DrillBuf that shares references with all DrillBufs associated with this BufferLedger + * @return A new ArrowBuf that shares references with all ArrowBufs associated with this BufferLedger */ - public ArrowBuf newDrillBuf(int offset, int length, BufferManager manager) { + public ArrowBuf newArrowBuf(int offset, int length, BufferManager manager) { allocator.assertOpen(); final ArrowBuf buf = new ArrowBuf( @@ -375,7 +375,7 @@ public ArrowBuf newDrillBuf(int offset, int length, BufferManager manager) { if (BaseAllocator.DEBUG) { historicalLog.recordEvent( - "DrillBuf(BufferLedger, BufferAllocator[%s], UnsafeDirectLittleEndian[identityHashCode == " + "ArrowBuf(BufferLedger, BufferAllocator[%s], UnsafeDirectLittleEndian[identityHashCode == " + "%d](%s)) => ledger hc == %d", allocator.name, System.identityHashCode(buf), buf.toString(), System.identityHashCode(this)); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java similarity index 92% rename from java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java rename to java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java index 23d644841e1..f3f72fa57c3 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/DrillByteBufAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java @@ -23,19 +23,19 @@ import io.netty.buffer.ExpandableByteBuf; /** - * An implementation of ByteBufAllocator that wraps a Drill BufferAllocator. This allows the RPC layer to be accounted - * and managed using Drill's BufferAllocator infrastructure. The only thin different from a typical BufferAllocator is + * An implementation of ByteBufAllocator that wraps a Arrow BufferAllocator. This allows the RPC layer to be accounted + * and managed using Arrow's BufferAllocator infrastructure. The only thin different from a typical BufferAllocator is * the signature and the fact that this Allocator returns ExpandableByteBufs which enable otherwise non-expandable - * DrillBufs to be expandable. + * ArrowBufs to be expandable. 
*/ -public class DrillByteBufAllocator implements ByteBufAllocator { +public class ArrowByteBufAllocator implements ByteBufAllocator { private static final int DEFAULT_BUFFER_SIZE = 4096; private static final int DEFAULT_MAX_COMPOSITE_COMPONENTS = 16; private final BufferAllocator allocator; - public DrillByteBufAllocator(BufferAllocator allocator) { + public ArrowByteBufAllocator(BufferAllocator allocator) { this.allocator = allocator; } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 72f77ab0c7b..90257bb9ffb 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -82,7 +82,7 @@ protected BaseAllocator( this.parentAllocator = parentAllocator; this.name = name; - this.thisAsByteBufAllocator = new DrillByteBufAllocator(this); + this.thisAsByteBufAllocator = new ArrowByteBufAllocator(this); if (DEBUG) { childAllocators = new IdentityHashMap<>(); @@ -236,7 +236,7 @@ private ArrowBuf bufferWithoutReservation(final int size, BufferManager bufferMa final AllocationManager manager = new AllocationManager(this, size); final BufferLedger ledger = manager.associate(this); // +1 ref cnt (required) - final ArrowBuf buffer = ledger.newDrillBuf(0, size, bufferManager); + final ArrowBuf buffer = ledger.newArrowBuf(0, size, bufferManager); // make sure that our allocation is equal to what we expected. Preconditions.checkArgument(buffer.capacity() == size, @@ -314,9 +314,9 @@ public ArrowBuf allocateBuffer() { Preconditions.checkState(!closed, "Attempt to allocate after closed"); Preconditions.checkState(!used, "Attempt to allocate more than once"); - final ArrowBuf drillBuf = allocate(nBytes); + final ArrowBuf arrowBuf = allocate(nBytes); used = true; - return drillBuf; + return arrowBuf; } public int getSize() { @@ -397,13 +397,13 @@ private ArrowBuf allocate(int nBytes) { * as well, so we need to return the same number back to avoid double-counting them. */ try { - final ArrowBuf drillBuf = BaseAllocator.this.bufferWithoutReservation(nBytes, null); + final ArrowBuf arrowBuf = BaseAllocator.this.bufferWithoutReservation(nBytes, null); if (DEBUG) { - historicalLog.recordEvent("allocate() => %s", String.format("DrillBuf[%d]", drillBuf.getId())); + historicalLog.recordEvent("allocate() => %s", String.format("ArrowBuf[%d]", arrowBuf.getId())); } success = true; - return drillBuf; + return arrowBuf; } finally { if (!success) { releaseBytes(nBytes); @@ -565,7 +565,7 @@ void verifyAllocator() { * Verifies the accounting state of the allocator. Only works for DEBUG. * *

- * This overload is used for recursive calls, allowing for checking that DrillBufs are unique across all allocators + * This overload is used for recursive calls, allowing for checking that ArrowBufs are unique across all allocators * that are checked. *

* @@ -594,7 +594,7 @@ private void verifyAllocator(final IdentityHashMap T typeify(ValueVector v, Class clazz) { if (clazz.isAssignableFrom(v.getClass())) { return (T) v; } - throw new IllegalStateException(String.format("Vector requested [%s] was different than type stored [%s]. Drill doesn't yet support hetergenous types.", clazz.getSimpleName(), v.getClass().getSimpleName())); + throw new IllegalStateException(String.format("Vector requested [%s] was different than type stored [%s]. Arrow doesn't yet support hetergenous types.", clazz.getSimpleName(), v.getClass().getSimpleName())); } MajorType getLastPathType() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index d4189b2314a..de6ae829b47 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -137,7 +137,7 @@ public T addOrGet(String name, MajorType type, Class } return vector; } - final String message = "Drill does not support schema change yet. Existing[%s] and desired[%s] vector types mismatch"; + final String message = "Arrow does not support schema change yet. Existing[%s] and desired[%s] vector types mismatch"; throw new IllegalStateException(String.format(message, existing.getClass().getSimpleName(), clazz.getSimpleName())); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java index 88cbcd4a8c3..16777c806ec 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/ValueHolder.java @@ -18,10 +18,10 @@ package org.apache.arrow.vector.holders; /** - * Wrapper object for an individual value in Drill. + * Wrapper object for an individual value in Arrow. * * ValueHolders are designed to be mutable wrapper objects for defining clean - * APIs that access data in Drill. For performance, object creation is avoided + * APIs that access data in Arrow. For performance, object creation is avoided * at all costs throughout execution. 
For this reason, ValueHolders are * disallowed from implementing any methods, this allows for them to be * replaced by their java primitive inner members during optimization of diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index 2bdfd70b229..b6dd13a06a8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -29,12 +29,12 @@ public class ByteFunctionHelpers { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ByteFunctionHelpers.class); /** - * Helper function to check for equality of bytes in two DrillBuffers + * Helper function to check for equality of bytes in two ArrowBufs * - * @param left Left DrillBuf for comparison + * @param left Left ArrowBuf for comparison * @param lStart start offset in the buffer * @param lEnd end offset in the buffer - * @param right Right DrillBuf for comparison + * @param right Right ArrowBuf for comparison * @param rStart start offset in the buffer * @param rEnd end offset in the buffer * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise @@ -81,14 +81,14 @@ private static final int memEqual(final long laddr, int lStart, int lEnd, final } /** - * Helper function to compare a set of bytes in two DrillBuffers. + * Helper function to compare a set of bytes in two ArrowBufs. * * Function will check data before completing in the case that * - * @param left Left DrillBuf to compare + * @param left Left ArrowBuf to compare * @param lStart start offset in the buffer * @param lEnd end offset in the buffer - * @param right Right DrillBuf to compare + * @param right Right ArrowBuf to compare * @param rStart start offset in the buffer * @param rEnd end offset in the buffer * @return 1 if left input is greater, -1 if left input is smaller, 0 otherwise @@ -138,9 +138,9 @@ private static final int memcmp(final long laddr, int lStart, int lEnd, final lo } /** - * Helper function to compare a set of bytes in DrillBuf to a ByteArray. + * Helper function to compare a set of bytes in ArrowBuf to a ByteArray. 
* - * @param left Left DrillBuf for comparison purposes + * @param left Left ArrowBuf for comparison purposes * @param lStart start offset in the buffer * @param lEnd end offset in the buffer * @param right second input to be compared diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index 576a5b6351a..a3763cd34f1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -145,16 +145,16 @@ public static StringBuilder toStringWithZeroes(long number, int desiredLength) { public static BigDecimal getBigDecimalFromIntermediate(ByteBuf data, int startIndex, int nDecimalDigits, int scale) { // In the intermediate representation we don't pad the scale with zeroes, so set truncate = false - return getBigDecimalFromDrillBuf(data, startIndex, nDecimalDigits, scale, false); + return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, false); } public static BigDecimal getBigDecimalFromSparse(ArrowBuf data, int startIndex, int nDecimalDigits, int scale) { // In the sparse representation we pad the scale with zeroes for ease of arithmetic, need to truncate - return getBigDecimalFromDrillBuf(data, startIndex, nDecimalDigits, scale, true); + return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, true); } - public static BigDecimal getBigDecimalFromDrillBuf(ArrowBuf bytebuf, int start, int length, int scale) { + public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int start, int length, int scale) { byte[] value = new byte[length]; bytebuf.getBytes(start, value, 0, length); BigInteger unscaledValue = new BigInteger(value); @@ -168,17 +168,17 @@ public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int sta return new BigDecimal(unscaledValue, scale); } - /* Create a BigDecimal object using the data in the DrillBuf. + /* Create a BigDecimal object using the data in the ArrowBuf. * This function assumes that data is provided in a non-dense format * It works on both sparse and intermediate representations. */ - public static BigDecimal getBigDecimalFromDrillBuf(ByteBuf data, int startIndex, int nDecimalDigits, int scale, + public static BigDecimal getBigDecimalFromArrowBuf(ByteBuf data, int startIndex, int nDecimalDigits, int scale, boolean truncateScale) { // For sparse decimal type we have padded zeroes at the end, strip them while converting to BigDecimal. int actualDigits; - // Initialize the BigDecimal, first digit in the DrillBuf has the sign so mask it out + // Initialize the BigDecimal, first digit in the ArrowBuf has the sign so mask it out BigInteger decimalDigits = BigInteger.valueOf((data.getInt(startIndex)) & 0x7FFFFFFF); BigInteger base = BigInteger.valueOf(DIGITS_BASE); @@ -208,7 +208,7 @@ public static BigDecimal getBigDecimalFromDrillBuf(ByteBuf data, int startIndex, /* This function returns a BigDecimal object from the dense decimal representation. 
* First step is to convert the dense representation into an intermediate representation - * and then invoke getBigDecimalFromDrillBuf() to get the BigDecimal object + * and then invoke getBigDecimalFromArrowBuf() to get the BigDecimal object */ public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, int nDecimalDigits, int scale, int maxPrecision, int width) { @@ -340,7 +340,7 @@ public static void getSparseFromBigDecimal(BigDecimal input, ByteBuf data, int s destIndex = nDecimalDigits - 1; while (scale > 0) { - // Get next set of MAX_DIGITS (9) store it in the DrillBuf + // Get next set of MAX_DIGITS (9) store it in the ArrowBuf fractionalPart = fractionalPart.movePointLeft(MAX_DIGITS); BigDecimal temp = fractionalPart.remainder(BigDecimal.ONE); From a3856222d78d58b51088769178715dcb1e5a8d2c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Mar 2016 14:48:27 -0800 Subject: [PATCH 009/210] ARROW-8: Add .travis.yml and test script for Arrow C++. OS X build fixes --- .travis.yml | 27 ++++++++++++++++++++++ README.md | 11 +++++++++ ci/travis_script_cpp.sh | 35 +++++++++++++++++++++++++++++ cpp/CMakeLists.txt | 37 ++++++++++++++++--------------- cpp/setup_build_env.sh | 3 +-- cpp/src/arrow/util/CMakeLists.txt | 2 +- 6 files changed, 94 insertions(+), 21 deletions(-) create mode 100644 .travis.yml create mode 100755 ci/travis_script_cpp.sh diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000000..cb2d5cb1bad --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +sudo: required +dist: trusty +addons: + apt: + sources: + - ubuntu-toolchain-r-test + - kalakris-cmake + packages: + - gcc-4.9 # Needed for C++11 + - g++-4.9 # Needed for C++11 + - gcov + - cmake + - valgrind + +matrix: + include: + - compiler: gcc + language: cpp + os: linux + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - compiler: clang + language: cpp + os: osx + addons: + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh diff --git a/README.md b/README.md index 4423a913513..d948a996bc0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,16 @@ ## Apache Arrow + + + + + +
Build Status</td>
+    <td>
+    <a href="https://travis-ci.org/apache/arrow">
+    <img src="https://travis-ci.org/apache/arrow.svg?branch=master" alt="travis build status" />
+    </a>
+    </td>
+  </tr>
+</table>
+ #### Powering Columnar In-Memory Analytics Arrow is a set of technologies that enable big-data systems to process and move data fast. diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh new file mode 100755 index 00000000000..28f16cc021f --- /dev/null +++ b/ci/travis_script_cpp.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -e + +mkdir $TRAVIS_BUILD_DIR/cpp-build +pushd $TRAVIS_BUILD_DIR/cpp-build + +CPP_DIR=$TRAVIS_BUILD_DIR/cpp + +# Build an isolated thirdparty +cp -r $CPP_DIR/thirdparty . +cp $CPP_DIR/setup_build_env.sh . + +if [ $TRAVIS_OS_NAME == "linux" ]; then + # Use a C++11 compiler on Linux + export CC="gcc-4.9" + export CXX="g++-4.9" +fi + +source setup_build_env.sh + +echo $GTEST_HOME + +cmake -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +make lint +make -j4 + +if [ $TRAVIS_OS_NAME == "linux" ]; then + valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest +else + ctest +fi + +popd +rm -rf cpp-build diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 90e55dfddbf..5ddd9dae3fe 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,6 +44,11 @@ if (NOT "$ENV{ARROW_GCC_ROOT}" STREQUAL "") set(CMAKE_CXX_COMPILER ${GCC_ROOT}/bin/g++) endif() +if(APPLE) + # In newer versions of CMake, this is the default setting + set(CMAKE_MACOSX_RPATH 1) +endif() + # ---------------------------------------------------------------------- # cmake options @@ -68,19 +73,15 @@ endif() ############################################################ # compiler flags that are common across debug/release builds -# - msse4.2: Enable sse4.2 compiler intrinsics. # - Wall: Enable all warnings. -# - Wno-sign-compare: suppress warnings for comparison between signed and unsigned -# integers -# -Wno-deprecated: some of the gutil code includes old things like ext/hash_set, ignore that -# - pthread: enable multithreaded malloc -# - -D__STDC_FORMAT_MACROS: for PRI* print format macros -# -fno-strict-aliasing -# Assume programs do not follow strict aliasing rules. -# GCC cannot always verify whether strict aliasing rules are indeed followed due to -# fundamental limitations in escape analysis, which can result in subtle bad code generation. -# This has a small perf hit but worth it to avoid hard to debug crashes. -set(CXX_COMMON_FLAGS "-std=c++11 -fno-strict-aliasing -msse3 -Wall -Wno-deprecated -pthread -D__STDC_FORMAT_MACROS") +set(CXX_COMMON_FLAGS "-std=c++11 -msse3 -Wall") + +if (APPLE) + # Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be + # the default standard library which does not support C++11. libc++ is the + # default from 10.9 onward. + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -stdlib=libc++") +endif() # compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') # For all builds: @@ -157,10 +158,6 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") else() message("Running without a controlling terminal or in a dumb terminal") endif() - - # Use libstdc++ and not libc++. The latter lacks support for tr1 in OSX - # and since 10.9 is now the default. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++") endif() # Sanity check linking option. 
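
(Editor's aside: a minimal, self-contained sketch of how a flag string assembled like CXX_COMMON_FLAGS in the hunk above reaches the compiler command line; the snippet mirrors this file's logic but is illustrative rather than part of the patch.)

```
# Hypothetical standalone CMake fragment.
set(CXX_COMMON_FLAGS "-std=c++11 -msse3 -Wall")
if(APPLE)
  # libstdc++ lacks full C++11 support before OS X 10.9; prefer libc++.
  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -stdlib=libc++")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_COMMON_FLAGS}")
```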
@@ -473,11 +470,15 @@ set(ARROW_SRCS src/arrow/type.cc ) -add_library(arrow SHARED +set(LIBARROW_LINKAGE "SHARED") + +add_library(arrow + ${LIBARROW_LINKAGE} ${ARROW_SRCS} ) target_link_libraries(arrow ${LINK_LIBS}) set_target_properties(arrow PROPERTIES LINKER_LANGUAGE CXX) install(TARGETS arrow - LIBRARY DESTINATION lib) + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index 457b9717ebe..e9901bdbecd 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -1,11 +1,10 @@ #!/bin/bash -set -e - SOURCE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) ./thirdparty/download_thirdparty.sh ./thirdparty/build_thirdparty.sh +source thirdparty/versions.sh export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 88e3f7a656d..ff8db6a0410 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -26,7 +26,7 @@ set(UTIL_SRCS ) set(UTIL_LIBS - rt) +) add_library(arrow_util STATIC ${UTIL_SRCS} From 8f2ca246b34daa49eed2a1eb2a747cab93bb2dbd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 13:49:20 -0800 Subject: [PATCH 010/210] ARROW-13: Add PR merge tool from parquet-mr, suitably modified Author: Wes McKinney Closes #7 from wesm/ARROW-13 and squashes the following commits: 7a58712 [Wes McKinney] Add PR merge tool from parquet-mr, suitably modified --- dev/README.md | 94 +++++++++++ dev/merge_arrow_pr.py | 362 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 456 insertions(+) create mode 100644 dev/README.md create mode 100755 dev/merge_arrow_pr.py diff --git a/dev/README.md b/dev/README.md new file mode 100644 index 00000000000..e986abef191 --- /dev/null +++ b/dev/README.md @@ -0,0 +1,94 @@ + + +# Arrow Developer Scripts + +This directory contains scripts useful to developers when packaging, +testing, or committing to Arrow. + +Merging a pull request requires being a committer on the project. + +* How to merge a Pull request: +have an apache and apache-github remote setup +``` +git remote add apache-github https://github.com/apache/arrow.git +git remote add apache https://git-wip-us.apache.org/repos/asf/arrow.git +``` +run the following command +``` +dev/merge_arrow_pr.py +``` + +Note: +* The directory name of your Arrow git clone must be called arrow +* Without jira-python installed you'll have to close the JIRA manually + +example output: +``` +Which pull request would you like to merge? (e.g. 34): +``` +Type the pull request number (from https://github.com/apache/arrow/pulls) and hit enter. +``` +=== Pull Request #X === +title Blah Blah Blah +source repo/branch +target master +url https://api.github.com/repos/apache/arrow/pulls/X + +Proceed with merging pull request #3? (y/n): +``` +If this looks good, type y and hit enter. +``` +From git-wip-us.apache.org:/repos/asf/arrow.git + * [new branch] master -> PR_TOOL_MERGE_PR_3_MASTER +Switched to branch 'PR_TOOL_MERGE_PR_3_MASTER' + +Merge complete (local ref PR_TOOL_MERGE_PR_3_MASTER). Push to apache? (y/n): +``` +A local branch with the merge has been created. +type y and hit enter to push it to apache master +``` +Counting objects: 67, done. +Delta compression using up to 4 threads. +Compressing objects: 100% (26/26), done. +Writing objects: 100% (36/36), 5.32 KiB, done. 
+Total 36 (delta 17), reused 0 (delta 0) +To git-wip-us.apache.org:/repos/arrow-mr.git + b767ac4..485658a PR_TOOL_MERGE_PR_X_MASTER -> master +Restoring head pointer to b767ac4e +Note: checking out 'b767ac4e'. + +You are in 'detached HEAD' state. You can look around, make experimental +changes and commit them, and you can discard any commits you make in this +state without impacting any branches by performing another checkout. + +If you want to create a new branch to retain commits you create, you may +do so (now or later) by using -b with the checkout command again. Example: + + git checkout -b new_branch_name + +HEAD is now at b767ac4... Update README.md +Deleting local branch PR_TOOL_MERGE_PR_X +Deleting local branch PR_TOOL_MERGE_PR_X_MASTER +Pull request #X merged! +Merge hash: 485658a5 + +Would you like to pick 485658a5 into another branch? (y/n): +``` +For now just say n as we have 1 branch diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py new file mode 100755 index 00000000000..ef47dec88c1 --- /dev/null +++ b/dev/merge_arrow_pr.py @@ -0,0 +1,362 @@ +#!/usr/bin/env python + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Utility for creating well-formed pull request merges and pushing them to Apache. +# usage: ./apache-pr-merge.py (see config env vars below) +# +# This utility assumes you already have a local Arrow git clone and that you +# have added remotes corresponding to both (i) the Github Apache Arrow mirror +# and (ii) the apache git repo. 
+ +import json +import os +import re +import subprocess +import sys +import tempfile +import urllib2 +import getpass + +try: + import jira.client + JIRA_IMPORTED = True +except ImportError: + JIRA_IMPORTED = False + +# Location of your Arrow git clone +ARROW_HOME = os.path.abspath(__file__).rsplit("/", 2)[0] +PROJECT_NAME = ARROW_HOME.rsplit("/", 1)[1] +print "ARROW_HOME = " + ARROW_HOME +print "PROJECT_NAME = " + PROJECT_NAME + +# Remote name which points to the Gihub site +PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") +# Remote name which points to Apache git +PUSH_REMOTE_NAME = os.environ.get("PUSH_REMOTE_NAME", "apache") +# ASF JIRA username +JIRA_USERNAME = os.environ.get("JIRA_USERNAME") +# ASF JIRA password +JIRA_PASSWORD = os.environ.get("JIRA_PASSWORD") + +GITHUB_BASE = "https://github.com/apache/" + PROJECT_NAME + "/pull" +GITHUB_API_BASE = "https://api.github.com/repos/apache/" + PROJECT_NAME +JIRA_BASE = "https://issues.apache.org/jira/browse" +JIRA_API_BASE = "https://issues.apache.org/jira" +# Prefix added to temporary branches +BRANCH_PREFIX = "PR_TOOL" + +os.chdir(ARROW_HOME) + + +def get_json(url): + try: + return json.load(urllib2.urlopen(url)) + except urllib2.HTTPError as e: + print "Unable to fetch URL, exiting: %s" % url + sys.exit(-1) + + +def fail(msg): + print msg + clean_up() + sys.exit(-1) + + +def run_cmd(cmd): + try: + if isinstance(cmd, list): + return subprocess.check_output(cmd) + else: + return subprocess.check_output(cmd.split(" ")) + except subprocess.CalledProcessError as e: + # this avoids hiding the stdout / stderr of failed processes + print 'Command failed: %s' % cmd + print 'With output:' + print '--------------' + print e.output + print '--------------' + raise e + +def continue_maybe(prompt): + result = raw_input("\n%s (y/n): " % prompt) + if result.lower() != "y": + fail("Okay, exiting") + + +original_head = run_cmd("git rev-parse HEAD")[:8] + + +def clean_up(): + print "Restoring head pointer to %s" % original_head + run_cmd("git checkout %s" % original_head) + + branches = run_cmd("git branch").replace(" ", "").split("\n") + + for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): + print "Deleting local branch %s" % branch + run_cmd("git branch -D %s" % branch) + + +# merge the requested PR and return the merge hash +def merge_pr(pr_num, target_ref): + pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) + target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) + run_cmd("git checkout %s" % target_branch_name) + + had_conflicts = False + try: + run_cmd(['git', 'merge', pr_branch_name, '--squash']) + except Exception as e: + msg = "Error merging: %s\nWould you like to manually fix-up this merge?" % e + continue_maybe(msg) + msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" 
+ continue_maybe(msg) + had_conflicts = True + + commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%an <%ae>']).split("\n") + distinct_authors = sorted(set(commit_authors), + key=lambda x: commit_authors.count(x), reverse=True) + primary_author = distinct_authors[0] + commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, + '--pretty=format:%h [%an] %s']).split("\n\n") + + merge_message_flags = [] + + merge_message_flags += ["-m", title] + if body != None: + merge_message_flags += ["-m", body] + + authors = "\n".join(["Author: %s" % a for a in distinct_authors]) + + merge_message_flags += ["-m", authors] + + if had_conflicts: + committer_name = run_cmd("git config --get user.name").strip() + committer_email = run_cmd("git config --get user.email").strip() + message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( + committer_name, committer_email) + merge_message_flags += ["-m", message] + + # The string "Closes #%s" string is required for GitHub to correctly close the PR + merge_message_flags += [ + "-m", + "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc)] + for c in commits: + merge_message_flags += ["-m", c] + + run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) + + continue_maybe("Merge complete (local ref %s). Push to %s?" % ( + target_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + merge_hash = run_cmd("git rev-parse %s" % target_branch_name)[:8] + clean_up() + print("Pull request #%s merged!" % pr_num) + print("Merge hash: %s" % merge_hash) + return merge_hash + + +def cherry_pick(pr_num, merge_hash, default_branch): + pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) + if pick_ref == "": + pick_ref = default_branch + + pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) + + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) + run_cmd("git checkout %s" % pick_branch_name) + run_cmd("git cherry-pick -sx %s" % merge_hash) + + continue_maybe("Pick complete (local ref %s). Push to %s?" % ( + pick_branch_name, PUSH_REMOTE_NAME)) + + try: + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) + except Exception as e: + clean_up() + fail("Exception while pushing: %s" % e) + + pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] + clean_up() + + print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) + print("Pick hash: %s" % pick_hash) + return pick_ref + + +def fix_version_from_branch(branch, versions): + # Note: Assumes this is a sorted (newest->oldest) list of un-released versions + if branch == "master": + return versions[0] + else: + branch_ver = branch.replace("branch-", "") + return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] + +def exctract_jira_id(title): + m = re.search(r'^(ARROW-[0-9]+)\b.*$', title) + if m and m.groups > 0: + return m.group(1) + else: + fail("PR title should be prefixed by a jira id \"ARROW-XXX: ...\", found: \"%s\"" % title) + +def check_jira(title): + jira_id = exctract_jira_id(title) + asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + try: + issue = asf_jira.issue(jira_id) + except Exception as e: + fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) + +def resolve_jira(title, merge_branches, comment): + asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, + basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) + + default_jira_id = exctract_jira_id(title) + + jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + if jira_id == "": + jira_id = default_jira_id + + try: + issue = asf_jira.issue(jira_id) + except Exception as e: + fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) + + cur_status = issue.fields.status.name + cur_summary = issue.fields.summary + cur_assignee = issue.fields.assignee + if cur_assignee is None: + cur_assignee = "NOT ASSIGNED!!!" + else: + cur_assignee = cur_assignee.displayName + + if cur_status == "Resolved" or cur_status == "Closed": + fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) + print ("=== JIRA %s ===" % jira_id) + print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( + cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + + versions = asf_jira.project_versions("ARROW") + versions = sorted(versions, key=lambda x: x.name, reverse=True) + versions = filter(lambda x: x.raw['released'] is False, versions) + + default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) + for v in default_fix_versions: + # Handles the case where we have forked a release branch but not yet made the release. + # In this case, if the PR is committed to the master branch and the release branch, we + # only consider the release branch to be the fix version. E.g. it is not valid to have + # both 1.1.0 and 1.0.0 as fix versions. + (major, minor, patch) = v.split(".") + if patch == "0": + previous = "%s.%s.%s" % (major, int(minor) - 1, 0) + if previous in default_fix_versions: + default_fix_versions = filter(lambda x: x != v, default_fix_versions) + default_fix_versions = ",".join(default_fix_versions) + + fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) + if fix_versions == "": + fix_versions = default_fix_versions + fix_versions = fix_versions.replace(" ", "").split(",") + + def get_version_json(version_str): + return filter(lambda v: v.name == version_str, versions)[0].raw + + jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) + + resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] + asf_jira.transition_issue( + jira_id, resolve["id"], fixVersions=jira_fix_versions, comment=comment) + + print "Succesfully resolved %s with fixVersions=%s!" 
% (jira_id, fix_versions) + + +if not JIRA_USERNAME: + JIRA_USERNAME = raw_input("Env JIRA_USERNAME not set, please enter your JIRA username:") + +if not JIRA_PASSWORD: + JIRA_PASSWORD = getpass.getpass("Env JIRA_PASSWORD not set, please enter your JIRA password:") + +branches = get_json("%s/branches" % GITHUB_API_BASE) +branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) +# Assumes branch names can be sorted lexicographically +# Julien: I commented this out as we don't have any "branch-*" branch yet +#latest_branch = sorted(branch_names, reverse=True)[0] + +pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ") +pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) + +url = pr["url"] +title = pr["title"] +check_jira(title) +body = pr["body"] +target_ref = pr["base"]["ref"] +user_login = pr["user"]["login"] +base_ref = pr["head"]["ref"] +pr_repo_desc = "%s/%s" % (user_login, base_ref) + +if pr["merged"] is True: + print "Pull request %s has already been merged, assuming you want to backport" % pr_num + merge_commit_desc = run_cmd([ + 'git', 'log', '--merges', '--first-parent', + '--grep=pull request #%s' % pr_num, '--oneline']).split("\n")[0] + if merge_commit_desc == "": + fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) + + merge_hash = merge_commit_desc[:7] + message = merge_commit_desc[8:] + + print "Found: %s" % message + maybe_cherry_pick(pr_num, merge_hash, latest_branch) + sys.exit(0) + +if not bool(pr["mergeable"]): + msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ + "Continue? (experts only!)" + continue_maybe(msg) + +print ("\n=== Pull Request #%s ===" % pr_num) +print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( + title, pr_repo_desc, target_ref, url)) +continue_maybe("Proceed with merging pull request #%s?" % pr_num) + +merged_refs = [target_ref] + +merge_hash = merge_pr(pr_num, target_ref) + +pick_prompt = "Would you like to pick %s into another branch?" % merge_hash +while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": + merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] + +if JIRA_IMPORTED: + continue_maybe("Would you like to update the associated JIRA?") + jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) + resolve_jira(title, merged_refs, jira_comment) +else: + print "Could not find jira-python library. Run 'sudo pip install jira-python' to install." + print "Exiting without trying to close the associated JIRA." From 1000d110cdc8a699cfb9caaee7772a0a5161538c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 14:00:12 -0800 Subject: [PATCH 011/210] ARROW-36: Remove fixVersions from JIRA resolve code path This one is tricky to test; sorry I missed this on the first go (the JIRA transition code executes after ARROW-13 was merged). 
Author: Wes McKinney Closes #11 from wesm/ARROW-36 and squashes the following commits: 432c17c [Wes McKinney] Remove fixVersions from JIRA resolve code path --- dev/merge_arrow_pr.py | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index ef47dec88c1..fe0bcd13dd8 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -262,38 +262,11 @@ def resolve_jira(title, merge_branches, comment): print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) - versions = asf_jira.project_versions("ARROW") - versions = sorted(versions, key=lambda x: x.name, reverse=True) - versions = filter(lambda x: x.raw['released'] is False, versions) - - default_fix_versions = map(lambda x: fix_version_from_branch(x, versions).name, merge_branches) - for v in default_fix_versions: - # Handles the case where we have forked a release branch but not yet made the release. - # In this case, if the PR is committed to the master branch and the release branch, we - # only consider the release branch to be the fix version. E.g. it is not valid to have - # both 1.1.0 and 1.0.0 as fix versions. - (major, minor, patch) = v.split(".") - if patch == "0": - previous = "%s.%s.%s" % (major, int(minor) - 1, 0) - if previous in default_fix_versions: - default_fix_versions = filter(lambda x: x != v, default_fix_versions) - default_fix_versions = ",".join(default_fix_versions) - - fix_versions = raw_input("Enter comma-separated fix version(s) [%s]: " % default_fix_versions) - if fix_versions == "": - fix_versions = default_fix_versions - fix_versions = fix_versions.replace(" ", "").split(",") - - def get_version_json(version_str): - return filter(lambda v: v.name == version_str, versions)[0].raw - - jira_fix_versions = map(lambda v: get_version_json(v), fix_versions) - - resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] - asf_jira.transition_issue( - jira_id, resolve["id"], fixVersions=jira_fix_versions, comment=comment) - - print "Succesfully resolved %s with fixVersions=%s!" % (jira_id, fix_versions) + resolve = filter(lambda a: a['name'] == "Resolve Issue", + asf_jira.transitions(jira_id))[0] + asf_jira.transition_issue(jira_id, resolve["id"], comment=comment) + + print "Succesfully resolved %s!" % (jira_id) if not JIRA_USERNAME: From e418020852ad4fa148b07f21f5b4d47230fe4c5b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 14:02:53 -0800 Subject: [PATCH 012/210] ARROW-19: Add an externalized MemoryPool interface for use in builder classes Memory management will be an ongoing concern, but this is a stride in the right direction. Applications requiring custom memory management will be able to implement a subclass of MemoryPool; we can evolve its API as user needs evolve. 
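[Editor's note: as a sketch of what that subclass hook enables (illustrative only, not part of this patch; `LoggingPool` is a hypothetical name), an application could wrap the default pool using the three-method interface introduced below and instrument every allocation:]

    #include <cstdint>
    #include <iostream>

    #include "arrow/util/memory-pool.h"
    #include "arrow/util/status.h"

    namespace arrow {

    // Delegates all allocation to the default pool and logs each call.
    class LoggingPool : public MemoryPool {
     public:
      LoggingPool() : wrapped_(GetDefaultMemoryPool()) {}

      Status Allocate(int64_t size, uint8_t** out) override {
        std::cerr << "Allocate(" << size << ")" << std::endl;
        return wrapped_->Allocate(size, out);
      }

      void Free(uint8_t* buffer, int64_t size) override {
        std::cerr << "Free(" << size << ")" << std::endl;
        wrapped_->Free(buffer, size);
      }

      int64_t bytes_allocated() const override {
        return wrapped_->bytes_allocated();
      }

     private:
      MemoryPool* wrapped_;  // not owned
    };

    }  // namespace arrow

[Builders take the pool at construction time, which is exactly the plumbing the diff below threads through make_builder() and ArrayBuilder.]
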
Author: Wes McKinney Closes #8 from wesm/ARROW-19 and squashes the following commits: 08d3895 [Wes McKinney] Some include cleanup e319a36 [Wes McKinney] cpplint fixes abca6eb [Wes McKinney] Add a MemoryPool abstract interface, change builder instances to request memory from pool via Buffer subclass --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/array-test.cc | 10 +++- cpp/src/arrow/array.h | 1 - cpp/src/arrow/builder.cc | 2 +- cpp/src/arrow/builder.h | 22 +++++--- cpp/src/arrow/types/construct.cc | 13 +++-- cpp/src/arrow/types/construct.h | 4 +- cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/list.h | 7 ++- cpp/src/arrow/types/primitive-test.cc | 5 +- cpp/src/arrow/types/primitive.h | 12 ++-- cpp/src/arrow/types/string-test.cc | 29 +++++----- cpp/src/arrow/types/string.h | 9 ++- cpp/src/arrow/types/struct.cc | 1 + cpp/src/arrow/types/test-common.h | 8 ++- cpp/src/arrow/types/union.cc | 1 + cpp/src/arrow/util/CMakeLists.txt | 3 + cpp/src/arrow/util/bit-util.cc | 2 +- cpp/src/arrow/util/bit-util.h | 3 +- cpp/src/arrow/util/buffer-test.cc | 6 +- cpp/src/arrow/util/buffer.cc | 36 ++++++++---- cpp/src/arrow/util/buffer.h | 36 ++++++++---- cpp/src/arrow/util/memory-pool-test.cc | 47 ++++++++++++++++ cpp/src/arrow/util/memory-pool.cc | 78 ++++++++++++++++++++++++++ cpp/src/arrow/util/memory-pool.h | 41 ++++++++++++++ 25 files changed, 301 insertions(+), 79 deletions(-) create mode 100644 cpp/src/arrow/util/memory-pool-test.cc create mode 100644 cpp/src/arrow/util/memory-pool.cc create mode 100644 cpp/src/arrow/util/memory-pool.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5ddd9dae3fe..d2c840abfe8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -434,7 +434,7 @@ if (UNIX) add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py --verbose=2 --linelength=90 - --filter=-whitespace/comments,-readability/todo,-build/header_guard + --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h`) endif (UNIX) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 5ecf91624fe..16afb9bef34 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -28,6 +29,8 @@ #include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" using std::string; using std::vector; @@ -41,8 +44,10 @@ static TypePtr int32_nn = TypePtr(new Int32Type(false)); class TestArray : public ::testing::Test { public: void SetUp() { - auto data = std::make_shared(); - auto nulls = std::make_shared(); + pool_ = GetDefaultMemoryPool(); + + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); ASSERT_OK(data->Resize(400)); ASSERT_OK(nulls->Resize(128)); @@ -51,6 +56,7 @@ class TestArray : public ::testing::Test { } protected: + MemoryPool* pool_; std::unique_ptr arr_; }; diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c95450d12a4..0eaa28d528e 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -19,7 +19,6 @@ #define ARROW_ARRAY_H #include -#include #include #include "arrow/type.h" diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1fd74719283..cb850673150 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -30,7 +30,7 @@ Status ArrayBuilder::Init(int64_t capacity) { if (nullable_) { int64_t to_alloc = 
util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared(); + nulls_ = std::make_shared(pool_); RETURN_NOT_OK(nulls_->Resize(to_alloc)); null_bits_ = nulls_->mutable_data(); memset(null_bits_, 0, to_alloc); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index b43668af77c..456bb04ae09 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -23,25 +23,27 @@ #include #include "arrow/type.h" -#include "arrow/util/buffer.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" namespace arrow { class Array; +class MemoryPool; +class PoolBuffer; static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8; // Base class for all data array builders class ArrayBuilder { public: - explicit ArrayBuilder(const TypePtr& type) - : type_(type), - nullable_(type_->nullable), - nulls_(nullptr), null_bits_(nullptr), - length_(0), - capacity_(0) {} + explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : + pool_(pool), + type_(type), + nullable_(type_->nullable), + nulls_(nullptr), null_bits_(nullptr), + length_(0), + capacity_(0) {} virtual ~ArrayBuilder() {} @@ -71,18 +73,20 @@ class ArrayBuilder { // this function responsibly. Status Advance(int64_t elements); - const std::shared_ptr& nulls() const { return nulls_;} + const std::shared_ptr& nulls() const { return nulls_;} // Creates new array object to hold the contents of the builder and transfers // ownership of the data virtual Status ToArray(Array** out) = 0; protected: + MemoryPool* pool_; + TypePtr type_; bool nullable_; // If the type is not nullable, then null_ is nullptr after initialization - std::shared_ptr nulls_; + std::shared_ptr nulls_; uint8_t* null_bits_; // Array length, so far. Also, the index of the next element to be added diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 5176cafd3ba..e1bb990063c 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -32,12 +32,13 @@ class ArrayBuilder; // Initially looked at doing this with vtables, but shared pointers makes it // difficult -#define BUILDER_CASE(ENUM, BuilderType) \ - case TypeEnum::ENUM: \ - *out = static_cast(new BuilderType(type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case TypeEnum::ENUM: \ + *out = static_cast(new BuilderType(pool, type)); \ return Status::OK(); -Status make_builder(const TypePtr& type, ArrayBuilder** out) { +Status make_builder(MemoryPool* pool, const TypePtr& type, + ArrayBuilder** out) { switch (type->type) { BUILDER_CASE(UINT8, UInt8Builder); BUILDER_CASE(INT8, Int8Builder); @@ -59,10 +60,10 @@ Status make_builder(const TypePtr& type, ArrayBuilder** out) { { ListType* list_type = static_cast(type.get()); ArrayBuilder* value_builder; - RETURN_NOT_OK(make_builder(list_type->value_type, &value_builder)); + RETURN_NOT_OK(make_builder(pool, list_type->value_type, &value_builder)); // The ListBuilder takes ownership of the value_builder - ListBuilder* builder = new ListBuilder(type, value_builder); + ListBuilder* builder = new ListBuilder(pool, type, value_builder); *out = static_cast(builder); return Status::OK(); } diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index c0bfedd27d6..b5ba436f787 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -23,9 +23,11 @@ namespace arrow { class ArrayBuilder; +class MemoryPool; class Status; -Status make_builder(const TypePtr& type, ArrayBuilder** out); +Status make_builder(MemoryPool* pool, const TypePtr& type, + ArrayBuilder** out); } // 
namespace arrow diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 47673ff898b..abfc8a31b0d 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -76,7 +76,7 @@ class TestListBuilder : public TestBuilder { type_ = TypePtr(new ListType(value_type_)); ArrayBuilder* tmp; - ASSERT_OK(make_builder(type_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast(tmp)); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 0f1116257c5..4ca0f13d53c 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -34,6 +34,8 @@ namespace arrow { +class MemoryPool; + struct ListType : public DataType { // List can contain any other logical value type TypePtr value_type; @@ -100,8 +102,9 @@ class ListArray : public Array { // have been appended to the child array) class ListBuilder : public Int32Builder { public: - ListBuilder(const TypePtr& type, ArrayBuilder* value_builder) - : Int32Builder(type) { + ListBuilder(MemoryPool* pool, const TypePtr& type, + ArrayBuilder* value_builder) + : Int32Builder(pool, type) { value_builder_.reset(value_builder); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 12968608094..3484294a39f 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -104,10 +103,10 @@ class TestPrimitiveBuilder : public TestBuilder { type_nn_ = Attrs::type(false); ArrayBuilder* tmp; - ASSERT_OK(make_builder(type_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast(tmp)); - ASSERT_OK(make_builder(type_nn_, &tmp)); + ASSERT_OK(make_builder(pool_, type_nn_, &tmp)); builder_nn_.reset(static_cast(tmp)); } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index a41911224e0..c5ae0f78a99 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -20,6 +20,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -31,6 +32,8 @@ namespace arrow { +class MemoryPool; + template struct PrimitiveType : public DataType { explicit PrimitiveType(bool nullable = true) @@ -113,8 +116,9 @@ class PrimitiveBuilder : public ArrayBuilder { public: typedef typename Type::c_type T; - explicit PrimitiveBuilder(const TypePtr& type) - : ArrayBuilder(type), values_(nullptr) { + explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : + ArrayBuilder(pool, type), + values_(nullptr) { elsize_ = sizeof(T); } @@ -139,7 +143,7 @@ class PrimitiveBuilder : public ArrayBuilder { Status Init(int64_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared(); + values_ = std::make_shared(pool_); return values_->Resize(capacity * elsize_); } @@ -231,7 +235,7 @@ class PrimitiveBuilder : public ArrayBuilder { } protected: - std::shared_ptr values_; + std::shared_ptr values_; int64_t elsize_; }; diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6dba3fdcbb6..a2d87ead59c 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -31,12 +31,9 @@ #include "arrow/types/test-common.h" #include "arrow/util/status.h" -using std::string; -using std::unique_ptr; -using std::vector; - namespace arrow { +class Buffer; TEST(TypesTest, TestCharType) { CharType t1(5); @@ -45,7 +42,7 @@ TEST(TypesTest, TestCharType) { ASSERT_TRUE(t1.nullable); 
ASSERT_EQ(t1.size, 5); - ASSERT_EQ(t1.ToString(), string("char(5)")); + ASSERT_EQ(t1.ToString(), std::string("char(5)")); // Test copy constructor CharType t2 = t1; @@ -63,7 +60,7 @@ TEST(TypesTest, TestVarcharType) { ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.physical_type.size, 6); - ASSERT_EQ(t1.ToString(), string("varchar(5)")); + ASSERT_EQ(t1.ToString(), std::string("varchar(5)")); // Test copy constructor VarcharType t2 = t1; @@ -78,7 +75,7 @@ TEST(TypesTest, TestStringType) { StringType str_nn(false); ASSERT_EQ(str.type, TypeEnum::STRING); - ASSERT_EQ(str.name(), string("string")); + ASSERT_EQ(str.name(), std::string("string")); ASSERT_TRUE(str.nullable); ASSERT_FALSE(str_nn.nullable); } @@ -111,11 +108,11 @@ class TestStringContainer : public ::testing::Test { } protected: - vector offsets_; - vector chars_; - vector nulls_; + std::vector offsets_; + std::vector chars_; + std::vector nulls_; - vector expected_; + std::vector expected_; std::shared_ptr value_buf_; std::shared_ptr offsets_buf_; @@ -175,7 +172,7 @@ class TestStringBuilder : public TestBuilder { type_ = TypePtr(new StringType()); ArrayBuilder* tmp; - ASSERT_OK(make_builder(type_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast(tmp)); } @@ -188,8 +185,8 @@ class TestStringBuilder : public TestBuilder { protected: TypePtr type_; - unique_ptr builder_; - unique_ptr result_; + std::unique_ptr builder_; + std::unique_ptr result_; }; TEST_F(TestStringBuilder, TestAttrs) { @@ -197,8 +194,8 @@ TEST_F(TestStringBuilder, TestAttrs) { } TEST_F(TestStringBuilder, TestScalarAppend) { - vector strings = {"a", "bb", "", "", "ccc"}; - vector is_null = {0, 0, 0, 1, 0}; + std::vector strings = {"a", "bb", "", "", "ccc"}; + std::vector is_null = {0, 0, 0, 1, 0}; int N = strings.size(); int reps = 1000; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 30d6e247db1..d0690d9a7d2 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -27,12 +27,13 @@ #include "arrow/type.h" #include "arrow/types/integer.h" #include "arrow/types/list.h" -#include "arrow/util/buffer.h" #include "arrow/util/status.h" namespace arrow { class ArrayBuilder; +class Buffer; +class MemoryPool; struct CharType : public DataType { int size; @@ -148,8 +149,9 @@ class StringArray : public ListArray { class StringBuilder : public ListBuilder { public: - explicit StringBuilder(const TypePtr& type) : - ListBuilder(type, static_cast(new UInt8Builder(value_type_))) { + explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : + ListBuilder(pool, type, + static_cast(new UInt8Builder(pool, value_type_))) { byte_builder_ = static_cast(value_builder_.get()); } @@ -171,6 +173,7 @@ class StringBuilder : public ListBuilder { } protected: + std::shared_ptr list_builder_; UInt8Builder* byte_builder_; static TypePtr value_type_; diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index b7be5d8245f..a245656b516 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,6 +17,7 @@ #include "arrow/types/struct.h" +#include #include #include #include diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 267e48a7f25..3ecb0dec7c0 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -25,6 +25,7 @@ #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/util/memory-pool.h" using std::unique_ptr; @@ -33,12 +34,15 @@ namespace arrow { class TestBuilder : public ::testing::Test 
{ public: void SetUp() { + pool_ = GetDefaultMemoryPool(); type_ = TypePtr(new UInt8Type()); type_nn_ = TypePtr(new UInt8Type(false)); - builder_.reset(new UInt8Builder(type_)); - builder_nn_.reset(new UInt8Builder(type_nn_)); + builder_.reset(new UInt8Builder(pool_, type_)); + builder_nn_.reset(new UInt8Builder(pool_, type_nn_)); } protected: + MemoryPool* pool_; + TypePtr type_; TypePtr type_nn_; unique_ptr builder_; diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc index 54f41a7eef6..db3f81795ea 100644 --- a/cpp/src/arrow/types/union.cc +++ b/cpp/src/arrow/types/union.cc @@ -17,6 +17,7 @@ #include "arrow/types/union.h" +#include #include #include #include diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index ff8db6a0410..c53f307c9f5 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -22,6 +22,7 @@ set(UTIL_SRCS bit-util.cc buffer.cc + memory-pool.cc status.cc ) @@ -39,6 +40,7 @@ install(FILES bit-util.h buffer.h macros.h + memory-pool.h status.h DESTINATION include/arrow/util) @@ -79,3 +81,4 @@ endif() ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(buffer-test) +ADD_ARROW_TEST(memory-pool-test) diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index d2ddd6584a8..dbac0a42527 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -33,7 +33,7 @@ Status util::bytes_to_bits(uint8_t* bytes, int length, std::shared_ptr* out) { int bit_length = ceil_byte(length) / 8; - auto buffer = std::make_shared(); + auto buffer = std::make_shared(); RETURN_NOT_OK(buffer->Resize(bit_length)); memset(buffer->mutable_data(), 0, bit_length); bytes_to_bits(bytes, length, buffer->mutable_data()); diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 61dffa30423..9ae6127c5ea 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -22,10 +22,9 @@ #include #include -#include "arrow/util/buffer.h" - namespace arrow { +class Buffer; class Status; namespace util { diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index edfd08e850b..9f1fd91432b 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -16,10 +16,8 @@ // under the License. 
#include -#include #include #include -#include #include #include "arrow/test-util.h" @@ -34,7 +32,7 @@ class TestBuffer : public ::testing::Test { }; TEST_F(TestBuffer, Resize) { - OwnedMutableBuffer buf; + PoolBuffer buf; ASSERT_EQ(0, buf.size()); ASSERT_OK(buf.Resize(100)); @@ -49,7 +47,7 @@ TEST_F(TestBuffer, Resize) { TEST_F(TestBuffer, ResizeOOM) { // realloc fails, even though there may be no explicit limit - OwnedMutableBuffer buf; + PoolBuffer buf; ASSERT_OK(buf.Resize(100)); int64_t to_alloc = std::numeric_limits::max(); ASSERT_RAISES(OutOfMemory, buf.Resize(to_alloc)); diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 2fb34d59e0b..3f3807d4e20 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -19,6 +19,7 @@ #include +#include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { @@ -34,19 +35,34 @@ std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } -OwnedMutableBuffer::OwnedMutableBuffer() : - MutableBuffer(nullptr, 0) {} +PoolBuffer::PoolBuffer(MemoryPool* pool) : + ResizableBuffer(nullptr, 0) { + if (pool == nullptr) { + pool = GetDefaultMemoryPool(); + } + pool_ = pool; +} -Status OwnedMutableBuffer::Resize(int64_t new_size) { - size_ = new_size; - try { - buffer_owner_.resize(new_size); - } catch (const std::bad_alloc& e) { - return Status::OutOfMemory("resize failed"); +Status PoolBuffer::Reserve(int64_t new_capacity) { + if (!mutable_data_ || new_capacity > capacity_) { + uint8_t* new_data; + if (mutable_data_) { + RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); + memcpy(new_data, mutable_data_, size_); + pool_->Free(mutable_data_, capacity_); + } else { + RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); + } + mutable_data_ = new_data; + data_ = mutable_data_; + capacity_ = new_capacity; } - data_ = buffer_owner_.data(); - mutable_data_ = buffer_owner_.data(); + return Status::OK(); +} +Status PoolBuffer::Resize(int64_t new_size) { + RETURN_NOT_OK(Reserve(new_size)); + size_ = new_size; return Status::OK(); } diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 3e4183936b3..8704723eb0a 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -19,15 +19,14 @@ #define ARROW_UTIL_BUFFER_H #include -#include #include #include -#include #include "arrow/util/macros.h" namespace arrow { +class MemoryPool; class Status; // ---------------------------------------------------------------------- @@ -115,17 +114,34 @@ class MutableBuffer : public Buffer { uint8_t* mutable_data_; }; -// A MutableBuffer whose memory is owned by the class instance. For example, -// for reading data out of files that you want to deallocate when this class is -// garbage-collected -class OwnedMutableBuffer : public MutableBuffer { +class ResizableBuffer : public MutableBuffer { public: - OwnedMutableBuffer(); - Status Resize(int64_t new_size); + // Change buffer reported size to indicated size, allocating memory if + // necessary + virtual Status Resize(int64_t new_size) = 0; + + // Ensure that buffer has enough memory allocated to fit the indicated + // capacity. 
Does not change buffer's reported size + virtual Status Reserve(int64_t new_capacity) = 0; + + protected: + ResizableBuffer(uint8_t* data, int64_t size) : + MutableBuffer(data, size), + capacity_(size) {} + + int64_t capacity_; +}; + +// A Buffer whose lifetime is tied to a particular MemoryPool +class PoolBuffer : public ResizableBuffer { + public: + explicit PoolBuffer(MemoryPool* pool = nullptr); + + virtual Status Resize(int64_t new_size); + virtual Status Reserve(int64_t new_capacity); private: - // TODO: aligned allocations - std::vector buffer_owner_; + MemoryPool* pool_; }; } // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc new file mode 100644 index 00000000000..954b5f951b5 --- /dev/null +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/test-util.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { + +TEST(DefaultMemoryPool, MemoryTracking) { + MemoryPool* pool = GetDefaultMemoryPool(); + + uint8_t* data; + ASSERT_OK(pool->Allocate(100, &data)); + ASSERT_EQ(100, pool->bytes_allocated()); + + pool->Free(data, 100); + ASSERT_EQ(0, pool->bytes_allocated()); +} + +TEST(DefaultMemoryPool, OOM) { + MemoryPool* pool = GetDefaultMemoryPool(); + + uint8_t* data; + int64_t to_alloc = std::numeric_limits::max(); + ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc new file mode 100644 index 00000000000..5820346e5a7 --- /dev/null +++ b/cpp/src/arrow/util/memory-pool.cc @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/memory-pool.h" + +#include +#include +#include + +#include "arrow/util/status.h" + +namespace arrow { + +MemoryPool::~MemoryPool() {} + +class InternalMemoryPool : public MemoryPool { + public: + InternalMemoryPool() : bytes_allocated_(0) {} + virtual ~InternalMemoryPool(); + + Status Allocate(int64_t size, uint8_t** out) override; + + void Free(uint8_t* buffer, int64_t size) override; + + int64_t bytes_allocated() const override; + + private: + mutable std::mutex pool_lock_; + int64_t bytes_allocated_; +}; + +Status InternalMemoryPool::Allocate(int64_t size, uint8_t** out) { + std::lock_guard guard(pool_lock_); + *out = static_cast(std::malloc(size)); + if (*out == nullptr) { + std::stringstream ss; + ss << "malloc of size " << size << " failed"; + return Status::OutOfMemory(ss.str()); + } + + bytes_allocated_ += size; + + return Status::OK(); +} + +int64_t InternalMemoryPool::bytes_allocated() const { + std::lock_guard guard(pool_lock_); + return bytes_allocated_; +} + +void InternalMemoryPool::Free(uint8_t* buffer, int64_t size) { + std::lock_guard guard(pool_lock_); + std::free(buffer); + bytes_allocated_ -= size; +} + +InternalMemoryPool::~InternalMemoryPool() {} + +MemoryPool* GetDefaultMemoryPool() { + static InternalMemoryPool default_memory_pool; + return &default_memory_pool; +} + +} // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.h b/cpp/src/arrow/util/memory-pool.h new file mode 100644 index 00000000000..a7cb10dae17 --- /dev/null +++ b/cpp/src/arrow/util/memory-pool.h @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_MEMORY_POOL_H +#define ARROW_UTIL_MEMORY_POOL_H + +#include + +namespace arrow { + +class Status; + +class MemoryPool { + public: + virtual ~MemoryPool(); + + virtual Status Allocate(int64_t size, uint8_t** out) = 0; + virtual void Free(uint8_t* buffer, int64_t size) = 0; + + virtual int64_t bytes_allocated() const = 0; +}; + +MemoryPool* GetDefaultMemoryPool(); + +} // namespace arrow + +#endif // ARROW_UTIL_MEMORY_POOL_H From b88b69e204b59fa8f19cd20dcb6c091fe9bde3a9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 14:56:31 -0800 Subject: [PATCH 013/210] ARROW-20: Add null_count_ member to array containers, remove nullable_ member Based off of ARROW-19. After some contemplation / discussion, I believe it would be better to track nullability at the schema metadata level (if at all!) rather than making it a property of the data structures. This allows the data containers to be "plain ol' data" and thus both nullable data with `null_count == 0` and non-nullable data (implicitly `null_count == 0`) can be treated as semantically equivalent in algorithms code. 
If it is deemed useful we can validate (cheaply) that physical data meets the metadata requirements (e.g. non-nullable type metadata cannot be associated with data containers having nulls). Author: Wes McKinney Closes #9 from wesm/ARROW-20 and squashes the following commits: 98be016 [Wes McKinney] ARROW-20: Add null_count_ member to Array containers, remove nullable member --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/array-test.cc | 57 ++++++++-------- cpp/src/arrow/array.cc | 11 ++-- cpp/src/arrow/array.h | 37 +++++++---- cpp/src/arrow/builder.cc | 35 +++++----- cpp/src/arrow/builder.h | 29 ++++---- cpp/src/arrow/test-util.h | 10 +++ cpp/src/arrow/type.h | 12 ++-- cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/datetime.h | 12 ++-- cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 12 +--- cpp/src/arrow/types/list.h | 46 ++++++------- cpp/src/arrow/types/primitive-test.cc | 34 +++++----- cpp/src/arrow/types/primitive.cc | 11 ++-- cpp/src/arrow/types/primitive.h | 95 +++++++++++++++------------ cpp/src/arrow/types/string-test.cc | 31 ++++----- cpp/src/arrow/types/string.cc | 2 +- cpp/src/arrow/types/string.h | 43 ++++++------ cpp/src/arrow/types/struct-test.cc | 6 +- cpp/src/arrow/types/struct.h | 5 +- cpp/src/arrow/types/test-common.h | 4 +- cpp/src/arrow/types/union.h | 10 ++- cpp/src/arrow/util/bit-util.cc | 4 +- cpp/src/arrow/util/bit-util.h | 4 +- 25 files changed, 265 insertions(+), 253 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d2c840abfe8..f0eb73dc413 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,7 +92,7 @@ endif() # For CMAKE_BUILD_TYPE=Release # -O3: Enable all compiler optimizations # -g: Enable symbols for profiler tools (TODO: remove for shipping) -set(CXX_FLAGS_DEBUG "-ggdb") +set(CXX_FLAGS_DEBUG "-ggdb -O0") set(CXX_FLAGS_FASTDEBUG "-ggdb -O1") set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG") diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 16afb9bef34..df827aaa113 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include "arrow/array.h" @@ -32,60 +31,60 @@ #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" -using std::string; -using std::vector; - namespace arrow { static TypePtr int32 = TypePtr(new Int32Type()); -static TypePtr int32_nn = TypePtr(new Int32Type(false)); - class TestArray : public ::testing::Test { public: void SetUp() { pool_ = GetDefaultMemoryPool(); - - auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); - - ASSERT_OK(data->Resize(400)); - ASSERT_OK(nulls->Resize(128)); - - arr_.reset(new Int32Array(100, data, nulls)); } protected: MemoryPool* pool_; - std::unique_ptr arr_; }; -TEST_F(TestArray, TestNullable) { - std::shared_ptr tmp = arr_->data(); - std::unique_ptr arr_nn(new Int32Array(100, tmp)); +TEST_F(TestArray, TestNullCount) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); - ASSERT_TRUE(arr_->nullable()); - ASSERT_FALSE(arr_nn->nullable()); + std::unique_ptr arr(new Int32Array(100, data, 10, nulls)); + ASSERT_EQ(10, arr->null_count()); + + std::unique_ptr arr_no_nulls(new Int32Array(100, data)); + ASSERT_EQ(0, arr_no_nulls->null_count()); } TEST_F(TestArray, TestLength) { - ASSERT_EQ(arr_->length(), 100); + auto data = std::make_shared(pool_); + std::unique_ptr arr(new Int32Array(100, data)); + ASSERT_EQ(arr->length(), 100); } TEST_F(TestArray, TestIsNull) { - vector nulls = {1, 0, 1, 1, 0, 1, 
0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 0, 1}; + std::vector nulls = {1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 0, 1}; + int32_t null_count = 0; + for (uint8_t x : nulls) { + if (x > 0) ++null_count; + } - std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), nulls.size()); + std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), + nulls.size()); std::unique_ptr arr; - arr.reset(new Array(int32, nulls.size(), null_buf)); + arr.reset(new Array(int32, nulls.size(), null_count, null_buf)); + + ASSERT_EQ(null_count, arr->null_count()); + ASSERT_EQ(5, null_buf->size()); + + ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get())); - ASSERT_EQ(null_buf->size(), 5); for (size_t i = 0; i < nulls.size(); ++i) { ASSERT_EQ(static_cast(nulls[i]), arr->IsNull(i)); } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 1726a2f27d8..ee4ef66d11e 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -17,6 +17,8 @@ #include "arrow/array.h" +#include + #include "arrow/util/buffer.h" namespace arrow { @@ -24,18 +26,17 @@ namespace arrow { // ---------------------------------------------------------------------- // Base array class -Array::Array(const TypePtr& type, int64_t length, +Array::Array(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { - Init(type, length, nulls); + Init(type, length, null_count, nulls); } -void Array::Init(const TypePtr& type, int64_t length, +void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { type_ = type; length_ = length; + null_count_ = null_count; nulls_ = nulls; - - nullable_ = type->nullable; if (nulls_) { null_bits_ = nulls_->data(); } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 0eaa28d528e..3d748c1bad6 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -30,38 +30,49 @@ namespace arrow { class Buffer; // Immutable data array with some logical type and some length. Any memory is -// owned by the respective Buffer instance (or its parents). May or may not be -// nullable. +// owned by the respective Buffer instance (or its parents). // -// The base class only has a null array (if the data type is nullable) +// The base class is only required to have a nulls buffer if the null count is +// greater than 0 // // Any buffers used to initialize the array have their references "stolen". If // you wish to use the buffer beyond the lifetime of the array, you need to // explicitly increment its reference count class Array { public: - Array() : length_(0), nulls_(nullptr), null_bits_(nullptr) {} - Array(const TypePtr& type, int64_t length, + Array() : + null_count_(0), + length_(0), + nulls_(nullptr), + null_bits_(nullptr) {} + + Array(const TypePtr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); virtual ~Array() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr& nulls); + void Init(const TypePtr& type, int32_t length, int32_t null_count, + const std::shared_ptr& nulls); - // Determine if a slot if null. For inner loops. Does *not* boundscheck - bool IsNull(int64_t i) const { - return nullable_ && util::get_bit(null_bits_, i); + // Determine if a slot is null. For inner loops. 
Does *not* boundscheck + bool IsNull(int i) const { + return null_count_ > 0 && util::get_bit(null_bits_, i); } - int64_t length() const { return length_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + const TypePtr& type() const { return type_;} TypeEnum type_enum() const { return type_->type;} + const std::shared_ptr& nulls() const { + return nulls_; + } + protected: TypePtr type_; - bool nullable_; - int64_t length_; + int32_t null_count_; + int32_t length_; std::shared_ptr nulls_; const uint8_t* null_bits_; diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index cb850673150..ba70add1551 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,34 +25,29 @@ namespace arrow { -Status ArrayBuilder::Init(int64_t capacity) { +Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; - - if (nullable_) { - int64_t to_alloc = util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared(pool_); - RETURN_NOT_OK(nulls_->Resize(to_alloc)); - null_bits_ = nulls_->mutable_data(); - memset(null_bits_, 0, to_alloc); - } + int32_t to_alloc = util::ceil_byte(capacity) / 8; + nulls_ = std::make_shared(pool_); + RETURN_NOT_OK(nulls_->Resize(to_alloc)); + null_bits_ = nulls_->mutable_data(); + memset(null_bits_, 0, to_alloc); return Status::OK(); } -Status ArrayBuilder::Resize(int64_t new_bits) { - if (nullable_) { - int64_t new_bytes = util::ceil_byte(new_bits) / 8; - int64_t old_bytes = nulls_->size(); - RETURN_NOT_OK(nulls_->Resize(new_bytes)); - null_bits_ = nulls_->mutable_data(); - if (old_bytes < new_bytes) { - memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); - } +Status ArrayBuilder::Resize(int32_t new_bits) { + int32_t new_bytes = util::ceil_byte(new_bits) / 8; + int32_t old_bytes = nulls_->size(); + RETURN_NOT_OK(nulls_->Resize(new_bytes)); + null_bits_ = nulls_->mutable_data(); + if (old_bytes < new_bytes) { + memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); } -Status ArrayBuilder::Advance(int64_t elements) { - if (nullable_ && length_ + elements > capacity_) { +Status ArrayBuilder::Advance(int32_t elements) { + if (length_ + elements > capacity_) { return Status::Invalid("Builder must be expanded"); } length_ += elements; diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 456bb04ae09..491b9133d2c 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -32,7 +32,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int64_t MIN_BUILDER_CAPACITY = 1 << 8; +static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8; // Base class for all data array builders class ArrayBuilder { @@ -40,8 +40,9 @@ class ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), type_(type), - nullable_(type_->nullable), - nulls_(nullptr), null_bits_(nullptr), + nulls_(nullptr), + null_count_(0), + null_bits_(nullptr), length_(0), capacity_(0) {} @@ -57,21 +58,21 @@ class ArrayBuilder { return children_.size(); } - int64_t length() const { return length_;} - int64_t capacity() const { return capacity_;} - bool nullable() const { return nullable_;} + int32_t length() const { return length_;} + int32_t null_count() const { return null_count_;} + int32_t capacity() const { return capacity_;} // Allocates requires memory at this level, but children need to be // initialized independently - Status Init(int64_t capacity); + Status Init(int32_t capacity); - // Resizes the 
nulls array (if nullable) - Status Resize(int64_t new_bits); + // Resizes the nulls array + Status Resize(int32_t new_bits); // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. The caller is responsible for // ensuring the advanced region actually contains valid, initialized data. - Status Advance(int64_t elements); + Status Advance(int32_t elements); const std::shared_ptr<PoolBuffer>& nulls() const { return nulls_;} @@ -83,15 +84,15 @@ MemoryPool* pool_; TypePtr type_; - bool nullable_; - // If the type is not nullable, then null_ is nullptr after initialization + // When nulls are first appended to the builder, the null bitmap is allocated std::shared_ptr<PoolBuffer> nulls_; + int32_t null_count_; uint8_t* null_bits_; // Array length, so far. Also, the index of the next element to be added - int64_t length_; - int64_t capacity_; + int32_t length_; + int32_t capacity_; // Child value array builders. These are owned by this class std::vector<std::unique_ptr<ArrayBuilder> > children_;
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2233a4f832a..0898c8e3e3a 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -84,6 +84,16 @@ void random_nulls(int64_t n, double pct_null, std::vector<uint8_t>* nulls) { } } +static inline int null_count(const std::vector<uint8_t>& nulls) { + int result = 0; + for (size_t i = 0; i < nulls.size(); ++i) { + if (nulls[i] > 0) { + ++result; + } + } + return result; +} + std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) { std::shared_ptr<Buffer> out;
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 220f99f4e88..12f19604c68 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -57,11 +57,9 @@ struct LayoutType { // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. 
a // timestamp encoded as an int64) -// -// Any data type can be nullable enum class TypeEnum: char { - // A degerate NULL type represented as 0 bytes/bits + // A degenerate NULL type represented as 0 bytes/bits NA = 0, // Little-endian integer types @@ -138,14 +136,12 @@ enum class TypeEnum: char { struct DataType { TypeEnum type; - bool nullable; - explicit DataType(TypeEnum type, bool nullable = true) - : type(type), nullable(nullable) {} + explicit DataType(TypeEnum type) + : type(type) {} virtual bool Equals(const DataType* other) { - return (this == other) || (this->type == other->type && - this->nullable == other->nullable); + return this == other || this->type == other->type; } virtual std::string ToString() const = 0; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 59ba6141941..094b63f2898 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -29,7 +29,7 @@ template struct CollectionType : public DataType { std::vector child_types_; - explicit CollectionType(bool nullable = true) : DataType(T, nullable) {} + CollectionType() : DataType(T) {} const TypePtr& child(int i) const { return child_types_[i]; diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index b4d62523c41..d90883cb018 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,12 +31,12 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : DataType(TypeEnum::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(TypeEnum::DATE), unit(unit) {} DateType(const DateType& other) - : DateType(other.unit, other.nullable) {} + : DateType(other.unit) {} static char const *name() { return "date"; @@ -58,12 +58,12 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(TypeEnum::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(TypeEnum::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) - : TimestampType(other.unit, other.nullable) {} + : TimestampType(other.unit) {} static char const *name() { return "timestamp"; diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index 91fd132408f..6c2b097a737 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true, bool nullable = true) - : DataType(TypeEnum::JSON_SCALAR, nullable), + explicit JSONScalar(bool dense = true) + : DataType(TypeEnum::JSON_SCALAR), dense(dense) {} }; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index abfc8a31b0d..1d9ddbe607a 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -44,11 +44,7 @@ TEST(TypesTest, TestListType) { std::shared_ptr vt = std::make_shared(); ListType list_type(vt); - ListType list_type_nn(vt, false); - ASSERT_EQ(list_type.type, TypeEnum::LIST); - ASSERT_TRUE(list_type.nullable); - ASSERT_FALSE(list_type_nn.nullable); ASSERT_EQ(list_type.name(), string("list")); ASSERT_EQ(list_type.ToString(), string("list")); @@ -132,8 +128,8 @@ TEST_F(TestListBuilder, TestBasics) { Done(); - ASSERT_TRUE(result_->nullable()); - ASSERT_TRUE(result_->values()->nullable()); + ASSERT_EQ(1, result_->null_count()); + ASSERT_EQ(0, result_->values()->null_count()); 
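// [Editorial sketch, not part of the original patch.] The expected offsets
// checked below illustrate how ListArray encodes list boundaries: an array
// of length n carries n + 1 int32 offsets into the child value array, and
// slot i spans [offsets[i], offsets[i + 1]). A null or empty slot simply
// repeats the previous offset, so with offsets {0, 3, 3, 7} slot 0 holds 3
// values, slot 1 holds none (and is null here, per the bitmap), and slot 2
// holds 4. A hypothetical helper expressing that rule:
//
//   static inline int32_t list_slot_length(const int32_t* offsets, int i) {
//     return offsets[i + 1] - offsets[i];
//   }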
ASSERT_EQ(3, result_->length()); vector ex_offsets = {0, 3, 3, 7}; @@ -153,10 +149,6 @@ TEST_F(TestListBuilder, TestBasics) { } } -TEST_F(TestListBuilder, TestBasicsNonNullable) { -} - - TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 4ca0f13d53c..4190b53df01 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -40,8 +40,8 @@ struct ListType : public DataType { // List can contain any other logical value type TypePtr value_type; - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(TypeEnum::LIST, nullable), + explicit ListType(const TypePtr& value_type) + : DataType(TypeEnum::LIST), value_type(value_type) {} static char const *name() { @@ -56,21 +56,25 @@ class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} - ListArray(const TypePtr& type, int64_t length, std::shared_ptr offsets, - const ArrayPtr& values, std::shared_ptr nulls = nullptr) { - Init(type, length, offsets, values, nulls); + ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr nulls = nullptr) { + Init(type, length, offsets, values, null_count, nulls); } virtual ~ListArray() {} - void Init(const TypePtr& type, int64_t length, std::shared_ptr offsets, - const ArrayPtr& values, std::shared_ptr nulls = nullptr) { + void Init(const TypePtr& type, int32_t length, std::shared_ptr offsets, + const ArrayPtr& values, + int32_t null_count = 0, + std::shared_ptr nulls = nullptr) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast(offset_buf_->data()); values_ = values; - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); } // Return a shared pointer in case the requestor desires to share ownership @@ -108,7 +112,7 @@ class ListBuilder : public Int32Builder { value_builder_.reset(value_builder); } - Status Init(int64_t elements) { + Status Init(int32_t elements) { // One more than requested. 
// // XXX: This is slightly imprecise, because we might trigger null mask @@ -116,7 +120,7 @@ class ListBuilder : public Int32Builder { return Int32Builder::Init(elements + 1); } - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // Need space for the end offset RETURN_NOT_OK(Int32Builder::Resize(capacity + 1)); @@ -129,18 +133,15 @@ class ListBuilder : public Int32Builder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(T* values, int32_t length, uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; @@ -159,9 +160,10 @@ class ListBuilder : public Int32Builder { raw_buffer()[length_] = child_values->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), nulls_); + out->Init(type_, length_, values_, ArrayPtr(child_values), + null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -181,10 +183,10 @@ class ListBuilder : public Int32Builder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } - raw_buffer()[length_++] = value_builder_->length(); return Status::OK(); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 3484294a39f..93634432d5c 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -53,15 +53,12 @@ TEST(TypesTest, TestBytesType) { #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ - KLASS tp_nn(false); \ \ ASSERT_EQ(tp.type, TypeEnum::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ - ASSERT_TRUE(tp.nullable); \ - ASSERT_FALSE(tp_nn.nullable); \ \ - KLASS tp_copy = tp_nn; \ - ASSERT_FALSE(tp_copy.nullable); \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, TypeEnum::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -100,17 +97,16 @@ class TestPrimitiveBuilder : public TestBuilder { TestBuilder::SetUp(); type_ = Attrs::type(); - type_nn_ = Attrs::type(false); ArrayBuilder* tmp; ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_.reset(static_cast(tmp)); - ASSERT_OK(make_builder(pool_, type_nn_, &tmp)); + ASSERT_OK(make_builder(pool_, type_, &tmp)); builder_nn_.reset(static_cast(tmp)); } - void RandomData(int64_t N, double pct_null = 0.1) { + void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); random_nulls(N, pct_null, &nulls_); } @@ -118,28 +114,33 @@ class TestPrimitiveBuilder : public TestBuilder { void CheckNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_->length(); + int size = builder_->length(); - auto ex_data = 
std::make_shared(reinterpret_cast(draws_.data()), + auto ex_data = std::make_shared( + reinterpret_cast(draws_.data()), size * sizeof(T)); auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); - expected.Init(size, ex_data, ex_nulls); + int32_t ex_null_count = null_count(nulls_); + + expected.Init(size, ex_data, ex_null_count, ex_nulls); ASSERT_OK(builder_->Transfer(&result)); // Builder is now reset ASSERT_EQ(0, builder_->length()); ASSERT_EQ(0, builder_->capacity()); + ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, builder_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(ex_null_count, result.null_count()); } void CheckNonNullable() { ArrayType result; ArrayType expected; - int64_t size = builder_nn_->length(); + int size = builder_nn_->length(); auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), size * sizeof(T)); @@ -153,6 +154,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(nullptr, builder_nn_->buffer()); ASSERT_TRUE(result.Equals(expected)); + ASSERT_EQ(0, result.null_count()); } protected: @@ -171,14 +173,14 @@ class TestPrimitiveBuilder : public TestBuilder { typedef CapType##Type Type; \ typedef c_type T; \ \ - static TypePtr type(bool nullable = true) { \ - return TypePtr(new Type(nullable)); \ + static TypePtr type() { \ + return TypePtr(new Type()); \ } #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type); \ - static void draw(int64_t N, vector* draws) { \ + static void draw(int N, vector* draws) { \ randint(N, LOWER, UPPER, draws); \ } \ } @@ -208,7 +210,7 @@ TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); TYPED_TEST(TestPrimitiveBuilder, TestInit) { DECL_T(); - int64_t n = 1000; + int n = 1000; ASSERT_OK(this->builder_->Init(n)); ASSERT_EQ(n, this->builder_->capacity()); ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size()); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 2612e8ca7fd..c86260b0fc6 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,20 +26,23 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -void PrimitiveArray::Init(const TypePtr& type, int64_t length, +void PrimitiveArray::Init(const TypePtr& type, int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls) { - Array::Init(type, length, nulls); + Array::Init(type, length, null_count, nulls); data_ = data; raw_data_ = data == nullptr? 
nullptr : data_->data(); } bool PrimitiveArray::Equals(const PrimitiveArray& other) const { if (this == &other) return true; - if (type_->nullable != other.type_->nullable) return false; + if (null_count_ != other.null_count_) { + return false; + } bool equal_data = data_->Equals(*other.data_, length_); - if (type_->nullable) { + if (null_count_ > 0) { return equal_data && nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8); } else { diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c5ae0f78a99..aa8f351202a 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -36,24 +36,24 @@ class MemoryPool; template struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() + : DataType(Derived::type_enum) {} virtual std::string ToString() const { return std::string(static_cast(this)->name()); } }; -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType() {} \ + \ + static const char* name() { \ + return NAME; \ } @@ -64,7 +64,9 @@ class PrimitiveArray : public Array { virtual ~PrimitiveArray() {} - void Init(const TypePtr& type, int64_t length, const std::shared_ptr& data, + void Init(const TypePtr& type, int32_t length, + const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); const std::shared_ptr& data() const { return data_;} @@ -84,15 +86,17 @@ class PrimitiveArrayImpl : public PrimitiveArray { PrimitiveArrayImpl() : PrimitiveArray() {} - PrimitiveArrayImpl(int64_t length, const std::shared_ptr& data, + PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - Init(length, data, nulls); + Init(length, data, null_count, nulls); } - void Init(int64_t length, const std::shared_ptr& data, + void Init(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& nulls = nullptr) { - TypePtr type(new TypeClass(nulls != nullptr)); - PrimitiveArray::Init(type, length, data, nulls); + TypePtr type(new TypeClass()); + PrimitiveArray::Init(type, length, data, null_count, nulls); } bool Equals(const PrimitiveArrayImpl& other) const { @@ -101,7 +105,7 @@ class PrimitiveArrayImpl : public PrimitiveArray { const T* raw_data() const { return reinterpret_cast(raw_data_);} - T Value(int64_t i) const { + T Value(int i) const { return raw_data()[i]; } @@ -124,7 +128,7 @@ class PrimitiveBuilder : public ArrayBuilder { virtual ~PrimitiveBuilder() {} - Status Resize(int64_t capacity) { + Status Resize(int32_t capacity) { // XXX: Set floor size for now if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; @@ -135,27 +139,26 @@ class PrimitiveBuilder : public ArrayBuilder { } else { RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); RETURN_NOT_OK(values_->Resize(capacity * elsize_)); - capacity_ = capacity; } + capacity_ = capacity; return Status::OK(); } - Status Init(int64_t capacity) { + Status Init(int32_t capacity) { 
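// [Editorial note, illustrative only.] Init reserves two buffers: the base
// ArrayBuilder::Init call allocates and zeroes the null bitmap (one bit per
// slot, util::ceil_byte(capacity) / 8 bytes), and the call below sizes the
// value buffer at capacity * elsize_ bytes. For example, assuming
// elsize_ == 4 (an int32 builder), Init(1000) reserves 125 bitmap bytes and
// 4000 value bytes.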
RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared(pool_); return values_->Resize(capacity * elsize_); } - Status Reserve(int64_t elements) { + Status Reserve(int32_t elements) { if (length_ + elements > capacity_) { - int64_t new_capacity = util::next_power2(length_ + elements); + int32_t new_capacity = util::next_power2(length_ + elements); return Resize(new_capacity); } return Status::OK(); } - Status Advance(int64_t elements) { + Status Advance(int32_t elements) { return ArrayBuilder::Advance(elements); } @@ -165,8 +168,9 @@ class PrimitiveBuilder : public ArrayBuilder { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - if (nullable_) { - util::set_bit(null_bits_, length_, is_null); + if (is_null) { + ++null_count_; + util::set_bit(null_bits_, length_); } raw_buffer()[length_++] = val; return Status::OK(); @@ -176,42 +180,49 @@ class PrimitiveBuilder : public ArrayBuilder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(const T* values, int64_t length, uint8_t* null_bytes = nullptr) { + Status Append(const T* values, int32_t length, + const uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { - int64_t new_capacity = util::next_power2(length_ + length); + int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (nullable_ && null_bytes != nullptr) { - // If null_bytes is all not null, then none of the values are null - for (int64_t i = 0; i < length; ++i) { - util::set_bit(null_bits_, length_ + i, static_cast(null_bytes[i])); - } + if (null_bytes != nullptr) { + AppendNulls(null_bytes, length); } length_ += length; return Status::OK(); } - Status AppendNull() { - if (!nullable_) { - return Status::Invalid("not nullable"); + // Write nulls as uint8_t* into pre-allocated memory + void AppendNulls(const uint8_t* null_bytes, int32_t length) { + // If null_bytes is all not null, then none of the values are null + for (int i = 0; i < length; ++i) { + if (static_cast(null_bytes[i])) { + ++null_count_; + util::set_bit(null_bits_, length_ + i); + } } + } + + Status AppendNull() { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - util::set_bit(null_bits_, length_++, true); + ++null_count_; + util::set_bit(null_bits_, length_++); return Status::OK(); } // Initialize an array type instance with the results of this builder // Transfers ownership of all buffers Status Transfer(PrimitiveArray* out) { - out->Init(type_, length_, values_, nulls_); + out->Init(type_, length_, values_, null_count_, nulls_); values_ = nulls_ = nullptr; - capacity_ = length_ = 0; + capacity_ = length_ = null_count_ = 0; return Status::OK(); } @@ -236,7 +247,7 @@ class PrimitiveBuilder : public ArrayBuilder { protected: std::shared_ptr values_; - int64_t elsize_; + int elsize_; }; } // namespace arrow diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index a2d87ead59c..e1dcebe97f0 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -39,7 +39,6 @@ TEST(TypesTest, TestCharType) { CharType t1(5); ASSERT_EQ(t1.type, TypeEnum::CHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); @@ -47,7 +46,6 
@@ TEST(TypesTest, TestCharType) { // Test copy constructor CharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::CHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); } @@ -56,7 +54,6 @@ TEST(TypesTest, TestVarcharType) { VarcharType t1(5); ASSERT_EQ(t1.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t1.nullable); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.physical_type.size, 6); @@ -65,19 +62,14 @@ TEST(TypesTest, TestVarcharType) { // Test copy constructor VarcharType t2 = t1; ASSERT_EQ(t2.type, TypeEnum::VARCHAR); - ASSERT_TRUE(t2.nullable); ASSERT_EQ(t2.size, 5); ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - StringType str_nn(false); - ASSERT_EQ(str.type, TypeEnum::STRING); ASSERT_EQ(str.name(), std::string("string")); - ASSERT_TRUE(str.nullable); - ASSERT_FALSE(str_nn.nullable); } // ---------------------------------------------------------------------- @@ -96,7 +88,7 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int64_t nchars = chars_.size(); + int nchars = chars_.size(); value_buf_ = to_buffer(chars_); values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); @@ -104,7 +96,9 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = to_buffer(offsets_); nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); - strings_.Init(length_, offsets_buf_, values_, nulls_buf_); + null_count_ = null_count(nulls_); + + strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_); } protected: @@ -118,7 +112,8 @@ class TestStringContainer : public ::testing::Test { std::shared_ptr offsets_buf_; std::shared_ptr nulls_buf_; - int64_t length_; + int null_count_; + int length_; ArrayPtr values_; StringArray strings_; @@ -127,7 +122,7 @@ class TestStringContainer : public ::testing::Test { TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_.length()); - ASSERT_TRUE(strings_.nullable()); + ASSERT_EQ(1, strings_.null_count()); } TEST_F(TestStringContainer, TestType) { @@ -149,7 +144,8 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { - auto arr = std::make_shared(length_, offsets_buf_, values_, nulls_buf_); + auto arr = std::make_shared(length_, offsets_buf_, values_, + null_count_, nulls_buf_); } TEST_F(TestStringContainer, TestGetString) { @@ -189,10 +185,6 @@ class TestStringBuilder : public TestBuilder { std::unique_ptr result_; }; -TEST_F(TestStringBuilder, TestAttrs) { - ASSERT_FALSE(builder_->value_builder()->nullable()); -} - TEST_F(TestStringBuilder, TestScalarAppend) { std::vector strings = {"a", "bb", "", "", "ccc"}; std::vector is_null = {0, 0, 0, 1, 0}; @@ -212,10 +204,11 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps * null_count(is_null), result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); - int64_t length; - int64_t pos = 0; + int32_t length; + int32_t pos = 0; for (int i = 0; i < N * reps; ++i) { if (is_null[i % N]) { ASSERT_TRUE(result_->IsNull(i)); diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index f3dfbdc50f7..dea42e102b0 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -35,6 +35,6 @@ std::string VarcharType::ToString() const { return s.str(); } -TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type(false)); +TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow diff --git a/cpp/src/arrow/types/string.h 
b/cpp/src/arrow/types/string.h index d0690d9a7d2..084562530a8 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -40,13 +40,13 @@ struct CharType : public DataType { BytesType physical_type; - explicit CharType(int size, bool nullable = true) - : DataType(TypeEnum::CHAR, nullable), + explicit CharType(int size) + : DataType(TypeEnum::CHAR), size(size), physical_type(BytesType(size)) {} CharType(const CharType& other) - : CharType(other.size, other.nullable) {} + : CharType(other.size) {} virtual std::string ToString() const; }; @@ -58,12 +58,12 @@ struct VarcharType : public DataType { BytesType physical_type; - explicit VarcharType(int size, bool nullable = true) - : DataType(TypeEnum::VARCHAR, nullable), + explicit VarcharType(int size) + : DataType(TypeEnum::VARCHAR), size(size), physical_type(BytesType(size + 1)) {} VarcharType(const VarcharType& other) - : VarcharType(other.size, other.nullable) {} + : VarcharType(other.size) {} virtual std::string ToString() const; }; @@ -73,11 +73,11 @@ static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true) - : DataType(TypeEnum::STRING, nullable) {} + StringType() + : DataType(TypeEnum::STRING) {} StringType(const StringType& other) - : StringType(other.nullable) {} + : StringType() {} const LayoutPtr& physical_type() { return physical_string; @@ -98,17 +98,19 @@ class StringArray : public ListArray { public: StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} - StringArray(int64_t length, const std::shared_ptr<Buffer>& offsets, + StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - Init(length, offsets, values, nulls); + Init(length, offsets, values, null_count, nulls); } - void Init(const TypePtr& type, int64_t length, + void Init(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - ListArray::Init(type, length, offsets, values, nulls); + ListArray::Init(type, length, offsets, values, null_count, nulls); // TODO: type validation for values array @@ -117,23 +119,24 @@ class StringArray : public ListArray { raw_bytes_ = bytes_->raw_data(); } - void Init(int64_t length, const std::shared_ptr<Buffer>& offsets, + void Init(int32_t length, const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values, + int32_t null_count = 0, const std::shared_ptr<Buffer>& nulls = nullptr) { - TypePtr type(new StringType(nulls != nullptr)); - Init(type, length, offsets, values, nulls); + TypePtr type(new StringType()); + Init(type, length, offsets, values, null_count, nulls); } // Compute the pointer to value slot i; its length in bytes is returned via out_length - const uint8_t* GetValue(int64_t i, int64_t* out_length) const { + const uint8_t* GetValue(int i, int32_t* out_length) const { int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return raw_bytes_ + pos; } // Construct a std::string - std::string GetString(int64_t i) const { - int64_t nchars; + std::string GetString(int i) const { + int32_t nchars; const uint8_t* str = GetValue(i, &nchars); return std::string(reinterpret_cast<const char*>(str), nchars); } @@ -161,7 +164,7 @@ class StringBuilder : public ListBuilder { value.size()); } - Status Append(const uint8_t* value, int64_t length); + Status Append(const uint8_t* value, int32_t length); Status Append(const 
std::vector& values, uint8_t* null_bytes); diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 644b5457d58..1a9fc6be4a5 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -43,11 +43,7 @@ TEST(TestStructType, Basics) { vector fields = {f0, f1, f2}; - StructType struct_type(fields, true); - StructType struct_type_nn(fields, false); - - ASSERT_TRUE(struct_type.nullable); - ASSERT_FALSE(struct_type_nn.nullable); + StructType struct_type(fields); ASSERT_TRUE(struct_type.field(0).Equals(f0)); ASSERT_TRUE(struct_type.field(1).Equals(f1)); diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 7d8885b830d..afba19a7e46 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -29,9 +29,8 @@ namespace arrow { struct StructType : public DataType { std::vector fields_; - StructType(const std::vector& fields, - bool nullable = true) - : DataType(TypeEnum::STRUCT, nullable) { + explicit StructType(const std::vector& fields) + : DataType(TypeEnum::STRUCT) { fields_ = fields; } diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 3ecb0dec7c0..1744efce7d6 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -36,15 +36,13 @@ class TestBuilder : public ::testing::Test { void SetUp() { pool_ = GetDefaultMemoryPool(); type_ = TypePtr(new UInt8Type()); - type_nn_ = TypePtr(new UInt8Type(false)); builder_.reset(new UInt8Builder(pool_, type_)); - builder_nn_.reset(new UInt8Builder(pool_, type_nn_)); + builder_nn_.reset(new UInt8Builder(pool_, type_)); } protected: MemoryPool* pool_; TypePtr type_; - TypePtr type_nn_; unique_ptr builder_; unique_ptr builder_nn_; }; diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 7b66c3b88bf..62a3d1c1035 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -33,9 +33,8 @@ class Buffer; struct DenseUnionType : public CollectionType { typedef CollectionType Base; - DenseUnionType(const std::vector& child_types, - bool nullable = true) - : Base(nullable) { + explicit DenseUnionType(const std::vector& child_types) : + Base() { child_types_ = child_types; } @@ -46,9 +45,8 @@ struct DenseUnionType : public CollectionType { struct SparseUnionType : public CollectionType { typedef CollectionType Base; - SparseUnionType(const std::vector& child_types, - bool nullable = true) - : Base(nullable) { + explicit SparseUnionType(const std::vector& child_types) : + Base() { child_types_ = child_types; } diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index dbac0a42527..292cb33887f 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -25,7 +25,9 @@ namespace arrow { void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { for (int i = 0; i < length; ++i) { - set_bit(bits, i, static_cast(bytes[i])); + if (static_cast(bytes[i])) { + set_bit(bits, i); + } } } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 9ae6127c5ea..841f617a313 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -41,8 +41,8 @@ static inline bool get_bit(const uint8_t* bits, int i) { return bits[i / 8] & (1 << (i % 8)); } -static inline void set_bit(uint8_t* bits, int i, bool is_set) { - bits[i / 8] |= (1 << (i % 8)) * is_set; +static inline void set_bit(uint8_t* bits, int i) { + bits[i / 8] |= 1 << (i % 8); } static inline int64_t next_power2(int64_t 
n) { From 89c6afd2026cab21fbe2b3c81f14335dffde6d08 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 15:35:54 -0800 Subject: [PATCH 014/210] ARROW-21: Implement a simple in-memory Schema data structure I also have restored the `nullable` bit to the type metadata only (for the moment mainly to facilitate schema testing / round-trips to Parquet and other media with required/optional distinction) and done some miscellaneous refactoring (`TypeEnum` is renamed to `LogicalType`). Author: Wes McKinney Closes #10 from wesm/ARROW-21 and squashes the following commits: c770f7d [Wes McKinney] Add simple in-memory Schema data structure. Restore nullable bit to type metadata only. Add "?" to nullable type formatting. --- cpp/CMakeLists.txt | 2 + cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/array.h | 4 +- cpp/src/arrow/{field-test.cc => field.cc} | 19 +-- cpp/src/arrow/field.h | 17 +- cpp/src/arrow/schema-test.cc | 110 ++++++++++++ cpp/src/arrow/schema.cc | 58 +++++++ cpp/src/arrow/schema.h | 56 +++++++ cpp/src/arrow/type.h | 193 +++++++++++++++------- cpp/src/arrow/types/binary.h | 3 - cpp/src/arrow/types/boolean.h | 4 - cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/construct.cc | 4 +- cpp/src/arrow/types/datetime.h | 8 +- cpp/src/arrow/types/floating.h | 9 +- cpp/src/arrow/types/integer.h | 33 +--- cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 10 +- cpp/src/arrow/types/list.cc | 3 + cpp/src/arrow/types/list.h | 5 +- cpp/src/arrow/types/primitive-test.cc | 4 +- cpp/src/arrow/types/primitive.h | 22 --- cpp/src/arrow/types/string-test.cc | 14 +- cpp/src/arrow/types/string.h | 24 +-- cpp/src/arrow/types/struct-test.cc | 2 +- cpp/src/arrow/types/struct.cc | 1 + cpp/src/arrow/types/struct.h | 4 +- cpp/src/arrow/types/union.h | 8 +- 28 files changed, 434 insertions(+), 191 deletions(-) rename cpp/src/arrow/{field-test.cc => field.cc} (74%) create mode 100644 cpp/src/arrow/schema-test.cc create mode 100644 cpp/src/arrow/schema.cc create mode 100644 cpp/src/arrow/schema.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f0eb73dc413..5e4c2045813 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -467,6 +467,8 @@ set(LINK_LIBS set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc + src/arrow/field.cc + src/arrow/schema.cc src/arrow/type.cc ) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index eeea2dbc517..04f8dd1f908 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -30,4 +30,4 @@ install(FILES set(ARROW_TEST_LINK_LIBS arrow_test_util ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) -ADD_ARROW_TEST(field-test) +ADD_ARROW_TEST(schema-test) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 3d748c1bad6..0632146637e 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -62,8 +62,8 @@ class Array { int32_t length() const { return length_;} int32_t null_count() const { return null_count_;} - const TypePtr& type() const { return type_;} - TypeEnum type_enum() const { return type_->type;} + const std::shared_ptr& type() const { return type_;} + LogicalType::type logical_type() const { return type_->type;} const std::shared_ptr& nulls() const { return nulls_; diff --git a/cpp/src/arrow/field-test.cc b/cpp/src/arrow/field.cc similarity index 74% rename from cpp/src/arrow/field-test.cc rename to cpp/src/arrow/field.cc index 2bb8bad4054..4568d905c29 100644 --- a/cpp/src/arrow/field-test.cc +++ b/cpp/src/arrow/field.cc @@ -15,24 +15,17 @@ // specific language 
governing permissions and limitations // under the License. -#include -#include -#include - #include "arrow/field.h" -#include "arrow/type.h" -#include "arrow/types/integer.h" -using std::string; +#include +#include namespace arrow { -TEST(TestField, Basics) { - TypePtr ftype = TypePtr(new Int32Type()); - Field f0("f0", ftype); - - ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), ftype->ToString()); +std::string Field::ToString() const { + std::stringstream ss; + ss << this->name << " " << this->type->ToString(); + return ss.str(); } } // namespace arrow diff --git a/cpp/src/arrow/field.h b/cpp/src/arrow/field.h index 664cae61a77..89a450c66f2 100644 --- a/cpp/src/arrow/field.h +++ b/cpp/src/arrow/field.h @@ -35,12 +35,27 @@ struct Field { TypePtr type; Field(const std::string& name, const TypePtr& type) : - name(name), type(type) {} + name(name), + type(type) {} + + bool operator==(const Field& other) const { + return this->Equals(other); + } + + bool operator!=(const Field& other) const { + return !this->Equals(other); + } bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && this->type->Equals(other.type.get())); } + + bool nullable() const { + return this->type->nullable; + } + + std::string ToString() const; }; } // namespace arrow diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc new file mode 100644 index 00000000000..3debb9cec3c --- /dev/null +++ b/cpp/src/arrow/schema-test.cc @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
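// [Editorial sketch, not part of the original patch.] The tests below rely
// on Field equality delegating to DataType::Equals, which now compares the
// nullable bit as well as the logical type, so two fields with the same
// name but different nullability compare unequal:
//
//   Field f0("f0", std::make_shared<Int32Type>());          // ?int32
//   Field f0_nn("f0", std::make_shared<Int32Type>(false));  // int32
//   assert(f0 != f0_nn);  // names match, nullability differs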
+ +#include +#include +#include +#include + +#include "arrow/field.h" +#include "arrow/schema.h" +#include "arrow/type.h" +#include "arrow/types/string.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { + +TEST(TestField, Basics) { + shared_ptr ftype = std::make_shared(); + shared_ptr ftype_nn = std::make_shared(false); + Field f0("f0", ftype); + Field f0_nn("f0", ftype_nn); + + ASSERT_EQ(f0.name, "f0"); + ASSERT_EQ(f0.type->ToString(), ftype->ToString()); + + ASSERT_TRUE(f0.nullable()); + ASSERT_FALSE(f0_nn.nullable()); +} + +TEST(TestField, Equals) { + shared_ptr ftype = std::make_shared(); + shared_ptr ftype_nn = std::make_shared(false); + + Field f0("f0", ftype); + Field f0_nn("f0", ftype_nn); + Field f0_other("f0", ftype); + + ASSERT_EQ(f0, f0_other); + ASSERT_NE(f0, f0_nn); +} + +class TestSchema : public ::testing::Test { + public: + void SetUp() {} +}; + +TEST_F(TestSchema, Basics) { + auto f0 = std::make_shared("f0", std::make_shared()); + + auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f1_optional = std::make_shared("f1", std::make_shared()); + + auto f2 = std::make_shared("f2", std::make_shared()); + + vector > fields = {f0, f1, f2}; + auto schema = std::make_shared(fields); + + ASSERT_EQ(3, schema->num_fields()); + ASSERT_EQ(f0, schema->field(0)); + ASSERT_EQ(f1, schema->field(1)); + ASSERT_EQ(f2, schema->field(2)); + + auto schema2 = std::make_shared(fields); + + vector > fields3 = {f0, f1_optional, f2}; + auto schema3 = std::make_shared(fields3); + ASSERT_TRUE(schema->Equals(schema2)); + ASSERT_FALSE(schema->Equals(schema3)); + + ASSERT_TRUE(schema->Equals(*schema2.get())); + ASSERT_FALSE(schema->Equals(*schema3.get())); +} + +TEST_F(TestSchema, ToString) { + auto f0 = std::make_shared("f0", std::make_shared()); + auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f2 = std::make_shared("f2", std::make_shared()); + auto f3 = std::make_shared("f3", + std::make_shared(std::make_shared())); + + vector > fields = {f0, f1, f2, f3}; + auto schema = std::make_shared(fields); + + std::string result = schema->ToString(); + std::string expected = R"(f0 ?int32 +f1 uint8 +f2 ?string +f3 ?list +)"; + + ASSERT_EQ(expected, result); +} + +} // namespace arrow diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc new file mode 100644 index 00000000000..a735fd3d230 --- /dev/null +++ b/cpp/src/arrow/schema.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
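// [Editorial note, illustrative only.] Schema::ToString, implemented below,
// joins Field::ToString for each field with a newline, and nullable types
// render with a leading "?". Under that formatting, the four-field schema
// from the test above would print roughly as:
//
//   f0 ?int32
//   f1 uint8
//   f2 ?string
//   f3 ?list<?string>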
+ +#include "arrow/schema.h" + +#include +#include +#include +#include + +#include "arrow/field.h" + +namespace arrow { + +Schema::Schema(const std::vector >& fields) : + fields_(fields) {} + +bool Schema::Equals(const Schema& other) const { + if (this == &other) return true; + if (num_fields() != other.num_fields()) { + return false; + } + for (int i = 0; i < num_fields(); ++i) { + if (!field(i)->Equals(*other.field(i).get())) { + return false; + } + } + return true; +} + +bool Schema::Equals(const std::shared_ptr& other) const { + return Equals(*other.get()); +} + +std::string Schema::ToString() const { + std::stringstream buffer; + + for (auto field : fields_) { + buffer << field->ToString() << std::endl; + } + return buffer.str(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h new file mode 100644 index 00000000000..d04e3f628c1 --- /dev/null +++ b/cpp/src/arrow/schema.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_SCHEMA_H +#define ARROW_SCHEMA_H + +#include +#include +#include + +#include "arrow/field.h" +#include "arrow/type.h" + +namespace arrow { + +class Schema { + public: + explicit Schema(const std::vector >& fields); + + // Returns true if all of the schema fields are equal + bool Equals(const Schema& other) const; + bool Equals(const std::shared_ptr& other) const; + + // Return the ith schema element. Does not boundscheck + const std::shared_ptr& field(int i) const { + return fields_[i]; + } + + // Render a string representation of the schema suitable for debugging + std::string ToString() const; + + int num_fields() const { + return fields_.size(); + } + + private: + std::vector > fields_; +}; + +} // namespace arrow + +#endif // ARROW_FIELD_H diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 12f19604c68..04cdb52b535 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -52,96 +52,98 @@ struct LayoutType { explicit LayoutType(LayoutEnum type) : type(type) {} }; - // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. 
a // timestamp encoded as an int64) +struct LogicalType { + enum type { + // A degenerate NULL type represented as 0 bytes/bits + NA = 0, -enum class TypeEnum: char { - // A degenerate NULL type represented as 0 bytes/bits - NA = 0, - - // Little-endian integer types - UINT8 = 1, - INT8 = 2, - UINT16 = 3, - INT16 = 4, - UINT32 = 5, - INT32 = 6, - UINT64 = 7, - INT64 = 8, + // Little-endian integer types + UINT8 = 1, + INT8 = 2, + UINT16 = 3, + INT16 = 4, + UINT32 = 5, + INT32 = 6, + UINT64 = 7, + INT64 = 8, - // A boolean value represented as 1 byte - BOOL = 9, + // A boolean value represented as 1 byte + BOOL = 9, - // A boolean value represented as 1 bit - BIT = 10, + // A boolean value represented as 1 bit + BIT = 10, - // 4-byte floating point value - FLOAT = 11, + // 4-byte floating point value + FLOAT = 11, - // 8-byte floating point value - DOUBLE = 12, + // 8-byte floating point value + DOUBLE = 12, - // CHAR(N): fixed-length UTF8 string with length N - CHAR = 13, + // CHAR(N): fixed-length UTF8 string with length N + CHAR = 13, - // UTF8 variable-length string as List - STRING = 14, + // UTF8 variable-length string as List + STRING = 14, - // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) - VARCHAR = 15, + // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) + VARCHAR = 15, - // Variable-length bytes (no guarantee of UTF8-ness) - BINARY = 16, + // Variable-length bytes (no guarantee of UTF8-ness) + BINARY = 16, - // By default, int32 days since the UNIX epoch - DATE = 17, + // By default, int32 days since the UNIX epoch + DATE = 17, - // Exact timestamp encoded with int64 since UNIX epoch - // Default unit millisecond - TIMESTAMP = 18, + // Exact timestamp encoded with int64 since UNIX epoch + // Default unit millisecond + TIMESTAMP = 18, - // Timestamp as double seconds since the UNIX epoch - TIMESTAMP_DOUBLE = 19, + // Timestamp as double seconds since the UNIX epoch + TIMESTAMP_DOUBLE = 19, - // Exact time encoded with int64, default unit millisecond - TIME = 20, + // Exact time encoded with int64, default unit millisecond + TIME = 20, - // Precision- and scale-based decimal type. Storage type depends on the - // parameters. - DECIMAL = 21, + // Precision- and scale-based decimal type. Storage type depends on the + // parameters. 
+ DECIMAL = 21, - // Decimal value encoded as a text string - DECIMAL_TEXT = 22, + // Decimal value encoded as a text string + DECIMAL_TEXT = 22, - // A list of some logical data type - LIST = 30, + // A list of some logical data type + LIST = 30, - // Struct of logical types - STRUCT = 31, + // Struct of logical types + STRUCT = 31, - // Unions of logical types - DENSE_UNION = 32, - SPARSE_UNION = 33, + // Unions of logical types + DENSE_UNION = 32, + SPARSE_UNION = 33, - // Union - JSON_SCALAR = 50, + // Union + JSON_SCALAR = 50, - // User-defined type - USER = 60 + // User-defined type + USER = 60 + }; }; - struct DataType { - TypeEnum type; + LogicalType::type type; + bool nullable; - explicit DataType(TypeEnum type) - : type(type) {} + explicit DataType(LogicalType::type type, bool nullable = true) : + type(type), + nullable(nullable) {} virtual bool Equals(const DataType* other) { - return this == other || this->type == other->type; + return this == other || (this->type == other->type && + this->nullable == other->nullable); } virtual std::string ToString() const = 0; @@ -171,6 +173,77 @@ struct ListLayoutType : public LayoutType { value_type(value_type) {} }; +template +struct PrimitiveType : public DataType { + explicit PrimitiveType(bool nullable = true) + : DataType(Derived::type_enum, nullable) {} + + virtual std::string ToString() const { + std::string result; + if (nullable) { + result.append("?"); + } + result.append(static_cast(this)->name()); + return result; + } +}; + +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ + static constexpr int size = SIZE; \ + \ + explicit TYPENAME(bool nullable = true) \ + : PrimitiveType(nullable) {} \ + \ + static const char* name() { \ + return NAME; \ + } + +struct BooleanType : public PrimitiveType { + PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); +}; + +struct UInt8Type : public PrimitiveType { + PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8"); +}; + +struct Int8Type : public PrimitiveType { + PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8"); +}; + +struct UInt16Type : public PrimitiveType { + PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16"); +}; + +struct Int16Type : public PrimitiveType { + PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16"); +}; + +struct UInt32Type : public PrimitiveType { + PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32"); +}; + +struct Int32Type : public PrimitiveType { + PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32"); +}; + +struct UInt64Type : public PrimitiveType { + PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64"); +}; + +struct Int64Type : public PrimitiveType { + PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64"); +}; + +struct FloatType : public PrimitiveType { + PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float"); +}; + +struct DoubleType : public PrimitiveType { + PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); +}; + } // namespace arrow #endif // ARROW_TYPE_H diff --git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h index a9f20046b58..1fd675e5fde 100644 --- a/cpp/src/arrow/types/binary.h +++ b/cpp/src/arrow/types/binary.h @@ -25,9 +25,6 @@ namespace arrow { -struct StringType : public DataType { -}; - } // namespace arrow #endif // ARROW_TYPES_BINARY_H diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index 31388c8152d..8fc9cfd19c0 100644 --- a/cpp/src/arrow/types/boolean.h +++ 
b/cpp/src/arrow/types/boolean.h @@ -22,10 +22,6 @@ namespace arrow { -struct BooleanType : public PrimitiveType { - PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); -}; - typedef PrimitiveArrayImpl BooleanArray; // typedef PrimitiveBuilder BooleanBuilder; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 094b63f2898..42a9c926bb1 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -25,7 +25,7 @@ namespace arrow { -template +template struct CollectionType : public DataType { std::vector child_types_; diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index e1bb990063c..05d6b270fc3 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -33,7 +33,7 @@ class ArrayBuilder; // difficult #define BUILDER_CASE(ENUM, BuilderType) \ - case TypeEnum::ENUM: \ + case LogicalType::ENUM: \ *out = static_cast(new BuilderType(pool, type)); \ return Status::OK(); @@ -56,7 +56,7 @@ Status make_builder(MemoryPool* pool, const TypePtr& type, BUILDER_CASE(STRING, StringBuilder); - case TypeEnum::LIST: + case LogicalType::LIST: { ListType* list_type = static_cast(type.get()); ArrayBuilder* value_builder; diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index d90883cb018..765fc29dd57 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,8 +31,8 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY) - : DataType(TypeEnum::DATE), + explicit DateType(Unit unit = Unit::DAY, bool nullable = true) + : DataType(LogicalType::DATE, nullable), unit(unit) {} DateType(const DateType& other) @@ -58,8 +58,8 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI) - : DataType(TypeEnum::TIMESTAMP), + explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) + : DataType(LogicalType::TIMESTAMP, nullable), unit(unit) {} TimestampType(const TimestampType& other) diff --git a/cpp/src/arrow/types/floating.h b/cpp/src/arrow/types/floating.h index 7551ce665a2..e7522781d33 100644 --- a/cpp/src/arrow/types/floating.h +++ b/cpp/src/arrow/types/floating.h @@ -21,17 +21,10 @@ #include #include "arrow/types/primitive.h" +#include "arrow/type.h" namespace arrow { -struct FloatType : public PrimitiveType { - PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float"); -}; - -struct DoubleType : public PrimitiveType { - PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); -}; - typedef PrimitiveArrayImpl FloatArray; typedef PrimitiveArrayImpl DoubleArray; diff --git a/cpp/src/arrow/types/integer.h b/cpp/src/arrow/types/integer.h index 7e5eab55be0..56841912494 100644 --- a/cpp/src/arrow/types/integer.h +++ b/cpp/src/arrow/types/integer.h @@ -22,41 +22,10 @@ #include #include "arrow/types/primitive.h" +#include "arrow/type.h" namespace arrow { -struct UInt8Type : public PrimitiveType { - PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8"); -}; - -struct Int8Type : public PrimitiveType { - PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8"); -}; - -struct UInt16Type : public PrimitiveType { - PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16"); -}; - -struct Int16Type : public PrimitiveType { - PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16"); -}; - -struct UInt32Type : public PrimitiveType { - PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32"); -}; - -struct Int32Type : public PrimitiveType { - PRIMITIVE_DECL(Int32Type, int32_t, INT32, 
4, "int32"); -}; - -struct UInt64Type : public PrimitiveType { - PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64"); -}; - -struct Int64Type : public PrimitiveType { - PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64"); -}; - // Array containers typedef PrimitiveArrayImpl UInt8Array; diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index 6c2b097a737..b67fb3807ad 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true) - : DataType(TypeEnum::JSON_SCALAR), + explicit JSONScalar(bool dense = true, bool nullable = true) + : DataType(LogicalType::JSON_SCALAR, nullable), dense(dense) {} }; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 1d9ddbe607a..b4bbd2841a8 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -44,19 +44,19 @@ TEST(TypesTest, TestListType) { std::shared_ptr vt = std::make_shared(); ListType list_type(vt); - ASSERT_EQ(list_type.type, TypeEnum::LIST); + ASSERT_EQ(list_type.type, LogicalType::LIST); ASSERT_EQ(list_type.name(), string("list")); - ASSERT_EQ(list_type.ToString(), string("list")); + ASSERT_EQ(list_type.ToString(), string("?list")); ASSERT_EQ(list_type.value_type->type, vt->type); ASSERT_EQ(list_type.value_type->type, vt->type); - std::shared_ptr st = std::make_shared(); - std::shared_ptr lt = std::make_shared(st); + std::shared_ptr st = std::make_shared(false); + std::shared_ptr lt = std::make_shared(st, false); ASSERT_EQ(lt->ToString(), string("list")); - ListType lt2(lt); + ListType lt2(lt, false); ASSERT_EQ(lt2.ToString(), string("list>")); } diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index f0ff5bf928a..577d71d0b28 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -24,6 +24,9 @@ namespace arrow { std::string ListType::ToString() const { std::stringstream s; + if (this->nullable) { + s << "?"; + } s << "list<" << value_type->ToString() << ">"; return s.str(); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 4190b53df01..1fc83536db8 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -40,8 +40,8 @@ struct ListType : public DataType { // List can contain any other logical value type TypePtr value_type; - explicit ListType(const TypePtr& value_type) - : DataType(TypeEnum::LIST), + explicit ListType(const TypePtr& value_type, bool nullable = true) + : DataType(LogicalType::LIST, nullable), value_type(value_type) {} static char const *name() { @@ -51,7 +51,6 @@ struct ListType : public DataType { virtual std::string ToString() const; }; - class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 93634432d5c..02eaaa7542b 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -54,11 +54,11 @@ TEST(TypesTest, TestBytesType) { TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ \ - ASSERT_EQ(tp.type, TypeEnum::ENUM); \ + ASSERT_EQ(tp.type, LogicalType::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ \ KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, TypeEnum::ENUM); \ + ASSERT_EQ(tp_copy.type, LogicalType::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); diff --git a/cpp/src/arrow/types/primitive.h 
b/cpp/src/arrow/types/primitive.h index aa8f351202a..49040fb6626 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -34,28 +34,6 @@ namespace arrow { class MemoryPool; -template -struct PrimitiveType : public DataType { - PrimitiveType() - : DataType(Derived::type_enum) {} - - virtual std::string ToString() const { - return std::string(static_cast(this)->name()); - } -}; - -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr TypeEnum type_enum = TypeEnum::ENUM; \ - static constexpr int size = SIZE; \ - \ - TYPENAME() \ - : PrimitiveType() {} \ - \ - static const char* name() { \ - return NAME; \ - } - // Base class for fixed-size logical types class PrimitiveArray : public Array { diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index e1dcebe97f0..9af66729502 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -38,14 +38,14 @@ class Buffer; TEST(TypesTest, TestCharType) { CharType t1(5); - ASSERT_EQ(t1.type, TypeEnum::CHAR); + ASSERT_EQ(t1.type, LogicalType::CHAR); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); // Test copy constructor CharType t2 = t1; - ASSERT_EQ(t2.type, TypeEnum::CHAR); + ASSERT_EQ(t2.type, LogicalType::CHAR); ASSERT_EQ(t2.size, 5); } @@ -53,7 +53,7 @@ TEST(TypesTest, TestCharType) { TEST(TypesTest, TestVarcharType) { VarcharType t1(5); - ASSERT_EQ(t1.type, TypeEnum::VARCHAR); + ASSERT_EQ(t1.type, LogicalType::VARCHAR); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.physical_type.size, 6); @@ -61,14 +61,14 @@ TEST(TypesTest, TestVarcharType) { // Test copy constructor VarcharType t2 = t1; - ASSERT_EQ(t2.type, TypeEnum::VARCHAR); + ASSERT_EQ(t2.type, LogicalType::VARCHAR); ASSERT_EQ(t2.size, 5); ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - ASSERT_EQ(str.type, TypeEnum::STRING); + ASSERT_EQ(str.type, LogicalType::STRING); ASSERT_EQ(str.name(), std::string("string")); } @@ -128,8 +128,8 @@ TEST_F(TestStringContainer, TestArrayBasics) { TEST_F(TestStringContainer, TestType) { TypePtr type = strings_.type(); - ASSERT_EQ(TypeEnum::STRING, type->type); - ASSERT_EQ(TypeEnum::STRING, strings_.type_enum()); + ASSERT_EQ(LogicalType::STRING, type->type); + ASSERT_EQ(LogicalType::STRING, strings_.logical_type()); } diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 084562530a8..5795cfed577 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -40,8 +40,8 @@ struct CharType : public DataType { BytesType physical_type; - explicit CharType(int size) - : DataType(TypeEnum::CHAR), + explicit CharType(int size, bool nullable = true) + : DataType(LogicalType::CHAR, nullable), size(size), physical_type(BytesType(size)) {} @@ -58,8 +58,8 @@ struct VarcharType : public DataType { BytesType physical_type; - explicit VarcharType(int size) - : DataType(TypeEnum::VARCHAR), + explicit VarcharType(int size, bool nullable = true) + : DataType(LogicalType::VARCHAR, nullable), size(size), physical_type(BytesType(size + 1)) {} VarcharType(const VarcharType& other) @@ -73,26 +73,26 @@ static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - StringType() - : DataType(TypeEnum::STRING) {} + explicit StringType(bool nullable = true) + : DataType(LogicalType::STRING, nullable) {} StringType(const 
StringType& other) : StringType() {} - const LayoutPtr& physical_type() { - return physical_string; - } - static char const *name() { return "string"; } virtual std::string ToString() const { - return name(); + std::string result; + if (nullable) { + result.append("?"); + } + result.append(name()); + return result; } }; - // TODO: add a BinaryArray layer in between class StringArray : public ListArray { public: diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 1a9fc6be4a5..df615710479 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -49,7 +49,7 @@ TEST(TestStructType, Basics) { ASSERT_TRUE(struct_type.field(1).Equals(f1)); ASSERT_TRUE(struct_type.field(2).Equals(f2)); - ASSERT_EQ(struct_type.ToString(), "struct"); + ASSERT_EQ(struct_type.ToString(), "?struct"); // TODO: out of bounds for field(...) } diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index a245656b516..6b233bc372a 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -26,6 +26,7 @@ namespace arrow { std::string StructType::ToString() const { std::stringstream s; + if (nullable) s << "?"; s << "struct<"; for (size_t i = 0; i < fields_.size(); ++i) { if (i > 0) s << ", "; diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index afba19a7e46..e575c31287c 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -29,8 +29,8 @@ namespace arrow { struct StructType : public DataType { std::vector fields_; - explicit StructType(const std::vector& fields) - : DataType(TypeEnum::STRUCT) { + explicit StructType(const std::vector& fields, bool nullable = true) + : DataType(LogicalType::STRUCT, nullable) { fields_ = fields; } diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 62a3d1c1035..9aff780c6a3 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -30,8 +30,8 @@ namespace arrow { class Buffer; -struct DenseUnionType : public CollectionType<TypeEnum::DENSE_UNION> { - typedef CollectionType<TypeEnum::DENSE_UNION> Base; +struct DenseUnionType : public CollectionType<LogicalType::DENSE_UNION> { + typedef CollectionType<LogicalType::DENSE_UNION> Base; explicit DenseUnionType(const std::vector<TypePtr>& child_types) : Base() { @@ -42,8 +42,8 @@ }; -struct SparseUnionType : public CollectionType<TypeEnum::SPARSE_UNION> { - typedef CollectionType<TypeEnum::SPARSE_UNION> Base; +struct SparseUnionType : public CollectionType<LogicalType::SPARSE_UNION> { + typedef CollectionType<LogicalType::SPARSE_UNION> Base; explicit SparseUnionType(const std::vector<TypePtr>& child_types) : Base() { From 307977e39eddf62f832a5f1a452963751c6b36a0 Mon Sep 17 00:00:00 2001 From: proflin Date: Thu, 3 Mar 2016 16:14:47 -0800 Subject: [PATCH 015/210] ARROW-15: Fix a naming typo for memory.AllocationManager.AllocationOutcome Rename FORCED_SUCESS to FORCED_SUCCESS in memory.AllocationManager.AllocationOutcome.
Author: proflin Closes #4 from proflin/ARROW-15--Fix-a-naming-typo-for-memory.AllocationManager.AllocationOutcome and squashes the following commits: 0e276fa [proflin] ARROW-15: Fix a naming typo for memory.AllocationManager.AllocationOutcome --- .../src/main/java/org/apache/arrow/memory/Accountant.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java index dc75e5d7231..37c598ad89e 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/Accountant.java @@ -247,7 +247,7 @@ public static enum AllocationOutcome { /** * Allocation succeeded but only because the allocator was forced to move beyond a limit. */ - FORCED_SUCESS(true), + FORCED_SUCCESS(true), /** * Allocation failed because the local allocator's limits were exceeded. From 0c95d3cc6d954128bf400598878ad9c4228ccbce Mon Sep 17 00:00:00 2001 From: proflin Date: Thu, 3 Mar 2016 16:16:28 -0800 Subject: [PATCH 016/210] ARROW-10: Fix mismatch of javadoc names and method parameters Author: proflin Author: Liwei Lin Closes #3 from proflin/ARROW-10--Fix-mismatch-of-javadoc-names-and-method-parameters and squashes the following commits: 99366ab [Liwei Lin] ARROW-10: Fix mismatch of javadoc names and method parameters 9186cb3 [proflin] ARROW-10: Fix mismatch of javadoc names and method parameters 2b1313e [proflin] Fix mismatch of javadoc names and method parameters --- .../main/java/org/apache/arrow/memory/AllocationManager.java | 5 ++--- .../org/apache/arrow/memory/AllocatorClosedException.java | 5 +++-- .../src/main/java/org/apache/arrow/memory/BufferManager.java | 1 + .../main/java/org/apache/arrow/memory/ChildAllocator.java | 5 +---- .../java/org/apache/arrow/memory/util/HistoricalLog.java | 2 +- .../main/java/org/apache/arrow/vector/AllocationHelper.java | 2 +- .../org/apache/arrow/vector/complex/ContainerVectorLike.java | 2 +- 7 files changed, 10 insertions(+), 12 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java index 37d1d34a620..43ee9c108d9 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocationManager.java @@ -328,7 +328,8 @@ public int decrement(int decrement) { * destroyed before use. * * @param allocator - * @return + * A BufferAllocator. + * @return The ledger associated with the BufferAllocator. */ public BufferLedger getLedgerForAllocator(BufferAllocator allocator) { return associate((BaseAllocator) allocator); @@ -356,8 +357,6 @@ public ArrowBuf newArrowBuf(int offset, int length) { * The length in bytes that this ArrowBuf will provide access to. * @param manager * An optional BufferManager argument that can be used to manage expansion of this ArrowBuf - * @param retain - * Whether or not the newly created buffer should get an additional reference count added to it. 
* @return A new ArrowBuf that shares references with all ArrowBufs associated with this BufferLedger */ public ArrowBuf newArrowBuf(int offset, int length, BufferManager manager) { diff --git a/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java b/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java index 566457981c7..3274642dedd 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/AllocatorClosedException.java @@ -20,11 +20,12 @@ /** * Exception thrown when a closed BufferAllocator is used. Note * this is an unchecked exception. - * - * @param message string associated with the cause */ @SuppressWarnings("serial") public class AllocatorClosedException extends RuntimeException { + /** + * @param message string associated with the cause + */ public AllocatorClosedException(String message) { super(message); } diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java b/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java index d6470fa51e7..89694347910 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BufferManager.java @@ -43,6 +43,7 @@ public interface BufferManager extends AutoCloseable { * @param newSize * Size of new replacement buffer. * @return + * A new version of the buffer. */ public ArrowBuf replace(ArrowBuf old, int newSize); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java index 6f120e5328b..11c9063fc9c 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ChildAllocator.java @@ -31,15 +31,12 @@ class ChildAllocator extends BaseAllocator { * Constructor. * * @param parentAllocator parent allocator -- the one creating this child - * @param allocatorOwner a handle to the object making the request - * @param allocationPolicy the allocation policy to use; the policy for all - * allocators must match for each invocation of a drillbit + * @param name the name of this child allocator * @param initReservation initial amount of space to reserve (obtained from the parent) * @param maxAllocation maximum amount of space that can be obtained from this allocator; * note this includes direct allocations (via {@see BufferAllocator#buffer(int, int)} * et al) and requests from descendant allocators. 
Depending on the allocation policy in * force, even less memory may be available - * @param flags one or more of BaseAllocator.F_* flags */ ChildAllocator( BaseAllocator parentAllocator, diff --git a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java index 38cb779343a..c9b5c5385c5 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/util/HistoricalLog.java @@ -126,7 +126,7 @@ public void buildHistory(final StringBuilder sb, boolean includeStackTrace) { /** * * @param sb - * @param indexLevel + * @param indent * @param includeStackTrace */ public synchronized void buildHistory(final StringBuilder sb, int indent, boolean includeStackTrace) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java index 54c3cd7331e..15c3a0227c6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/AllocationHelper.java @@ -49,7 +49,7 @@ public static void allocate(ValueVector v, int valueCount, int bytesPerValue, in * Allocates the exact amount if v is fixed width, otherwise falls back to dynamic allocation * @param v value vector we are trying to allocate * @param valueCount size we are trying to allocate - * @throws org.apache.drill.exec.memory.OutOfMemoryException if it can't allocate the memory + * @throws org.apache.arrow.memory.OutOfMemoryException if it can't allocate the memory */ public static void allocateNew(ValueVector v, int valueCount) { if (v instanceof FixedWidthVector) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java index e50b0d0d0a5..655b55a6aa2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java @@ -31,7 +31,7 @@ public interface ContainerVectorLike { * * @param descriptor vector descriptor * @return result of operation wrapping vector corresponding to the given descriptor and whether it's newly created - * @throws org.apache.drill.common.exceptions.DrillRuntimeException + * @throws org.apache.arrow.vector.util.SchemaChangeRuntimeException * if schema change is not permissible between the given and existing data vector types. */ AddOrGetResult addOrGetVector(VectorDescriptor descriptor); From 3b777c7f43d75444f040351b8ae4b735250f2efc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Mar 2016 16:18:28 -0800 Subject: [PATCH 017/210] ARROW-26: Add instructions for enabling Arrow C++ Parquet adapter build This patch documents the workflow for building the optional Arrow-Parquet C++ integration. 
I originally thought about adding an option to build it in Arrow's thirdparty, but it immediately results in a dependency-hell situation (Parquet requires Thrift, Boost, snappy, lz4, zlib) Author: Wes McKinney Closes #12 from wesm/ARROW-26 and squashes the following commits: b28fd75 [Wes McKinney] Add instructions for enabling Arrow C++ Parquet adapter build --- cpp/CMakeLists.txt | 4 ++-- cpp/doc/Parquet.md | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 cpp/doc/Parquet.md diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e4c2045813..f425c5f3106 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,7 +54,7 @@ endif() # Top level cmake dir if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - option(ARROW_WITH_PARQUET + option(ARROW_PARQUET "Build the Parquet adapter and link to libparquet" OFF) @@ -441,7 +441,7 @@ endif (UNIX) #---------------------------------------------------------------------- # Parquet adapter -if(ARROW_WITH_PARQUET) +if(ARROW_PARQUET) find_package(Parquet REQUIRED) include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(parquet diff --git a/cpp/doc/Parquet.md b/cpp/doc/Parquet.md new file mode 100644 index 00000000000..370ac833388 --- /dev/null +++ b/cpp/doc/Parquet.md @@ -0,0 +1,24 @@ +## Building Arrow-Parquet integration + +To build the Arrow C++'s Parquet adapter library, you must first build [parquet-cpp][1]: + +```bash +# Set this to your preferred install location +export PARQUET_HOME=$HOME/local + +git clone https://github.com/apache/parquet-cpp.git +cd parquet-cpp +source setup_build_env.sh +cmake -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME +make -j4 +make install +``` + +Make sure that `$PARQUET_HOME` is set to the installation location. 
Now, build +Arrow with the Parquet adapter enabled: + +```bash +cmake -DARROW_PARQUET=ON +``` + +[1]: https://github.com/apache/parquet-cpp \ No newline at end of file From 9c2b95446abe1ec4dd5c25215c9595a3d7b49f2b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Mar 2016 15:02:10 -0800 Subject: [PATCH 018/210] ARROW-23: Add a logical Column data structure I also added global const instances of common primitive types Author: Wes McKinney Closes #15 from wesm/ARROW-23 and squashes the following commits: 1835d33 [Wes McKinney] Don't use auto 988135c [Wes McKinney] Add Column chunk type validation function 8a2e40e [Wes McKinney] Remove unneeded operator()/shared_from_this experiment de9ec70 [Wes McKinney] Aggregate null counts too 7049314 [Wes McKinney] cpplint a565d26 [Wes McKinney] Add ChunkedArray / Column ctors, test passes 0648ed2 [Wes McKinney] Prototyping --- cpp/CMakeLists.txt | 2 + cpp/src/arrow/array.h | 1 - cpp/src/arrow/schema-test.cc | 7 +- cpp/src/arrow/table/CMakeLists.txt | 39 +++++++++++ cpp/src/arrow/table/column-test.cc | 93 ++++++++++++++++++++++++++ cpp/src/arrow/table/column.cc | 62 +++++++++++++++++ cpp/src/arrow/table/column.h | 103 +++++++++++++++++++++++++++++ cpp/src/arrow/type.cc | 12 ++++ cpp/src/arrow/type.h | 17 +++++ cpp/src/arrow/types/list.h | 2 +- cpp/src/arrow/types/primitive.h | 20 +++--- cpp/src/arrow/util/bit-util.h | 4 ++ 12 files changed, 347 insertions(+), 15 deletions(-) create mode 100644 cpp/src/arrow/table/CMakeLists.txt create mode 100644 cpp/src/arrow/table/column-test.cc create mode 100644 cpp/src/arrow/table/column.cc create mode 100644 cpp/src/arrow/table/column.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f425c5f3106..15afb1acf67 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -458,10 +458,12 @@ endif() add_subdirectory(src/arrow) add_subdirectory(src/arrow/util) +add_subdirectory(src/arrow/table) add_subdirectory(src/arrow/types) set(LINK_LIBS arrow_util + arrow_table arrow_types) set(ARROW_SRCS diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 0632146637e..85e853e2ae5 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -81,7 +81,6 @@ class Array { DISALLOW_COPY_AND_ASSIGN(Array); }; - typedef std::shared_ptr ArrayPtr; } // namespace arrow diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc index 3debb9cec3c..7c190d068c2 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -31,7 +31,7 @@ using std::vector; namespace arrow { TEST(TestField, Basics) { - shared_ptr ftype = std::make_shared(); + shared_ptr ftype = INT32; shared_ptr ftype_nn = std::make_shared(false); Field f0("f0", ftype); Field f0_nn("f0", ftype_nn); @@ -44,7 +44,7 @@ TEST(TestField, Basics) { } TEST(TestField, Equals) { - shared_ptr ftype = std::make_shared(); + shared_ptr ftype = INT32; shared_ptr ftype_nn = std::make_shared(false); Field f0("f0", ftype); @@ -61,8 +61,7 @@ class TestSchema : public ::testing::Test { }; TEST_F(TestSchema, Basics) { - auto f0 = std::make_shared("f0", std::make_shared()); - + auto f0 = std::make_shared("f0", INT32); auto f1 = std::make_shared("f1", std::make_shared(false)); auto f1_optional = std::make_shared("f1", std::make_shared()); diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt new file mode 100644 index 00000000000..a401622d2e0 --- /dev/null +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####################################### +# arrow_table +####################################### + +set(TABLE_SRCS + column.cc +) + +set(TABLE_LIBS +) + +add_library(arrow_table STATIC + ${TABLE_SRCS} +) +target_link_libraries(arrow_table ${TABLE_LIBS}) +SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX) + +# Headers: top level +install(FILES + DESTINATION include/arrow/table) + +ADD_ARROW_TEST(column-test) diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc new file mode 100644 index 00000000000..15f554f4632 --- /dev/null +++ b/cpp/src/arrow/table/column-test.cc @@ -0,0 +1,93 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "arrow/field.h" +#include "arrow/schema.h" +#include "arrow/table/column.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/types/integer.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { + +class TestColumn : public ::testing::Test { + public: + void SetUp() { + pool_ = GetDefaultMemoryPool(); + } + + template + std::shared_ptr MakeArray(int32_t length, int32_t null_count = 0) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); + data->Resize(length * sizeof(typename ArrayType::value_type)); + nulls->Resize(util::bytes_for_bits(length)); + return std::make_shared(length, data, 10, nulls); + } + + protected: + MemoryPool* pool_; + + std::shared_ptr data_; + std::unique_ptr column_; +}; + +TEST_F(TestColumn, BasicAPI) { + ArrayVector arrays; + arrays.push_back(MakeArray(100)); + arrays.push_back(MakeArray(100, 10)); + arrays.push_back(MakeArray(100, 20)); + + auto field = std::make_shared("c0", INT32); + column_.reset(new Column(field, arrays)); + + ASSERT_EQ("c0", column_->name()); + ASSERT_TRUE(column_->type()->Equals(INT32)); + ASSERT_EQ(300, column_->length()); + ASSERT_EQ(30, column_->null_count()); + ASSERT_EQ(3, column_->data()->num_chunks()); +} + +TEST_F(TestColumn, ChunksInhomogeneous) { + ArrayVector arrays; + arrays.push_back(MakeArray(100)); + arrays.push_back(MakeArray(100, 10)); + + auto field = std::make_shared("c0", INT32); + column_.reset(new Column(field, arrays)); + + ASSERT_OK(column_->ValidateData()); + + arrays.push_back(MakeArray(100, 10)); + column_.reset(new Column(field, arrays)); + ASSERT_RAISES(Invalid, column_->ValidateData()); +} + +} // namespace arrow diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc new file mode 100644 index 00000000000..82750cf4d43 --- /dev/null +++ b/cpp/src/arrow/table/column.cc @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/table/column.h" + +#include +#include + +#include "arrow/field.h" +#include "arrow/util/status.h" + +namespace arrow { + +ChunkedArray::ChunkedArray(const ArrayVector& chunks) : + chunks_(chunks) { + length_ = 0; + for (const std::shared_ptr& chunk : chunks) { + length_ += chunk->length(); + null_count_ += chunk->null_count(); + } +} + +Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) : + field_(field) { + data_ = std::make_shared(chunks); +} + +Column::Column(const std::shared_ptr& field, + const std::shared_ptr& data) : + field_(field), + data_(data) {} + +Status Column::ValidateData() { + for (int i = 0; i < data_->num_chunks(); ++i) { + const std::shared_ptr& type = data_->chunk(i)->type(); + if (!this->type()->Equals(type)) { + std::stringstream ss; + ss << "In chunk " << i << " expected type " + << this->type()->ToString() + << " but saw " + << type->ToString(); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h new file mode 100644 index 00000000000..9e9064e8654 --- /dev/null +++ b/cpp/src/arrow/table/column.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TABLE_COLUMN_H +#define ARROW_TABLE_COLUMN_H + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/field.h" + +namespace arrow { + +typedef std::vector > ArrayVector; + +// A data structure managing a list of primitive Arrow arrays logically as one +// large array +class ChunkedArray { + public: + explicit ChunkedArray(const ArrayVector& chunks); + + // @returns: the total length of the chunked array; computed on construction + int64_t length() const { + return length_; + } + + int64_t null_count() const { + return null_count_; + } + + int num_chunks() const { + return chunks_.size(); + } + + const std::shared_ptr& chunk(int i) const { + return chunks_[i]; + } + + protected: + ArrayVector chunks_; + int64_t length_; + int64_t null_count_; +}; + +// An immutable column data structure consisting of a field (type metadata) and +// a logical chunked data array (which can be validated as all being the same +// type). 
+class Column { + public: + Column(const std::shared_ptr& field, const ArrayVector& chunks); + Column(const std::shared_ptr& field, + const std::shared_ptr& data); + + int64_t length() const { + return data_->length(); + } + + int64_t null_count() const { + return data_->null_count(); + } + + // @returns: the column's name in the passed metadata + const std::string& name() const { + return field_->name; + } + + // @returns: the column's type according to the metadata + const std::shared_ptr& type() const { + return field_->type; + } + + // @returns: the column's data as a chunked logical array + const std::shared_ptr& data() const { + return data_; + } + // Verify that the column's array data is consistent with the passed field's + // metadata + Status ValidateData(); + + protected: + std::shared_ptr field_; + std::shared_ptr data_; +}; + +} // namespace arrow + +#endif // ARROW_TABLE_COLUMN_H diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 492eee52b04..ff145e2c1e3 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -19,4 +19,16 @@ namespace arrow { +const std::shared_ptr BOOL = std::make_shared(); +const std::shared_ptr UINT8 = std::make_shared(); +const std::shared_ptr UINT16 = std::make_shared(); +const std::shared_ptr UINT32 = std::make_shared(); +const std::shared_ptr UINT64 = std::make_shared(); +const std::shared_ptr INT8 = std::make_shared(); +const std::shared_ptr INT16 = std::make_shared(); +const std::shared_ptr INT32 = std::make_shared(); +const std::shared_ptr INT64 = std::make_shared(); +const std::shared_ptr FLOAT = std::make_shared(); +const std::shared_ptr DOUBLE = std::make_shared(); + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 04cdb52b535..4193a0e8bc8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -142,10 +142,15 @@ struct DataType { nullable(nullable) {} virtual bool Equals(const DataType* other) { + // Call with a pointer so more friendly to subclasses return this == other || (this->type == other->type && this->nullable == other->nullable); } + bool Equals(const std::shared_ptr& other) { + return Equals(other.get()); + } + virtual std::string ToString() const = 0; }; @@ -244,6 +249,18 @@ struct DoubleType : public PrimitiveType { PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); }; +extern const std::shared_ptr BOOL; +extern const std::shared_ptr UINT8; +extern const std::shared_ptr UINT16; +extern const std::shared_ptr UINT32; +extern const std::shared_ptr UINT64; +extern const std::shared_ptr INT8; +extern const std::shared_ptr INT16; +extern const std::shared_ptr INT32; +extern const std::shared_ptr INT64; +extern const std::shared_ptr FLOAT; +extern const std::shared_ptr DOUBLE; + } // namespace arrow #endif // ARROW_TYPE_H diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 1fc83536db8..f39fe5c4d81 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -132,7 +132,7 @@ class ListBuilder : public Int32Builder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(T* values, int32_t length, uint8_t* null_bytes = nullptr) { + Status Append(value_type* values, int32_t length, uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 
49040fb6626..09d43e7ec8b 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -60,7 +60,7 @@ class PrimitiveArray : public Array { template class PrimitiveArrayImpl : public PrimitiveArray { public: - typedef typename TypeClass::c_type T; + typedef typename TypeClass::c_type value_type; PrimitiveArrayImpl() : PrimitiveArray() {} @@ -81,9 +81,11 @@ class PrimitiveArrayImpl : public PrimitiveArray { return PrimitiveArray::Equals(*static_cast(&other)); } - const T* raw_data() const { return reinterpret_cast(raw_data_);} + const value_type* raw_data() const { + return reinterpret_cast(raw_data_); + } - T Value(int i) const { + value_type Value(int i) const { return raw_data()[i]; } @@ -96,12 +98,12 @@ class PrimitiveArrayImpl : public PrimitiveArray { template class PrimitiveBuilder : public ArrayBuilder { public: - typedef typename Type::c_type T; + typedef typename Type::c_type value_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), values_(nullptr) { - elsize_ = sizeof(T); + elsize_ = sizeof(value_type); } virtual ~PrimitiveBuilder() {} @@ -141,7 +143,7 @@ class PrimitiveBuilder : public ArrayBuilder { } // Scalar append - Status Append(T val, bool is_null = false) { + Status Append(value_type val, bool is_null = false) { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); @@ -158,7 +160,7 @@ class PrimitiveBuilder : public ArrayBuilder { // // If passed, null_bytes is of equal length to values, and any nonzero byte // will be considered as a null for that slot - Status Append(const T* values, int32_t length, + Status Append(const value_type* values, int32_t length, const uint8_t* null_bytes = nullptr) { if (length_ + length > capacity_) { int32_t new_capacity = util::next_power2(length_ + length); @@ -215,8 +217,8 @@ class PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - T* raw_buffer() { - return reinterpret_cast(values_->mutable_data()); + value_type* raw_buffer() { + return reinterpret_cast(values_->mutable_data()); } std::shared_ptr buffer() const { diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 841f617a313..5e7197f9012 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -33,6 +33,10 @@ static inline int64_t ceil_byte(int64_t size) { return (size + 7) & ~7; } +static inline int64_t bytes_for_bits(int64_t size) { + return ceil_byte(size) / 8; +} + static inline int64_t ceil_2bytes(int64_t size) { return (size + 15) & ~15; } From 612fbc74ece160a52edbd260de8391aa07ad00ca Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 4 Mar 2016 17:59:58 -0800 Subject: [PATCH 019/210] ARROW-24: C++: Implement a logical Table container type A table enables us to interpret a collection of Arrow arrays as a logical table or "data frame"-like structure. Each column may consist of one or more "primitive" Arrow memory containers. Note that this currently has the limitation that the table column names must be strings. At least, this is consistent with most storage media and up-stack table implementations (e.g. R's data.frame). 
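As a concrete sketch of the API this adds, the snippet below assembles and validates a two-column Table the same way the MakeExample1 fixture and Ctors test in the table-test.cc further down do. It is an illustration rather than part of the patch: MakePrimitive<T> stands in for the TestBase helper defined in test-common.h, and the Int32Array/UInt8Array template arguments are assumed from the test fixtures.

```cpp
#include <cassert>
#include <memory>
#include <vector>

#include "arrow/field.h"
#include "arrow/table/column.h"
#include "arrow/table/schema.h"
#include "arrow/table/table.h"
#include "arrow/types/integer.h"
#include "arrow/util/status.h"

using namespace arrow;

std::unique_ptr<Table> MakeExampleTable(int length) {
  // Fields carry the type metadata; INT32 and UINT8 are the global type
  // constants introduced in ARROW-23.
  auto f0 = std::make_shared<Field>("f0", INT32);
  auto f1 = std::make_shared<Field>("f1", UINT8);
  auto schema = std::make_shared<Schema>(
      std::vector<std::shared_ptr<Field> >({f0, f1}));

  // Each Column pairs a field with a (here single-chunk) data array.
  std::vector<std::shared_ptr<Column> > columns = {
      std::make_shared<Column>(f0, MakePrimitive<Int32Array>(length)),
      std::make_shared<Column>(f1, MakePrimitive<UInt8Array>(length))};

  std::unique_ptr<Table> table(new Table("data", schema, columns));
  // Construction is cheap and unchecked; consistency is verified
  // explicitly, exactly as the tests do after every constructor call.
  assert(table->ValidateColumns().ok());
  return table;
}
```

Keeping validation out of the constructors means an inconsistent arrangement (wrong column count, mismatched lengths) surfaces as Status::Invalid from ValidateColumns() rather than as a construction failure.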
Currently this is somewhat limited in the arrangement of data (a vector of chunked columns -- the columns may contain only one data chunk) -- since a Table might be assembled from a vector of row batches (coming across the wire), "pivoting" the row batches might have performance implications that we can examine further on down the road. Author: Wes McKinney Closes #16 from wesm/ARROW-24 and squashes the following commits: b701c76 [Wes McKinney] Test case for wrong number of columns passed 5faa5ac [Wes McKinney] cpplint 9a651cb [Wes McKinney] Basic table prototype. Move Schema code under arrow/table --- cpp/CMakeLists.txt | 1 - cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/table/CMakeLists.txt | 4 + cpp/src/arrow/table/column-test.cc | 37 ++----- cpp/src/arrow/table/column.cc | 6 ++ cpp/src/arrow/table/column.h | 2 + cpp/src/arrow/{ => table}/schema-test.cc | 2 +- cpp/src/arrow/{ => table}/schema.cc | 2 +- cpp/src/arrow/{ => table}/schema.h | 0 cpp/src/arrow/table/table-test.cc | 125 +++++++++++++++++++++++ cpp/src/arrow/table/table.cc | 73 +++++++++++++ cpp/src/arrow/table/table.h | 82 +++++++++++++++ cpp/src/arrow/table/test-common.h | 55 ++++++++++ 13 files changed, 358 insertions(+), 32 deletions(-) rename cpp/src/arrow/{ => table}/schema-test.cc (99%) rename cpp/src/arrow/{ => table}/schema.cc (98%) rename cpp/src/arrow/{ => table}/schema.h (100%) create mode 100644 cpp/src/arrow/table/table-test.cc create mode 100644 cpp/src/arrow/table/table.cc create mode 100644 cpp/src/arrow/table/table.h create mode 100644 cpp/src/arrow/table/test-common.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 15afb1acf67..8042661533e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -470,7 +470,6 @@ set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc src/arrow/field.cc - src/arrow/schema.cc src/arrow/type.cc ) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 04f8dd1f908..77326ce38d7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -30,4 +30,3 @@ install(FILES set(ARROW_TEST_LINK_LIBS arrow_test_util ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) -ADD_ARROW_TEST(schema-test) diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt index a401622d2e0..b51258ffd8b 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -21,6 +21,8 @@ set(TABLE_SRCS column.cc + schema.cc + table.cc ) set(TABLE_LIBS @@ -37,3 +39,5 @@ install(FILES DESTINATION include/arrow/table) ADD_ARROW_TEST(column-test) +ADD_ARROW_TEST(schema-test) +ADD_ARROW_TEST(table-test) diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc index 15f554f4632..4959b82c6e2 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/table/column-test.cc @@ -22,48 +22,29 @@ #include #include "arrow/field.h" -#include "arrow/schema.h" #include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/table/test-common.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/types/integer.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" using std::shared_ptr; using std::vector; namespace arrow { -class TestColumn : public ::testing::Test { - public: - void SetUp() { - pool_ = GetDefaultMemoryPool(); - } - - template - std::shared_ptr MakeArray(int32_t length, int32_t null_count = 0) { - auto data = std::make_shared(pool_); - auto nulls = 
std::make_shared(pool_); - data->Resize(length * sizeof(typename ArrayType::value_type)); - nulls->Resize(util::bytes_for_bits(length)); - return std::make_shared(length, data, 10, nulls); - } - +class TestColumn : public TestBase { protected: - MemoryPool* pool_; - std::shared_ptr data_; std::unique_ptr column_; }; TEST_F(TestColumn, BasicAPI) { ArrayVector arrays; - arrays.push_back(MakeArray(100)); - arrays.push_back(MakeArray(100, 10)); - arrays.push_back(MakeArray(100, 20)); + arrays.push_back(MakePrimitive(100)); + arrays.push_back(MakePrimitive(100, 10)); + arrays.push_back(MakePrimitive(100, 20)); auto field = std::make_shared("c0", INT32); column_.reset(new Column(field, arrays)); @@ -77,15 +58,15 @@ TEST_F(TestColumn, BasicAPI) { TEST_F(TestColumn, ChunksInhomogeneous) { ArrayVector arrays; - arrays.push_back(MakeArray(100)); - arrays.push_back(MakeArray(100, 10)); + arrays.push_back(MakePrimitive(100)); + arrays.push_back(MakePrimitive(100, 10)); auto field = std::make_shared("c0", INT32); column_.reset(new Column(field, arrays)); ASSERT_OK(column_->ValidateData()); - arrays.push_back(MakeArray(100, 10)); + arrays.push_back(MakePrimitive(100, 10)); column_.reset(new Column(field, arrays)); ASSERT_RAISES(Invalid, column_->ValidateData()); } diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc index 82750cf4d43..d68b491fb99 100644 --- a/cpp/src/arrow/table/column.cc +++ b/cpp/src/arrow/table/column.cc @@ -39,6 +39,12 @@ Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) : data_ = std::make_shared(chunks); } +Column::Column(const std::shared_ptr& field, + const std::shared_ptr& data) : + field_(field) { + data_ = std::make_shared(ArrayVector({data})); +} + Column::Column(const std::shared_ptr& field, const std::shared_ptr& data) : field_(field), diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h index 9e9064e8654..64423bf9561 100644 --- a/cpp/src/arrow/table/column.h +++ b/cpp/src/arrow/table/column.h @@ -67,6 +67,8 @@ class Column { Column(const std::shared_ptr& field, const std::shared_ptr& data); + Column(const std::shared_ptr& field, const std::shared_ptr& data); + int64_t length() const { return data_->length(); } diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/table/schema-test.cc similarity index 99% rename from cpp/src/arrow/schema-test.cc rename to cpp/src/arrow/table/schema-test.cc index 7c190d068c2..0cf1b3c5f9a 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/table/schema-test.cc @@ -21,7 +21,7 @@ #include #include "arrow/field.h" -#include "arrow/schema.h" +#include "arrow/table/schema.h" #include "arrow/type.h" #include "arrow/types/string.h" diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/table/schema.cc similarity index 98% rename from cpp/src/arrow/schema.cc rename to cpp/src/arrow/table/schema.cc index a735fd3d230..fb3b4d6f292 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/table/schema.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/schema.h" +#include "arrow/table/schema.h" #include #include diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/table/schema.h similarity index 100% rename from cpp/src/arrow/schema.h rename to cpp/src/arrow/table/schema.h diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc new file mode 100644 index 00000000000..dd4f74cd16f --- /dev/null +++ b/cpp/src/arrow/table/table-test.cc @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include + +#include "arrow/field.h" +#include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/table/table.h" +#include "arrow/table/test-common.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/types/integer.h" + +using std::shared_ptr; +using std::vector; + +namespace arrow { + +class TestTable : public TestBase { + public: + void MakeExample1(int length) { + auto f0 = std::make_shared("f0", INT32); + auto f1 = std::make_shared("f1", UINT8); + auto f2 = std::make_shared("f2", INT16); + + vector > fields = {f0, f1, f2}; + schema_ = std::make_shared(fields); + + columns_ = { + std::make_shared(schema_->field(0), MakePrimitive(length)), + std::make_shared(schema_->field(1), MakePrimitive(length)), + std::make_shared(schema_->field(2), MakePrimitive(length)) + }; + } + + protected: + std::unique_ptr table_; + shared_ptr schema_; + vector > columns_; +}; + +TEST_F(TestTable, EmptySchema) { + auto empty_schema = shared_ptr(new Schema({})); + table_.reset(new Table("data", empty_schema, columns_)); + ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(0, table_->num_rows()); + ASSERT_EQ(0, table_->num_columns()); +} + +TEST_F(TestTable, Ctors) { + int length = 100; + MakeExample1(length); + + std::string name = "data"; + + table_.reset(new Table(name, schema_, columns_)); + ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(name, table_->name()); + ASSERT_EQ(length, table_->num_rows()); + ASSERT_EQ(3, table_->num_columns()); + + table_.reset(new Table(name, schema_, columns_, length)); + ASSERT_OK(table_->ValidateColumns()); + ASSERT_EQ(name, table_->name()); + ASSERT_EQ(length, table_->num_rows()); +} + +TEST_F(TestTable, Metadata) { + int length = 100; + MakeExample1(length); + + std::string name = "data"; + table_.reset(new Table(name, schema_, columns_)); + + ASSERT_TRUE(table_->schema()->Equals(schema_)); + + auto col = table_->column(0); + ASSERT_EQ(schema_->field(0)->name, col->name()); + ASSERT_EQ(schema_->field(0)->type, col->type()); +} + +TEST_F(TestTable, InvalidColumns) { + // Check that columns are all the same length + int length = 100; + MakeExample1(length); + + table_.reset(new Table("data", schema_, columns_, length - 1)); + ASSERT_RAISES(Invalid, 
table_->ValidateColumns()); + + columns_.clear(); + + // Wrong number of columns + table_.reset(new Table("data", schema_, columns_, length)); + ASSERT_RAISES(Invalid, table_->ValidateColumns()); + + columns_ = { + std::make_shared(schema_->field(0), MakePrimitive(length)), + std::make_shared(schema_->field(1), MakePrimitive(length)), + std::make_shared(schema_->field(2), MakePrimitive(length - 1)) + }; + + table_.reset(new Table("data", schema_, columns_, length)); + ASSERT_RAISES(Invalid, table_->ValidateColumns()); +} + +} // namespace arrow diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table/table.cc new file mode 100644 index 00000000000..4cefc924ed3 --- /dev/null +++ b/cpp/src/arrow/table/table.cc @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/table/table.h" + +#include +#include + +#include "arrow/field.h" +#include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/util/status.h" + +namespace arrow { + +Table::Table(const std::string& name, const std::shared_ptr& schema, + const std::vector >& columns) : + name_(name), + schema_(schema), + columns_(columns) { + if (columns.size() == 0) { + num_rows_ = 0; + } else { + num_rows_ = columns[0]->length(); + } +} + +Table::Table(const std::string& name, const std::shared_ptr& schema, + const std::vector >& columns, int64_t num_rows) : + name_(name), + schema_(schema), + columns_(columns), + num_rows_(num_rows) {} + +Status Table::ValidateColumns() const { + if (num_columns() != schema_->num_fields()) { + return Status::Invalid("Number of columns did not match schema"); + } + + if (columns_.size() == 0) { + return Status::OK(); + } + + // Make sure columns are all the same length + for (size_t i = 0; i < columns_.size(); ++i) { + const Column* col = columns_[i].get(); + if (col->length() != num_rows_) { + std::stringstream ss; + ss << "Column " << i << " expected length " + << num_rows_ + << " but got length " + << col->length(); + return Status::Invalid(ss.str()); + } + } + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/table/table.h b/cpp/src/arrow/table/table.h new file mode 100644 index 00000000000..b0129387b71 --- /dev/null +++ b/cpp/src/arrow/table/table.h @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TABLE_TABLE_H +#define ARROW_TABLE_TABLE_H + +#include +#include +#include + +namespace arrow { + +class Column; +class Schema; +class Status; + +// Immutable container of fixed-length columns conforming to a particular schema +class Table { + public: + // If columns is zero-length, the table's number of rows is zero + Table(const std::string& name, const std::shared_ptr& schema, + const std::vector >& columns); + + Table(const std::string& name, const std::shared_ptr& schema, + const std::vector >& columns, int64_t num_rows); + + // @returns: the table's name, if any (may be length 0) + const std::string& name() const { + return name_; + } + + // @returns: the table's schema + const std::shared_ptr& schema() const { + return schema_; + } + + // Note: Does not boundscheck + // @returns: the i-th column + const std::shared_ptr& column(int i) const { + return columns_[i]; + } + + // @returns: the number of columns in the table + int num_columns() const { + return columns_.size(); + } + + // @returns: the number of rows (the corresponding length of each column) + int64_t num_rows() const { + return num_rows_; + } + + // After construction, perform any checks to validate the input arguments + Status ValidateColumns() const; + + private: + // The table's name, optional + std::string name_; + + std::shared_ptr schema_; + std::vector > columns_; + + int64_t num_rows_; +}; + +} // namespace arrow + +#endif // ARROW_TABLE_TABLE_H diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h new file mode 100644 index 00000000000..efe2f228cd0 --- /dev/null +++ b/cpp/src/arrow/table/test-common.h @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include + +#include "arrow/field.h" +#include "arrow/table/column.h" +#include "arrow/table/schema.h" +#include "arrow/table/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" + +namespace arrow { + +class TestBase : public ::testing::Test { + public: + void SetUp() { + pool_ = GetDefaultMemoryPool(); + } + + template + std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); + EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); + EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); + return std::make_shared(length, data, 10, nulls); + } + + protected: + MemoryPool* pool_; +}; + +} // namespace arrow From 572cdf22e3595035966a05a5ec2398f9d29df669 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Mar 2016 14:42:32 -0800 Subject: [PATCH 020/210] ARROW-7: Add barebones Python library build toolchain This patch provides no actual functionality; it only builds an empty Cython extension that links to libarrow.so. I will hook this into Travis CI at some later time. I have adapted a limited amount of BSD (2- or 3-clause) or Apache 2.0 3rd-party code (particularly the cmake/Cython integration) to bootstrap this Python package / build setup in accordance with http://www.apache.org/legal/resolved.html. I have noted the relevant copyright holders and licenses in `python/LICENSE.txt`. In particular, I expect to continue to refactor and reuse occasional utility code from pandas (https://github.com/pydata/pandas) as practical. Since a significant amount of "glue code" will need to be written to marshal between Arrow data and pure Python / NumPy / pandas objects, to get started I've adopted the approach used by libdynd/dynd-python -- a C++ "glue library" that is then called from Cython to provide a Python user interface. This will allow us to build shims as necessary to abstract away complications that leak through (for example: enabling C++ code with no knowledge of Python to invoke Python functions). Let's see how this goes: there are other options, like Boost::Python, but Cython + shim code is a more lightweight and flexible solution for the moment. 
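To make the shim idea concrete, here is a minimal sketch of the kind of C++ glue entry point this layout calls for. The names and body are illustrative assumptions, not the actual contents of python/src/pyarrow/init.h and init.cc:

```cpp
// Hypothetical glue header: the only surface Cython needs to see.
// The Cython side would declare it along the lines of
//   cdef extern from "pyarrow/init.h" namespace "pyarrow":
//       void init()
// and call it once at module import time.
namespace pyarrow {

// One-time setup for the C++/Python marshalling layer (illustrative).
void init();

}  // namespace pyarrow

// Hypothetical glue implementation.
namespace pyarrow {

void init() {
  // Placeholder: initialize whatever state the shims need, e.g. hooks
  // that let C++ code invoke Python callbacks without libarrow itself
  // linking against Python.
}

}  // namespace pyarrow
```

Because the glue layer is plain C++, it can be exercised by an ordinary C++ test runner (note util/test_main.cc in the file list below) while all Python-object conversion stays on the Cython side.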
Author: Wes McKinney Closes #17 from wesm/ARROW-7 and squashes the following commits: be059a2 [Wes McKinney] Nest arrow::py namespace 3ad3143 [Wes McKinney] Add preliminary Python development toolchain --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/table/CMakeLists.txt | 3 + python/.gitignore | 37 ++ python/CMakeLists.txt | 464 +++++++++++++++++++ python/LICENSE.txt | 88 ++++ python/README.md | 14 + python/arrow/__init__.py | 0 python/arrow/compat.py | 86 ++++ python/arrow/config.pyx | 8 + python/arrow/includes/__init__.pxd | 0 python/arrow/includes/arrow.pxd | 23 + python/arrow/includes/common.pxd | 34 ++ python/arrow/includes/parquet.pxd | 51 ++ python/arrow/includes/pyarrow.pxd | 23 + python/arrow/parquet.pyx | 23 + python/arrow/tests/__init__.py | 0 python/cmake_modules/CompilerInfo.cmake | 48 ++ python/cmake_modules/FindArrow.cmake | 77 +++ python/cmake_modules/FindCython.cmake | 30 ++ python/cmake_modules/FindNumPy.cmake | 100 ++++ python/cmake_modules/FindPythonLibsNew.cmake | 236 ++++++++++ python/cmake_modules/UseCython.cmake | 164 +++++++ python/setup.py | 244 ++++++++++ python/src/pyarrow/CMakeLists.txt | 20 + python/src/pyarrow/api.h | 21 + python/src/pyarrow/init.cc | 29 ++ python/src/pyarrow/init.h | 31 ++ python/src/pyarrow/util/CMakeLists.txt | 53 +++ python/src/pyarrow/util/test_main.cc | 26 ++ 29 files changed, 1934 insertions(+) create mode 100644 python/.gitignore create mode 100644 python/CMakeLists.txt create mode 100644 python/LICENSE.txt create mode 100644 python/README.md create mode 100644 python/arrow/__init__.py create mode 100644 python/arrow/compat.py create mode 100644 python/arrow/config.pyx create mode 100644 python/arrow/includes/__init__.pxd create mode 100644 python/arrow/includes/arrow.pxd create mode 100644 python/arrow/includes/common.pxd create mode 100644 python/arrow/includes/parquet.pxd create mode 100644 python/arrow/includes/pyarrow.pxd create mode 100644 python/arrow/parquet.pyx create mode 100644 python/arrow/tests/__init__.py create mode 100644 python/cmake_modules/CompilerInfo.cmake create mode 100644 python/cmake_modules/FindArrow.cmake create mode 100644 python/cmake_modules/FindCython.cmake create mode 100644 python/cmake_modules/FindNumPy.cmake create mode 100644 python/cmake_modules/FindPythonLibsNew.cmake create mode 100644 python/cmake_modules/UseCython.cmake create mode 100644 python/setup.py create mode 100644 python/src/pyarrow/CMakeLists.txt create mode 100644 python/src/pyarrow/api.h create mode 100644 python/src/pyarrow/init.cc create mode 100644 python/src/pyarrow/init.h create mode 100644 python/src/pyarrow/util/CMakeLists.txt create mode 100644 python/src/pyarrow/util/test_main.cc diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 77326ce38d7..102a8a1853f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,6 +20,7 @@ install(FILES api.h array.h builder.h + field.h type.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt index b51258ffd8b..68bf3148a98 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -36,6 +36,9 @@ SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX) # Headers: top level install(FILES + column.h + schema.h + table.h DESTINATION include/arrow/table) ADD_ARROW_TEST(column-test) diff --git a/python/.gitignore b/python/.gitignore new file mode 100644 index 00000000000..80103a1a529 --- /dev/null +++ b/python/.gitignore @@ -0,0 +1,37 @@ 
+thirdparty/ +CMakeFiles/ +CMakeCache.txt +CTestTestfile.cmake +Makefile +cmake_install.cmake +build/ +Testing/ + +# Python stuff + +# Editor temporary/working/backup files +*flymake* + +# Compiled source +*.a +*.dll +*.o +*.py[ocd] +*.so +.build_cache_dir +MANIFEST + +# Generated sources +*.c +*.cpp +# Python files + +# setup.py working directory +build +# setup.py dist directory +dist +# Egg metadata +*.egg-info +# coverage +.coverage +coverage.xml diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt new file mode 100644 index 00000000000..df55bfac9eb --- /dev/null +++ b/python/CMakeLists.txt @@ -0,0 +1,464 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Includes code assembled from BSD/MIT/Apache-licensed code from some 3rd-party +# projects, including Kudu, Impala, and libdynd. See python/LICENSE.txt + +cmake_minimum_required(VERSION 2.7) +project(pyarrow) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules") + +# Use common cmake modules from Arrow C++ if available +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/../cpp/cmake_modules") + +include(CMakeParseArguments) + +set(BUILD_SUPPORT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../cpp/build-support) + +# Allow "make install" to not depend on all targets. +# +# Must be declared in the top-level CMakeLists.txt. +set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) + +set(CMAKE_MACOSX_RPATH 1) +set(CMAKE_OSX_DEPLOYMENT_TARGET 10.9) + +# Generate a Clang compile_commands.json "compilation database" file for use +# with various development tools, such as Vim's YouCompleteMe plugin. +# See http://clang.llvm.org/docs/JSONCompilationDatabase.html +if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") + set(CMAKE_EXPORT_COMPILE_COMMANDS 1) +endif() + +############################################################ +# Compiler flags +############################################################ + +# compiler flags that are common across debug/release builds +set(CXX_COMMON_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall") + +# compiler flags for different build types (run 'cmake -DCMAKE_BUILD_TYPE= .') +# For all builds: +# For CMAKE_BUILD_TYPE=Debug +# -ggdb: Enable gdb debugging +# For CMAKE_BUILD_TYPE=FastDebug +# Same as DEBUG, except with some optimizations on. +# For CMAKE_BUILD_TYPE=Release +# -O3: Enable all compiler optimizations +# -g: Enable symbols for profiler tools (TODO: remove for shipping) +# -DNDEBUG: Turn off dchecks/asserts/debug only code. 
+set(CXX_FLAGS_DEBUG "-ggdb -O0")
+set(CXX_FLAGS_FASTDEBUG "-ggdb -O1")
+set(CXX_FLAGS_RELEASE "-O3 -g -DNDEBUG")
+
+# If no build type is specified, default to debug builds
+if (NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE Debug)
+endif(NOT CMAKE_BUILD_TYPE)
+
+string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
+
+# Set compile flags based on the build type.
+message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})")
+if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG")
+  set(CMAKE_CXX_FLAGS ${CXX_FLAGS_DEBUG})
+elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG")
+  set(CMAKE_CXX_FLAGS ${CXX_FLAGS_FASTDEBUG})
+elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
+  set(CMAKE_CXX_FLAGS ${CXX_FLAGS_RELEASE})
+else()
+  message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}")
+endif ()
+
+# Add common flags
+set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}")
+
+# Determine compiler version
+include(CompilerInfo)
+
+if ("${COMPILER_FAMILY}" STREQUAL "clang")
+  # Using Clang with ccache causes a bunch of spurious warnings that are
+  # purportedly fixed in the next version of ccache. See the following for details:
+  #
+  #   http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html
+  #   http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments")
+endif()
+
+set(PYARROW_LINK "a")
+
+# For any C code, use the same flags.
+set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS}")
+
+# Code coverage
+if ("${PYARROW_GENERATE_COVERAGE}")
+  if("${CMAKE_CXX_COMPILER}" MATCHES ".*clang.*")
+    # There appear to be some bugs in clang 3.3 which cause code coverage
+    # to have link errors, not locating the llvm_gcda_* symbols.
+    # This should be fixed in llvm 3.4 with http://llvm.org/viewvc/llvm-project?view=revision&revision=184666
+    message(SEND_ERROR "Cannot currently generate coverage with clang")
+  endif()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage -DCOVERAGE_BUILD")
+
+  # For coverage to work properly, we need to use static linkage. Otherwise,
+  # __gcov_flush() doesn't properly flush coverage from every module.
+  # See http://stackoverflow.com/questions/28164543/using-gcov-flush-within-a-library-doesnt-force-the-other-modules-to-yield-gc
+  if("${PYARROW_LINK}" STREQUAL "a")
+    message("Using static linking for coverage build")
+    set(PYARROW_LINK "s")
+  elseif("${PYARROW_LINK}" STREQUAL "d")
+    message(SEND_ERROR "Cannot use coverage with dynamic linking")
+  endif()
+endif()
+
+# If we still don't know what kind of linking to perform, choose based on
+# build type (developers like fast builds).
+if ("${PYARROW_LINK}" STREQUAL "a")
+  if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG" OR
+      "${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG")
+    message("Using dynamic linking for ${CMAKE_BUILD_TYPE} builds")
+    set(PYARROW_LINK "d")
+  else()
+    message("Using static linking for ${CMAKE_BUILD_TYPE} builds")
+    set(PYARROW_LINK "s")
+  endif()
+endif()
+
+# Are we using the gold linker? It doesn't work with dynamic linking as
+# weak symbols aren't properly overridden, causing tcmalloc to be omitted.
+# Let's flag this as an error in RELEASE builds (we shouldn't release a
+# product like this).
+#
+# See https://sourceware.org/bugzilla/show_bug.cgi?id=16979 for details.
+#
+# The gold linker is only for ELF binaries, which OSX doesn't use. We can
+# just skip.
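The gold check that follows asks the compiler driver which linker it would use and greps the banner. Run by hand, the equivalent probe looks roughly like this (the `c++` driver name mirrors the execute_process call below; this is a sketch, not part of the patch):

```python
# Sketch: replicate the linker detection below -- ask the C++ driver for
# the linker's version banner and look for "gold" in it.
import subprocess

out = subprocess.run(['c++', '-Wl,--version'],
                     capture_output=True, text=True).stdout
print('gold' if 'gold' in out else 'ld')
```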
+if (NOT APPLE)
+  execute_process(COMMAND ${CMAKE_CXX_COMPILER} -Wl,--version OUTPUT_VARIABLE LINKER_OUTPUT)
+endif ()
+if (LINKER_OUTPUT MATCHES "gold")
+  if ("${PYARROW_LINK}" STREQUAL "d" AND
+      "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
+    message(SEND_ERROR "Cannot use gold with dynamic linking in a RELEASE build "
+      "as it would cause tcmalloc symbols to get dropped")
+  else()
+    message("Using gold linker")
+  endif()
+  set(PYARROW_USING_GOLD 1)
+else()
+  message("Using ld linker")
+endif()
+
+# Having set PYARROW_LINK due to build type and/or sanitizer, it's now safe to
+# act on its value.
+if ("${PYARROW_LINK}" STREQUAL "d")
+  set(BUILD_SHARED_LIBS ON)
+
+  # Position independent code is only necessary when producing shared objects.
+  add_definitions(-fPIC)
+endif()
+
+# Set compile output directory
+string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME)
+
+# If building in-source, create the 'latest' symlink. If building
+# out-of-source (which is preferred), simply output the binaries in the
+# build folder.
+if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR})
+  set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}/")
+  # Link build/latest to the current build directory, to avoid developers
+  # accidentally running the latest debug build when in fact they're building
+  # release builds.
+  FILE(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY})
+  if (NOT APPLE)
+    set(MORE_ARGS "-T")
+  endif()
+  EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY}
+    ${CMAKE_CURRENT_BINARY_DIR}/build/latest)
+else()
+  set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}")
+  # set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/")
+endif()
+
+# Where to put generated archives (.a files)
+set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
+set(ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
+
+# Where to put generated libraries (.so files)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
+set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
+
+# Where to put generated binaries
+set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")
+
+## Python and libraries
+find_package(PythonLibsNew REQUIRED)
+include(UseCython)
+
+include_directories(SYSTEM
+  src)
+
+############################################################
+# Testing
+############################################################
+
+# Add a new test case, with or without an executable that should be built.
+#
+# REL_TEST_NAME is the name of the test. It may be a single component
+# (e.g. monotime-test) or contain additional components (e.g.
+# net/net_util-test). Either way, the last component must be a globally
+# unique name.
+#
+# Arguments after the test name will be passed to set_tests_properties().
+function(ADD_PYARROW_TEST REL_TEST_NAME)
+  if(NO_TESTS)
+    return()
+  endif()
+  get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE)
+
+  if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc)
+    # This test has a corresponding .cc file, set it up as an executable.
+    set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}")
+    add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc")
+    target_link_libraries(${TEST_NAME} ${PYARROW_TEST_LINK_LIBS})
+  else()
+    # No executable, just invoke the test (probably a script) directly.
+ set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) + endif() + + add_test(${TEST_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH}) + if(ARGN) + set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with NO_TESTS. +function(ADD_PYARROW_TEST_DEPENDENCIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + add_dependencies(${TEST_NAME} ${ARGN}) +endfunction() + +enable_testing() + +############################################################ +# Dependencies +############################################################ +function(ADD_THIRDPARTY_LIB LIB_NAME) + set(options) + set(one_value_args SHARED_LIB STATIC_LIB) + set(multi_value_args DEPS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(("${PYARROW_LINK}" STREQUAL "s" AND ARG_STATIC_LIB) OR (NOT ARG_SHARED_LIB)) + if(NOT ARG_STATIC_LIB) + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") + endif() + add_library(${LIB_NAME} STATIC IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + else() + add_library(${LIB_NAME} SHARED IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + endif() + + if(ARG_DEPS) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() + + # Set up an "exported variant" for this thirdparty library (see "Visibility" + # above). It's the same as the real target, just with an "_exported" suffix. + # We prefer the static archive if it exists (as it's akin to an "internal" + # library), but we'll settle for the shared object if we must. + # + # A shared object exported variant will force any "leaf" library that + # transitively depends on it to also depend on it at runtime; this is + # desirable for some libraries (e.g. cyrus_sasl). 
+ set(LIB_NAME_EXPORTED ${LIB_NAME}_exported) + if(ARG_STATIC_LIB) + add_library(${LIB_NAME_EXPORTED} STATIC IMPORTED) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + else() + add_library(${LIB_NAME_EXPORTED} SHARED IMPORTED) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + endif() + if(ARG_DEPS) + set_target_properties(${LIB_NAME_EXPORTED} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() +endfunction() + +## GMock +find_package(GTest REQUIRED) +include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) + +## Arrow +find_package(Arrow REQUIRED) +include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(arrow + SHARED_LIB ${ARROW_SHARED_LIB}) + +############################################################ +# Linker setup +############################################################ + +set(PYARROW_MIN_TEST_LIBS + pyarrow_test_main + pyarrow) + +set(PYARROW_MIN_TEST_LIBS + pyarrow_test_main + pyarrow + ${PYARROW_BASE_LIBS}) + +set(PYARROW_TEST_LINK_LIBS ${PYARROW_MIN_TEST_LIBS}) + +############################################################ +# "make ctags" target +############################################################ +if (UNIX) + add_custom_target(ctags ctags -R --languages=c++,c --exclude=thirdparty/installed) +endif (UNIX) + +############################################################ +# "make etags" target +############################################################ +if (UNIX) + add_custom_target(tags etags --members --declarations + `find ${CMAKE_CURRENT_SOURCE_DIR}/src + -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or -name \\*.h -or -name \\*.c -or + -name \\*.f`) + add_custom_target(etags DEPENDS tags) +endif (UNIX) + +############################################################ +# "make cscope" target +############################################################ +if (UNIX) + add_custom_target(cscope find ${CMAKE_CURRENT_SOURCE_DIR} + ( -name \\*.cc -or -name \\*.hh -or -name \\*.cpp -or + -name \\*.h -or -name \\*.c -or -name \\*.f ) + -exec echo \"{}\" \; > cscope.files && cscope -q -b VERBATIM) +endif (UNIX) + +############################################################ +# "make lint" target +############################################################ +if (UNIX) + # Full lint + add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py + --verbose=2 + --filter=-whitespace/comments,-readability/todo,-build/header_guard + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h`) +endif (UNIX) + +############################################################ +# Subdirectories +############################################################ + +add_subdirectory(src/pyarrow) +add_subdirectory(src/pyarrow/util) + +set(PYARROW_SRCS + src/pyarrow/init.cc +) + +set(LINK_LIBS + pyarrow_util + arrow +) + +add_library(pyarrow SHARED + ${PYARROW_SRCS}) +target_link_libraries(pyarrow ${LINK_LIBS}) +set_target_properties(pyarrow PROPERTIES LINKER_LANGUAGE CXX) + +if(APPLE) + set_target_properties(pyarrow PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +endif() + +############################################################ +# Setup and build Cython modules +############################################################ + +foreach(pyx_api_file + arrow/config.pyx + arrow/parquet.pyx) + set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) +endforeach(pyx_api_file) + 
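The two .pyx files marked CYTHON_API above are the extension modules this build emits; per config.pyx later in this patch, the arrow.config module calls the C++ pyarrow_init() hook at module scope, so importing it is the only initialization a user performs. A sketch (assuming the built extension is on the Python path):

```python
# Sketch: importing the built extension triggers the module-level
# pyarrow_init() call in config.pyx; no explicit setup call is needed.
import arrow.config  # arrow::py::pyarrow_init() runs here, once per process
```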
+set(USE_RELATIVE_RPATH ON) +set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) + +set(CYTHON_EXTENSIONS + config + parquet +) + +foreach(module ${CYTHON_EXTENSIONS}) + string(REPLACE "." ";" directories ${module}) + list(GET directories -1 module_name) + list(REMOVE_AT directories -1) + + string(REPLACE "." "/" module_root "${module}") + set(module_SRC arrow/${module_root}.pyx) + set_source_files_properties(${module_SRC} PROPERTIES CYTHON_IS_CXX 1) + + cython_add_module(${module_name} + ${module_name}_pyx + ${module_name}_output + ${module_SRC}) + + if (directories) + string(REPLACE ";" "/" module_output_directory ${directories}) + set_target_properties(${module_name} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${module_output_directory}) + endif() + + if(APPLE) + set(module_install_rpath "@loader_path") + else() + set(module_install_rpath "$ORIGIN") + endif() + list(LENGTH directories i) + while(${i} GREATER 0) + set(module_install_rpath "${module_install_rpath}/..") + math(EXPR i "${i} - 1" ) + endwhile(${i} GREATER 0) + + # for inplace development for now + set(module_install_rpath "${CMAKE_SOURCE_DIR}/arrow/") + + set_target_properties(${module_name} PROPERTIES + INSTALL_RPATH ${module_install_rpath}) + target_link_libraries(${module_name} pyarrow) +endforeach(module) diff --git a/python/LICENSE.txt b/python/LICENSE.txt new file mode 100644 index 00000000000..078e144ded1 --- /dev/null +++ b/python/LICENSE.txt @@ -0,0 +1,88 @@ +## 3rd-party licenses for code that has been adapted for the Arrow Python + library + +------------------------------------------------------------------------------- +Some code from pandas has been adapted for this codebase. pandas is available +under the 3-clause BSD license, which follows: + +pandas license +============== + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +------------------------------------------------------------------------------- + +Some bits from DyND, in particular aspects of the build system, have been +adapted from libdynd and dynd-python under the terms of the BSD 2-clause +license + +The BSD 2-Clause License + + Copyright (C) 2011-12, Dynamic NDArray Developers + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Dynamic NDArray Developers list: + + * Mark Wiebe + * Continuum Analytics + +------------------------------------------------------------------------------- + +Some source code from Ibis (https://github.com/cloudera/ibis) has been adapted +for Arrow. Ibis is released under the Apache License, Version 2.0. diff --git a/python/README.md b/python/README.md new file mode 100644 index 00000000000..c79fa9786f4 --- /dev/null +++ b/python/README.md @@ -0,0 +1,14 @@ +## Python library for Apache Arrow + +This library provides a Pythonic API wrapper for the reference Arrow C++ +implementation, along with tools for interoperability with pandas, NumPy, and +other traditional Python scientific computing packages. + +#### Development details + +This project is layered in two pieces: + +* pyarrow, a C++ library for easier interoperability between Arrow C++, NumPy, + and pandas +* Cython extensions and pure Python code under arrow/ which expose Arrow C++ + and pyarrow to pure Python users \ No newline at end of file diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/arrow/compat.py b/python/arrow/compat.py new file mode 100644 index 00000000000..2ac41ac8abf --- /dev/null +++ b/python/arrow/compat.py @@ -0,0 +1,86 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +import itertools + +import numpy as np + +import sys +import six +from six import BytesIO, StringIO, string_types as py_string + + +PY26 = sys.version_info[:2] == (2, 6) +PY2 = sys.version_info[0] == 2 + + +if PY26: + import unittest2 as unittest +else: + import unittest + + +if PY2: + import cPickle + + try: + from cdecimal import Decimal + except ImportError: + from decimal import Decimal + + unicode_type = unicode + lzip = zip + zip = itertools.izip + + def dict_values(x): + return x.values() + + range = xrange + long = long + + def tobytes(o): + if isinstance(o, unicode): + return o.encode('utf8') + else: + return o + + def frombytes(o): + return o +else: + unicode_type = str + def lzip(*x): + return list(zip(*x)) + long = int + zip = zip + def dict_values(x): + return list(x.values()) + from decimal import Decimal + range = range + + def tobytes(o): + if isinstance(o, str): + return o.encode('utf8') + else: + return o + + def frombytes(o): + return o.decode('utf8') + + +integer_types = six.integer_types + (np.integer,) diff --git a/python/arrow/config.pyx b/python/arrow/config.pyx new file mode 100644 index 00000000000..8f10beb3a2e --- /dev/null +++ b/python/arrow/config.pyx @@ -0,0 +1,8 @@ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +cdef extern from 'pyarrow/init.h' namespace 'arrow::py': + void pyarrow_init() + +pyarrow_init() diff --git a/python/arrow/includes/__init__.pxd b/python/arrow/includes/__init__.pxd new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd new file mode 100644 index 00000000000..3635ceb8685 --- /dev/null +++ b/python/arrow/includes/arrow.pxd @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from arrow.includes.common cimport * + +cdef extern from "arrow/api.h" namespace "arrow" nogil: + pass diff --git a/python/arrow/includes/common.pxd b/python/arrow/includes/common.pxd new file mode 100644 index 00000000000..f2fc826625e --- /dev/null +++ b/python/arrow/includes/common.pxd @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from libc.stdint cimport *
+from libcpp cimport bool as c_bool
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
+# This must be included for cerr and other things to work
+cdef extern from "<iostream>":
+    pass
+
+cdef extern from "<memory>" namespace "std" nogil:
+
+    cdef cppclass shared_ptr[T]:
+        T* get()
+        void reset()
+        void reset(T* p)
diff --git a/python/arrow/includes/parquet.pxd b/python/arrow/includes/parquet.pxd
new file mode 100644
index 00000000000..62342f30669
--- /dev/null
+++ b/python/arrow/includes/parquet.pxd
@@ -0,0 +1,51 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from arrow.includes.common cimport *
+
+cdef extern from "parquet/api/reader.h" namespace "parquet_cpp" nogil:
+    cdef cppclass ColumnReader:
+        pass
+
+    cdef cppclass BoolReader(ColumnReader):
+        pass
+
+    cdef cppclass Int32Reader(ColumnReader):
+        pass
+
+    cdef cppclass Int64Reader(ColumnReader):
+        pass
+
+    cdef cppclass Int96Reader(ColumnReader):
+        pass
+
+    cdef cppclass FloatReader(ColumnReader):
+        pass
+
+    cdef cppclass DoubleReader(ColumnReader):
+        pass
+
+    cdef cppclass ByteArrayReader(ColumnReader):
+        pass
+
+    cdef cppclass RowGroupReader:
+        pass
+
+    cdef cppclass ParquetFileReader:
+        pass
diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd
new file mode 100644
index 00000000000..dcef663f389
--- /dev/null
+++ b/python/arrow/includes/pyarrow.pxd
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from arrow.includes.common cimport *
+
+cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
+    pass
diff --git a/python/arrow/parquet.pyx b/python/arrow/parquet.pyx
new file mode 100644
index 00000000000..23c3838bcad
--- /dev/null
+++ b/python/arrow/parquet.pyx
@@ -0,0 +1,23 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from arrow.compat import frombytes, tobytes
+from arrow.includes.parquet cimport *
diff --git a/python/arrow/tests/__init__.py b/python/arrow/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cmake_modules/CompilerInfo.cmake b/python/cmake_modules/CompilerInfo.cmake
new file mode 100644
index 00000000000..e66bc2693ee
--- /dev/null
+++ b/python/cmake_modules/CompilerInfo.cmake
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Sets COMPILER_FAMILY to 'clang' or 'gcc'
+# Sets COMPILER_VERSION to the version
execute_process(COMMAND "${CMAKE_CXX_COMPILER}" -v
+                ERROR_VARIABLE COMPILER_VERSION_FULL)
+message(INFO " ${COMPILER_VERSION_FULL}")
+
+# clang on Linux and Mac OS X before 10.9
+if("${COMPILER_VERSION_FULL}" MATCHES ".*clang version.*")
+  set(COMPILER_FAMILY "clang")
+  string(REGEX REPLACE ".*clang version ([0-9]+\\.[0-9]+).*" "\\1"
+    COMPILER_VERSION "${COMPILER_VERSION_FULL}")
+# clang on Mac OS X 10.9 and later
+elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*")
+  set(COMPILER_FAMILY "clang")
+  string(REGEX REPLACE ".*based on LLVM ([0-9]+\\.[0-9]+).*" "\\1"
+    COMPILER_VERSION "${COMPILER_VERSION_FULL}")
+
+# clang on Mac OS X, XCode 7. No version replacement is done
+# because Apple no longer advertises the upstream LLVM version.
+elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-700\\..*") + set(COMPILER_FAMILY "clang") + +# gcc +elseif("${COMPILER_VERSION_FULL}" MATCHES ".*gcc version.*") + set(COMPILER_FAMILY "gcc") + string(REGEX REPLACE ".*gcc version ([0-9\\.]+).*" "\\1" + COMPILER_VERSION "${COMPILER_VERSION_FULL}") +else() + message(FATAL_ERROR "Unknown compiler. Version info:\n${COMPILER_VERSION_FULL}") +endif() +message("Selected compiler ${COMPILER_FAMILY} ${COMPILER_VERSION}") diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake new file mode 100644 index 00000000000..3d9983849eb --- /dev/null +++ b/python/cmake_modules/FindArrow.cmake @@ -0,0 +1,77 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find ARROW (arrow/api.h, libarrow.a, libarrow.so) +# This module defines +# ARROW_INCLUDE_DIR, directory containing headers +# ARROW_LIBS, directory containing arrow libraries +# ARROW_STATIC_LIB, path to libarrow.a +# ARROW_SHARED_LIB, path to libarrow's shared library +# ARROW_FOUND, whether arrow has been found + +set(ARROW_SEARCH_HEADER_PATHS + $ENV{ARROW_HOME}/include +) + +set(ARROW_SEARCH_LIB_PATH + $ENV{ARROW_HOME}/lib +) + +find_path(ARROW_INCLUDE_DIR arrow/array.h PATHS + ${ARROW_SEARCH_HEADER_PATHS} + # make sure we don't accidentally pick up a different version + NO_DEFAULT_PATH +) + +find_library(ARROW_LIB_PATH NAMES arrow + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) + set(ARROW_FOUND TRUE) + set(ARROW_LIB_NAME libarrow) + set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) + set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) + set(ARROW_SHARED_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(ARROW_FOUND FALSE) +endif () + +if (ARROW_FOUND) + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}") + endif () +else () + if (NOT Arrow_FIND_QUIETLY) + set(ARROW_ERR_MSG "Could not find the Arrow library. Looked for headers") + set(ARROW_ERR_MSG "${ARROW_ERR_MSG} in ${ARROW_SEARCH_HEADER_PATHS}, and for libs") + set(ARROW_ERR_MSG "${ARROW_ERR_MSG} in ${ARROW_SEARCH_LIB_PATH}") + if (Arrow_FIND_REQUIRED) + message(FATAL_ERROR "${ARROW_ERR_MSG}") + else (Arrow_FIND_REQUIRED) + message(STATUS "${ARROW_ERR_MSG}") + endif (Arrow_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + ARROW_INCLUDE_DIR + ARROW_LIBS + ARROW_STATIC_LIB + ARROW_SHARED_LIB +) diff --git a/python/cmake_modules/FindCython.cmake b/python/cmake_modules/FindCython.cmake new file mode 100644 index 00000000000..9df3b5d59d2 --- /dev/null +++ b/python/cmake_modules/FindCython.cmake @@ -0,0 +1,30 @@ +# Find the Cython compiler. 
+# +# This code sets the following variables: +# +# CYTHON_EXECUTABLE +# +# See also UseCython.cmake + +#============================================================================= +# Copyright 2011 Kitware, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#============================================================================= + +find_program( CYTHON_EXECUTABLE NAMES cython cython.bat ) + +include( FindPackageHandleStandardArgs ) +FIND_PACKAGE_HANDLE_STANDARD_ARGS( Cython REQUIRED_VARS CYTHON_EXECUTABLE ) + +mark_as_advanced( CYTHON_EXECUTABLE ) diff --git a/python/cmake_modules/FindNumPy.cmake b/python/cmake_modules/FindNumPy.cmake new file mode 100644 index 00000000000..58bb531f532 --- /dev/null +++ b/python/cmake_modules/FindNumPy.cmake @@ -0,0 +1,100 @@ +# - Find the NumPy libraries +# This module finds if NumPy is installed, and sets the following variables +# indicating where it is. +# +# TODO: Update to provide the libraries and paths for linking npymath lib. +# +# NUMPY_FOUND - was NumPy found +# NUMPY_VERSION - the version of NumPy found as a string +# NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy +# NUMPY_VERSION_PATCH - the patch version number of NumPy +# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 +# NUMPY_INCLUDE_DIRS - path to the NumPy include files + +#============================================================================ +# Copyright 2012 Continuum Analytics, Inc. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. 
+#
+#============================================================================
+
+# Finding NumPy involves calling the Python interpreter
+if(NumPy_FIND_REQUIRED)
+  find_package(PythonInterp REQUIRED)
+else()
+  find_package(PythonInterp)
+endif()
+
+if(NOT PYTHONINTERP_FOUND)
+  set(NUMPY_FOUND FALSE)
+  return()
+endif()
+
+execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+  "import numpy as n; print(n.__version__); print(n.get_include());"
+  RESULT_VARIABLE _NUMPY_SEARCH_SUCCESS
+  OUTPUT_VARIABLE _NUMPY_VALUES_OUTPUT
+  ERROR_VARIABLE _NUMPY_ERROR_VALUE
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+if(NOT _NUMPY_SEARCH_SUCCESS MATCHES 0)
+  if(NumPy_FIND_REQUIRED)
+    message(FATAL_ERROR
+      "NumPy import failure:\n${_NUMPY_ERROR_VALUE}")
+  endif()
+  set(NUMPY_FOUND FALSE)
+  return()
+endif()
+
+# Convert the process output into a list
+string(REGEX REPLACE ";" "\\\\;" _NUMPY_VALUES ${_NUMPY_VALUES_OUTPUT})
+string(REGEX REPLACE "\n" ";" _NUMPY_VALUES ${_NUMPY_VALUES})
+list(GET _NUMPY_VALUES 0 NUMPY_VERSION)
+list(GET _NUMPY_VALUES 1 NUMPY_INCLUDE_DIRS)
+
+string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" _VER_CHECK "${NUMPY_VERSION}")
+if("${_VER_CHECK}" STREQUAL "")
+  # The output from Python was unexpected. Raise an error always
+  # here, because we found NumPy, but it appears to be corrupted somehow.
+  message(FATAL_ERROR
+    "Requested version and include path from NumPy, got instead:\n${_NUMPY_VALUES_OUTPUT}\n")
+  return()
+endif()
+
+# Make sure all directory separators are '/'
+string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIRS ${NUMPY_INCLUDE_DIRS})
+
+# Get the major and minor version numbers
+string(REGEX REPLACE "\\." ";" _NUMPY_VERSION_LIST ${NUMPY_VERSION})
+list(GET _NUMPY_VERSION_LIST 0 NUMPY_VERSION_MAJOR)
+list(GET _NUMPY_VERSION_LIST 1 NUMPY_VERSION_MINOR)
+list(GET _NUMPY_VERSION_LIST 2 NUMPY_VERSION_PATCH)
+string(REGEX MATCH "[0-9]*" NUMPY_VERSION_PATCH ${NUMPY_VERSION_PATCH})
+math(EXPR NUMPY_VERSION_DECIMAL
+  "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}")
+
+find_package_message(NUMPY
+  "Found NumPy: version \"${NUMPY_VERSION}\" ${NUMPY_INCLUDE_DIRS}"
+  "${NUMPY_INCLUDE_DIRS}${NUMPY_VERSION}")
+
+set(NUMPY_FOUND TRUE)
diff --git a/python/cmake_modules/FindPythonLibsNew.cmake b/python/cmake_modules/FindPythonLibsNew.cmake
new file mode 100644
index 00000000000..c70e6bc26a7
--- /dev/null
+++ b/python/cmake_modules/FindPythonLibsNew.cmake
@@ -0,0 +1,236 @@
+# - Find python libraries
+# This module finds the libraries corresponding to the Python interpreter
+# FindPythonInterp provides.
+# This code sets the following variables:
+#
+#  PYTHONLIBS_FOUND       - have the Python libs been found
+#  PYTHON_PREFIX          - path to the Python installation
+#  PYTHON_LIBRARIES       - path to the python library
+#  PYTHON_INCLUDE_DIRS    - path to where Python.h is found
+#  PYTHON_SITE_PACKAGES   - path to installation site-packages
+#  PYTHON_IS_DEBUG        - whether the Python interpreter is a debug build
+#
+#  PYTHON_INCLUDE_PATH    - path to where Python.h is found (deprecated)
+#
+# A function PYTHON_ADD_MODULE(<name> src1 src2 ... srcN) is defined
+# to build modules for python.
+#
+# Thanks to talljimbo for the patch adding the 'LDVERSION' config
+# variable usage.
+
+#=============================================================================
+# Copyright 2001-2009 Kitware, Inc.
+# Copyright 2012-2014 Continuum Analytics, Inc.
+#
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+#  * Neither the names of Kitware, Inc., the Insight Software Consortium,
+#    nor the names of their contributors may be used to endorse or promote
+#    products derived from this software without specific prior written
+#    permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+# Use the Python interpreter to find the libs.
+if(PythonLibsNew_FIND_REQUIRED)
+    find_package(PythonInterp REQUIRED)
+else()
+    find_package(PythonInterp)
+endif()
+
+if(NOT PYTHONINTERP_FOUND)
+    set(PYTHONLIBS_FOUND FALSE)
+    return()
+endif()
+
+# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter
+# testing whether sys has the gettotalrefcount function is a reliable,
+# cross-platform way to detect a CPython debug interpreter.
+#
+# The library suffix is from the config var LDVERSION sometimes, otherwise
+# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows.
+#
+# The config var LIBPL is for Linux, and helps on Debian Jessie where the
+# addition of multi-arch support shuffled things around.
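The probe that follows runs a short script through execute_process to pull these values out of the interpreter itself. Run standalone, the two key checks look like this (a trimmed sketch of the full script below):

```python
# Sketch of the interpreter probe below: a debug CPython defines
# sys.gettotalrefcount, and LDVERSION (with VERSION as fallback) gives the
# suffix used to locate the matching libpython.
import sys
from distutils import sysconfig as s

print(hasattr(sys, 'gettotalrefcount'))  # True only on a debug interpreter
print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION'))
```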
+execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "from distutils import sysconfig as s;import sys;import struct; +print('.'.join(str(v) for v in sys.version_info)); +print(sys.prefix); +print(s.get_python_inc(plat_specific=True)); +print(s.get_python_lib(plat_specific=True)); +print(s.get_config_var('SO')); +print(hasattr(sys, 'gettotalrefcount')+0); +print(struct.calcsize('@P')); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +print(s.get_config_var('LIBPL')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(NOT _PYTHON_SUCCESS MATCHES 0) + if(PythonLibsNew_FIND_REQUIRED) + message(FATAL_ERROR + "Python config failure:\n${_PYTHON_ERROR_VALUE}") + endif() + set(PYTHONLIBS_FOUND FALSE) + return() +endif() + +# Convert the process output into a list +string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) +string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) +list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST) +list(GET _PYTHON_VALUES 1 PYTHON_PREFIX) +list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR) +list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES) +list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION) +list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG) +list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P) +list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX) +list(GET _PYTHON_VALUES 8 PYTHON_LIBRARY_PATH) + +# Make sure the Python has the same pointer-size as the chosen compiler +# Skip the check on OS X, it doesn't consistently have CMAKE_SIZEOF_VOID_P defined +if((NOT APPLE) AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}")) + if(PythonLibsNew_FIND_REQUIRED) + math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8") + math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8") + message(FATAL_ERROR + "Python config failure: Python is ${_PYTHON_BITS}-bit, " + "chosen compiler is ${_CMAKE_BITS}-bit") + endif() + set(PYTHONLIBS_FOUND FALSE) + return() +endif() + +# The built-in FindPython didn't always give the version numbers +string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST}) +list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR) +list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR) +list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH) + +# Make sure all directory separators are '/' +string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) +string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR}) +string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES}) + +if(CMAKE_HOST_WIN32) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + else() + set(PYTHON_LIBRARY "${PYTHON_PREFIX}/libs/libpython${PYTHON_LIBRARY_SUFFIX}.a") + endif() +elseif(APPLE) + # Seems to require "-undefined dynamic_lookup" instead of linking + # against the .dylib, otherwise it crashes. This flag is added + # below + set(PYTHON_LIBRARY "") + #set(PYTHON_LIBRARY + # "${PYTHON_PREFIX}/lib/libpython${PYTHON_LIBRARY_SUFFIX}.dylib") +else() + if(${PYTHON_SIZEOF_VOID_P} MATCHES 8) + set(_PYTHON_LIBS_SEARCH "${PYTHON_PREFIX}/lib64" "${PYTHON_PREFIX}/lib" "${PYTHON_LIBRARY_PATH}") + else() + set(_PYTHON_LIBS_SEARCH "${PYTHON_PREFIX}/lib" "${PYTHON_LIBRARY_PATH}") + endif() + message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}") + # Probably this needs to be more involved. 
It would be nice if the config
+  # information the python interpreter itself gave us were more complete.
+  find_library(PYTHON_LIBRARY
+    NAMES "python${PYTHON_LIBRARY_SUFFIX}"
+    PATHS ${_PYTHON_LIBS_SEARCH}
+    NO_DEFAULT_PATH)
+  message(STATUS "Found Python lib ${PYTHON_LIBRARY}")
+endif()
+
+# For backward compatibility, set PYTHON_INCLUDE_PATH, but make it internal.
+SET(PYTHON_INCLUDE_PATH "${PYTHON_INCLUDE_DIR}" CACHE INTERNAL
+  "Path to where Python.h is found (deprecated)")
+
+MARK_AS_ADVANCED(
+  PYTHON_LIBRARY
+  PYTHON_INCLUDE_DIR
+)
+
+# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the
+# cache entries because they are meant to specify the location of a single
+# library. We now set the variables listed by the documentation for this
+# module.
+SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}")
+SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}")
+SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}")
+
+
+# Don't know how to get to this directory, just doing something simple :P
+#INCLUDE(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake)
+#FIND_PACKAGE_HANDLE_STANDARD_ARGS(PythonLibs DEFAULT_MSG PYTHON_LIBRARIES PYTHON_INCLUDE_DIRS)
+find_package_message(PYTHON
+  "Found PythonLibs: ${PYTHON_LIBRARY}"
+  "${PYTHON_EXECUTABLE}${PYTHON_VERSION}")
+
+
+# PYTHON_ADD_MODULE(<name> src1 src2 ... srcN) is used to build modules for python.
+FUNCTION(PYTHON_ADD_MODULE _NAME )
+  GET_PROPERTY(_TARGET_SUPPORTS_SHARED_LIBS
+    GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS)
+  OPTION(PYTHON_ENABLE_MODULE_${_NAME} "Add module ${_NAME}" TRUE)
+  OPTION(PYTHON_MODULE_${_NAME}_BUILD_SHARED
+    "Add module ${_NAME} shared" ${_TARGET_SUPPORTS_SHARED_LIBS})
+
+  # Mark these options as advanced
+  MARK_AS_ADVANCED(PYTHON_ENABLE_MODULE_${_NAME}
+    PYTHON_MODULE_${_NAME}_BUILD_SHARED)
+
+  IF(PYTHON_ENABLE_MODULE_${_NAME})
+    IF(PYTHON_MODULE_${_NAME}_BUILD_SHARED)
+      SET(PY_MODULE_TYPE MODULE)
+    ELSE(PYTHON_MODULE_${_NAME}_BUILD_SHARED)
+      SET(PY_MODULE_TYPE STATIC)
+      SET_PROPERTY(GLOBAL APPEND PROPERTY PY_STATIC_MODULES_LIST ${_NAME})
+    ENDIF(PYTHON_MODULE_${_NAME}_BUILD_SHARED)
+
+    SET_PROPERTY(GLOBAL APPEND PROPERTY PY_MODULES_LIST ${_NAME})
+    ADD_LIBRARY(${_NAME} ${PY_MODULE_TYPE} ${ARGN})
+    IF(APPLE)
+      # On OS X, linking against the Python libraries causes
+      # segfaults, so do this dynamic lookup instead.
+      SET_TARGET_PROPERTIES(${_NAME} PROPERTIES LINK_FLAGS
+        "-undefined dynamic_lookup")
+    ELSE()
+      TARGET_LINK_LIBRARIES(${_NAME} ${PYTHON_LIBRARIES})
+    ENDIF()
+    IF(PYTHON_MODULE_${_NAME}_BUILD_SHARED)
+      SET_TARGET_PROPERTIES(${_NAME} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}")
+      SET_TARGET_PROPERTIES(${_NAME} PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}")
+    ELSE()
+    ENDIF()
+
+  ENDIF(PYTHON_ENABLE_MODULE_${_NAME})
+ENDFUNCTION(PYTHON_ADD_MODULE)
\ No newline at end of file
diff --git a/python/cmake_modules/UseCython.cmake b/python/cmake_modules/UseCython.cmake
new file mode 100644
index 00000000000..e7034db52f3
--- /dev/null
+++ b/python/cmake_modules/UseCython.cmake
@@ -0,0 +1,164 @@
+# Define a function to create Cython modules.
+#
+# For more information on the Cython project, see http://cython.org/.
+# "Cython is a language that makes writing C extensions for the Python language
+# as easy as Python itself."
+#
+# This file defines a CMake function to build a Cython Python module.
+# To use it, first include this file.
+#
+#   include( UseCython )
+#
+# Then call cython_add_module to create a module.
+#
+#   cython_add_module( <module_name> <pyx_target_name> <generated_files> src1 src2 ... srcN )
+#
+# Where <module_name> is the desired name of the target for the resulting Python module,
+# <pyx_target_name> is the desired name of the target that runs the Cython compiler
+# to generate the needed C or C++ files, <generated_files> is a variable to hold the
+# files generated by Cython, and src1 src2 ... srcN are source files
+# to be compiled into the module, e.g. *.pyx, *.c, *.cxx, etc.
+# Only one .pyx file may be present for each target
+# (this is an inherent limitation of Cython).
+#
+# The same paths set with the CMake include_directories() command will be used
+# as include directories to search for *.pxd when running the Cython compiler.
+#
+# Cache variables that affect the behavior include:
+#
+#  CYTHON_ANNOTATE
+#  CYTHON_NO_DOCSTRINGS
+#  CYTHON_FLAGS
+#
+# Source file properties that affect the build process are
+#
+#  CYTHON_IS_CXX
+#  CYTHON_PUBLIC
+#  CYTHON_API
+#
+# If CYTHON_IS_CXX is set on a *.pyx file with the CMake
+# set_source_files_properties() command, the file will be compiled as a C++ file.
+#
+# See also FindCython.cmake
+
+#=============================================================================
+# Copyright 2011 Kitware, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================

+# Configuration options.
+set( CYTHON_ANNOTATE OFF
+  CACHE BOOL "Create an annotated .html file when compiling *.pyx." )
+set( CYTHON_NO_DOCSTRINGS OFF
+  CACHE BOOL "Strip docstrings from the compiled module." )
+set( CYTHON_FLAGS "" CACHE STRING
+  "Extra flags to the cython compiler." )
+mark_as_advanced( CYTHON_ANNOTATE CYTHON_NO_DOCSTRINGS CYTHON_FLAGS )
+
+find_package( Cython REQUIRED )
+find_package( PythonLibsNew REQUIRED )
+
+set( CYTHON_CXX_EXTENSION "cxx" )
+set( CYTHON_C_EXTENSION "c" )
+
+# Create a *.c or *.cxx file from a *.pyx file.
+# Takes the generated file basename as input. The generated files will be put
+# into the variable named by the "generated_files" argument, followed by the
+# *.py and *.pyx source files.
+function( compile_pyx _name pyx_target_name generated_files pyx_file)
+  # Default to assuming all files are C.
+  set( cxx_arg "" )
+  set( extension ${CYTHON_C_EXTENSION} )
+  set( pyx_lang "C" )
+  set( comment "Compiling Cython C source for ${_name}..." )
+
+  get_filename_component( pyx_file_basename "${pyx_file}" NAME_WE )
+
+  # Determine if it is a C or C++ file.
+  get_source_file_property( property_is_cxx ${pyx_file} CYTHON_IS_CXX )
+  if( ${property_is_cxx} )
+    set( cxx_arg "--cplus" )
+    set( extension ${CYTHON_CXX_EXTENSION} )
+    set( pyx_lang "CXX" )
+    set( comment "Compiling Cython CXX source for ${_name}..." )
+  endif()
+  get_source_file_property( pyx_location ${pyx_file} LOCATION )
+
+  # Set additional flags.
+  if( CYTHON_ANNOTATE )
+    set( annotate_arg "--annotate" )
+  endif()
+
+  if( CYTHON_NO_DOCSTRINGS )
+    set( no_docstrings_arg "--no-docstrings" )
+  endif()
+
+  if(NOT WIN32)
+    if( "${CMAKE_BUILD_TYPE}" STREQUAL "Debug" OR
+        "${CMAKE_BUILD_TYPE}" STREQUAL "RelWithDebInfo" )
+      set( cython_debug_arg "--gdb" )
+    endif()
+  endif()
+
+  # Determining generated file names.
+  get_source_file_property( property_is_public ${pyx_file} CYTHON_PUBLIC )
+  get_source_file_property( property_is_api ${pyx_file} CYTHON_API )
+  if( ${property_is_api} )
+    set( _generated_files "${_name}.${extension}" "${_name}.h" "${_name}_api.h")
+  elseif( ${property_is_public} )
+    set( _generated_files "${_name}.${extension}" "${_name}.h")
+  else()
+    set( _generated_files "${_name}.${extension}")
+  endif()
+  set_source_files_properties( ${_generated_files} PROPERTIES GENERATED TRUE )
+  set( ${generated_files} ${_generated_files} PARENT_SCOPE )
+
+  # Add the command to run the compiler.
+  add_custom_target(${pyx_target_name}
+    COMMAND ${CYTHON_EXECUTABLE} ${cxx_arg} ${include_directory_arg}
+    ${annotate_arg} ${no_docstrings_arg} ${cython_debug_arg} ${CYTHON_FLAGS}
+    --output-file "${_name}.${extension}" ${pyx_location}
+    DEPENDS ${pyx_location}
+    # do not specify byproducts for now since they don't work with the older
+    # version of cmake available in the apt repositories.
+    #BYPRODUCTS ${_generated_files}
+    COMMENT ${comment}
+  )
+
+  # Remove their visibility to the user.
+  set( corresponding_pxd_file "" CACHE INTERNAL "" )
+  set( header_location "" CACHE INTERNAL "" )
+  set( pxd_location "" CACHE INTERNAL "" )
+endfunction()
+
+# cython_add_module( <name> <pyx_target_name> <generated_files> src1 src2 ... srcN )
+# Build the Cython Python module.
+function( cython_add_module _name pyx_target_name generated_files)
+  set( pyx_module_source "" )
+  set( other_module_sources "" )
+  foreach( _file ${ARGN} )
+    if( ${_file} MATCHES ".*\\.py[x]?$" )
+      list( APPEND pyx_module_source ${_file} )
+    else()
+      list( APPEND other_module_sources ${_file} )
+    endif()
+  endforeach()
+  compile_pyx( ${_name} ${pyx_target_name} _generated_files ${pyx_module_source} )
+  set( ${generated_files} ${_generated_files} PARENT_SCOPE )
+  include_directories( ${PYTHON_INCLUDE_DIRS} )
+  python_add_module( ${_name} ${_generated_files} ${other_module_sources} )
+  add_dependencies( ${_name} ${pyx_target_name})
+  target_link_libraries( ${_name} ${PYTHON_LIBRARIES} )
+endfunction()
+
+include( CMakeParseArguments )
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 00000000000..f6b0a4bee83
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
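setup.py, which follows, swaps the stock distutils compile step for a CMake-driven build_ext command. A usage sketch (the --extra-cmake-args option is declared by the build_ext class below; the Debug value here is illustrative):

```python
# Sketch: invoke the CMake-backed build_ext defined in this setup.py; the
# extra-cmake-args value is forwarded to the cmake invocation.
import subprocess
import sys

subprocess.check_call([sys.executable, 'setup.py', 'build_ext', '--inplace',
                       '--extra-cmake-args=-DCMAKE_BUILD_TYPE=Debug'])
```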
+ +import glob +import os.path as osp +import re +import shutil +from Cython.Distutils import build_ext as _build_ext +import Cython + +import sys + +import pkg_resources +from setuptools import setup + +import os + +from os.path import join as pjoin + +from distutils.command.clean import clean as _clean +from distutils import sysconfig + +# Check if we're running 64-bit Python +is_64_bit = sys.maxsize > 2**32 + +# Check if this is a debug build of Python. +if hasattr(sys, 'gettotalrefcount'): + build_type = 'Debug' +else: + build_type = 'Release' + +if Cython.__version__ < '0.19.1': + raise Exception('Please upgrade to Cython 0.19.1 or newer') + +MAJOR = 0 +MINOR = 1 +MICRO = 0 +VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) + + +class clean(_clean): + + def run(self): + _clean.run(self) + for x in []: + try: + os.remove(x) + except OSError: + pass + + +class build_ext(_build_ext): + + def build_extensions(self): + numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + + for ext in self.extensions: + if (hasattr(ext, 'include_dirs') and + numpy_incl not in ext.include_dirs): + ext.include_dirs.append(numpy_incl) + _build_ext.build_extensions(self) + + def run(self): + self._run_cmake() + _build_ext.run(self) + + # adapted from cmake_build_ext in dynd-python + # github.com/libdynd/dynd-python + + description = "Build the C-extensions for arrow" + user_options = ([('extra-cmake-args=', None, + 'extra arguments for CMake')] + + _build_ext.user_options) + + def initialize_options(self): + _build_ext.initialize_options(self) + self.extra_cmake_args = '' + + def _run_cmake(self): + # The directory containing this setup.py + source = osp.dirname(osp.abspath(__file__)) + + # The staging directory for the module being built + build_temp = pjoin(os.getcwd(), self.build_temp) + + # Change to the build directory + saved_cwd = os.getcwd() + if not os.path.isdir(self.build_temp): + self.mkpath(self.build_temp) + os.chdir(self.build_temp) + + # Detect if we built elsewhere + if os.path.isfile('CMakeCache.txt'): + cachefile = open('CMakeCache.txt', 'r') + cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', + cachefile.read()).group(1) + cachefile.close() + if (cachedir != build_temp): + return + + pyexe_option = '-DPYTHON_EXECUTABLE=%s' % sys.executable + static_lib_option = '' + build_tests_option = '' + + if sys.platform != 'win32': + cmake_command = ['cmake', self.extra_cmake_args, pyexe_option, + build_tests_option, + static_lib_option, source] + + self.spawn(cmake_command) + self.spawn(['make']) + else: + import shlex + cmake_generator = 'Visual Studio 14 2015' + if is_64_bit: + cmake_generator += ' Win64' + # Generate the build files + extra_cmake_args = shlex.split(self.extra_cmake_args) + cmake_command = (['cmake'] + extra_cmake_args + + [source, pyexe_option, + static_lib_option, + build_tests_option, + '-G', cmake_generator]) + if "-G" in self.extra_cmake_args: + cmake_command = cmake_command[:-2] + + self.spawn(cmake_command) + # Do the build + self.spawn(['cmake', '--build', '.', '--config', build_type]) + + if self.inplace: + # a bit hacky + build_lib = saved_cwd + else: + build_lib = pjoin(os.getcwd(), self.build_lib) + + # Move the built libpyarrow library to the place expected by the Python + # build + if sys.platform != 'win32': + name, = glob.glob('libpyarrow.*') + try: + os.makedirs(pjoin(build_lib, 'arrow')) + except OSError: + pass + shutil.move(name, pjoin(build_lib, 'arrow', name)) + else: + shutil.move(pjoin(build_type, 'pyarrow.dll'), + pjoin(build_lib, 
'arrow', 'pyarrow.dll')) + + # Move the built C-extension to the place expected by the Python build + self._found_names = [] + for name in self.get_cmake_cython_names(): + built_path = self.get_ext_built(name) + if not os.path.exists(built_path): + print(built_path) + raise RuntimeError('libpyarrow C-extension failed to build:', + os.path.abspath(built_path)) + + ext_path = pjoin(build_lib, self._get_cmake_ext_path(name)) + if os.path.exists(ext_path): + os.remove(ext_path) + self.mkpath(os.path.dirname(ext_path)) + print('Moving built libpyarrow C-extension', built_path, + 'to build path', ext_path) + shutil.move(self.get_ext_built(name), ext_path) + self._found_names.append(name) + + os.chdir(saved_cwd) + + def _get_inplace_dir(self): + pass + + def _get_cmake_ext_path(self, name): + # Get the package directory from build_py + build_py = self.get_finalized_command('build_py') + package_dir = build_py.get_package_dir('arrow') + # This is the name of the arrow C-extension + suffix = sysconfig.get_config_var('EXT_SUFFIX') + if suffix is None: + suffix = sysconfig.get_config_var('SO') + filename = name + suffix + return pjoin(package_dir, filename) + + def get_ext_built(self, name): + if sys.platform == 'win32': + head, tail = os.path.split(name) + suffix = sysconfig.get_config_var('SO') + return pjoin(head, build_type, tail + suffix) + else: + suffix = sysconfig.get_config_var('SO') + return name + suffix + + def get_cmake_cython_names(self): + return ['config', 'parquet'] + + def get_names(self): + return self._found_names + + def get_outputs(self): + # Just the C extensions + cmake_exts = [self._get_cmake_ext_path(name) + for name in self.get_names()] + regular_exts = _build_ext.get_outputs(self) + return regular_exts + cmake_exts + + +extensions = [] + +DESC = """\ +Python library for Apache Arrow""" + +setup( + name="arrow", + packages=['arrow', 'arrow.tests'], + version=VERSION, + package_data={'arrow': ['*.pxd', '*.pyx']}, + ext_modules=extensions, + cmdclass={ + 'clean': clean, + 'build_ext': build_ext + }, + install_requires=['cython >= 0.21'], + description=DESC, + license='Apache License, Version 2.0', + maintainer="Apache Arrow Developers", + maintainer_email="dev@arrow.apache.org", + test_suite="arrow.tests" +) diff --git a/python/src/pyarrow/CMakeLists.txt b/python/src/pyarrow/CMakeLists.txt new file mode 100644 index 00000000000..e20c3238b5f --- /dev/null +++ b/python/src/pyarrow/CMakeLists.txt @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +####################################### +# Unit tests +####################################### diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h new file mode 100644 index 00000000000..c2285de77bf --- /dev/null +++ b/python/src/pyarrow/api.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_API_H +#define PYARROW_API_H + +#endif // PYARROW_API_H diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc new file mode 100644 index 00000000000..c36f4137255 --- /dev/null +++ b/python/src/pyarrow/init.cc @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/init.h" + +namespace arrow { + +namespace py { + +void pyarrow_init() { +} + +} // namespace py + +} // namespace arrow diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h new file mode 100644 index 00000000000..1fc9f101026 --- /dev/null +++ b/python/src/pyarrow/init.h @@ -0,0 +1,31 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef PYARROW_INIT_H +#define PYARROW_INIT_H + +namespace arrow { + +namespace py { + +void pyarrow_init(); + +} // namespace py + +} // namespace arrow + +#endif // PYARROW_INIT_H diff --git a/python/src/pyarrow/util/CMakeLists.txt b/python/src/pyarrow/util/CMakeLists.txt new file mode 100644 index 00000000000..60dc80eb38c --- /dev/null +++ b/python/src/pyarrow/util/CMakeLists.txt @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +####################################### +# pyarrow_util +####################################### + +set(UTIL_SRCS +) + +set(UTIL_LIBS +) + +add_library(pyarrow_util STATIC + ${UTIL_SRCS} +) +target_link_libraries(pyarrow_util ${UTIL_LIBS}) +SET_TARGET_PROPERTIES(pyarrow_util PROPERTIES LINKER_LANGUAGE CXX) + +####################################### +# pyarrow_test_main +####################################### + +add_library(pyarrow_test_main + test_main.cc) + +if (APPLE) + target_link_libraries(pyarrow_test_main + gmock + dl) + set_target_properties(pyarrow_test_main + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") +else() + target_link_libraries(pyarrow_test_main + gtest + pthread + dl + ) +endif() diff --git a/python/src/pyarrow/util/test_main.cc b/python/src/pyarrow/util/test_main.cc new file mode 100644 index 00000000000..00139f36742 --- /dev/null +++ b/python/src/pyarrow/util/test_main.cc @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include <gtest/gtest.h>
+
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+
+  int ret = RUN_ALL_TESTS();
+
+  return ret;
+}

From 8caa287263425c5b6c64c0e25fb8aa945e2f78d4 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 7 Mar 2016 14:47:36 -0800
Subject: [PATCH 021/210] ARROW-35: Add a short call-to-action in the top
 level README.md

Author: Wes McKinney

Closes #13 from wesm/ARROW-35 and squashes the following commits:

e10bfc3 [Wes McKinney] Add a proper mailto link
c4428fe [Wes McKinney] Add a short 'how to get involved' blurb in top-level README
---
 README.md | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/README.md b/README.md
index d948a996bc0..84bae78cc7f 100644
--- a/README.md
+++ b/README.md
@@ -22,3 +22,24 @@ Initial implementations include:
 - [Arrow Structures and APIs in Java](https://github.com/apache/arrow/tree/master/java)
 
 Arrow is an [Apache Software Foundation](www.apache.org) project. More info can be found at [arrow.apache.org](http://arrow.apache.org).
+
+#### Getting involved
+
+Right now the primary audience for Apache Arrow consists of the designers and
+developers of data systems; most people will use Apache Arrow indirectly,
+through systems that use it for internal data handling and for interoperating
+with other Arrow-enabled systems.
+
+Even if you do not plan to contribute to Apache Arrow itself or to Arrow
+integrations in other projects, we'd be happy to have you involved:
+
+- Join the mailing list: send an email to
+  [dev-subscribe@arrow.apache.org][1]. Share your ideas and use cases for the
+  project.
+- [Follow our activity on JIRA][3]
+- [Learn the format][2]
+- Contribute code to one of the reference implementations
+
+[1]: mailto:dev-subscribe@arrow.apache.org
+[2]: https://github.com/apache/arrow/tree/master/format
+[3]: https://issues.apache.org/jira/browse/ARROW
\ No newline at end of file

From 571343bbe36f99a11ed82e475b976bbe79dfb755 Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Mon, 7 Mar 2016 14:49:27 -0800
Subject: [PATCH 022/210] ARROW-9: Rename some unchanged "Drill" to "Arrow"
 (follow-up)

https://issues.apache.org/jira/browse/ARROW-9

There is one remaining unchanged "Drill" reference in `ValueVector`; this
change renames it to "Arrow" and fixes some minor typos.

Author: hyukjinkwon
Author: Hyukjin Kwon

Closes #18 from HyukjinKwon/ARROW-9 and squashes the following commits:

54a5d9f [Hyukjin Kwon] Update typo
628f35d [hyukjinkwon] Replace straggler references to Drill (follow-up)
---
 .../main/java/org/apache/arrow/vector/ValueVector.java | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
index c05f0e7c50f..a170c59abd7 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java
@@ -63,7 +63,7 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
 
   /**
    * Allocates new buffers. ValueVector implements logic to determine how much to allocate.
-   * @return Returns true if allocation was succesful.
+   * @return Returns true if allocation was successful.
    */
   boolean allocateNewSafe();
 
@@ -71,7 +71,7 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
 
   /**
    * Set the initial record capacity
-   * @param numRecords
+   * @param numRecords the initial record capacity.
    */
   void setInitialCapacity(int numRecords);
 
@@ -87,7 +87,7 @@ public interface ValueVector extends Closeable, Iterable<ValueVector> {
   void close();
 
   /**
-   * Release the underlying DrillBuf and reset the ValueVector to empty.
+   * Release the underlying ArrowBuf and reset the ValueVector to empty.
    */
   void clear();
 
@@ -198,7 +198,7 @@ interface Accessor {
   }
 
   /**
-   * An abstractiong that is used to write into this vector instance.
+   * An abstraction that is used to write into this vector instance.
    */
   interface Mutator {
     /**

From 9afb667783b8cedbe6e9d6ee5eb02d35cf1d0f79 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 7 Mar 2016 15:02:56 -0800
Subject: [PATCH 023/210] ARROW-31: Python: prototype user object model, add
 PyList conversion path with type inference

Depends on ARROW-7. Pretty mundane stuff, but you have to start somewhere.
I'm going to do a little more in this patch (handle normal lists of strings
and lists of other supported Python types) before merging.

Author: Wes McKinney

Closes #19 from wesm/ARROW-31 and squashes the following commits:

2345541 [Wes McKinney] Test basic conversion of nested lists
1d4618b [Wes McKinney] Prototype string and double converters
b02b296 [Wes McKinney] Type inference for lists and lists-of-lists
8c3891c [Wes McKinney] Smoke test that array garbage collection deallocates memory
c28bf09 [Wes McKinney] Build array successfully, without validating contents
731544a [Wes McKinney] Move PrimitiveType::ToString template back to type.h
b5b5b82 [Wes McKinney] Failing test stubs, raise on null array
edb451c [Wes McKinney] Add a few data type smoke tests
47fd78e [Wes McKinney] Add unit test stub
07c1379 [Wes McKinney] Move some bits from arrow/type.h to type.cc
3a774fb [Wes McKinney] Add Status::ToString impls. Unit test stub
4e206fc [Wes McKinney] Add pandas converter placeholder
102ed36 [Wes McKinney] Cython array box scaffold builds
94f122f [Wes McKinney] Basic object model for sequence->arrow conversions
bdb02e7 [Wes McKinney] Use shared_ptr with dynamic make_builder too
d5655ba [Wes McKinney] Clean up array builder API to return shared_ptr
4132bda [Wes McKinney] Essential scaffolding -- error handling, memory pools, etc. -- to work toward converting Python lists to Arrow arrays
55e69a2 [Wes McKinney] Typed array stubs
ac8c796 [Wes McKinney] Cache primitive data type instances
8f7edaf [Wes McKinney] Consolidate Field and data type subclasses.
Add more Python stubs ea2f3ec [Wes McKinney] Bootstrap end-to-end exposure in Python, wrap DataType and Field types --- cpp/CMakeLists.txt | 83 ++-- cpp/src/arrow/CMakeLists.txt | 1 - cpp/src/arrow/api.h | 21 + cpp/src/arrow/builder.h | 10 +- cpp/src/arrow/field.h | 63 --- cpp/src/arrow/table/CMakeLists.txt | 15 - cpp/src/arrow/table/column-test.cc | 1 - cpp/src/arrow/table/column.cc | 2 +- cpp/src/arrow/table/column.h | 2 +- cpp/src/arrow/table/schema-test.cc | 9 +- cpp/src/arrow/table/schema.cc | 2 +- cpp/src/arrow/table/schema.h | 1 - cpp/src/arrow/table/table-test.cc | 1 - cpp/src/arrow/table/table.cc | 2 +- cpp/src/arrow/table/test-common.h | 1 - cpp/src/arrow/type.cc | 49 +++ cpp/src/arrow/type.h | 143 ++++-- cpp/src/arrow/types/CMakeLists.txt | 22 +- cpp/src/arrow/types/boolean.h | 3 +- cpp/src/arrow/types/construct.cc | 21 +- cpp/src/arrow/types/construct.h | 6 +- cpp/src/arrow/types/json.cc | 5 +- cpp/src/arrow/types/list-test.cc | 24 +- cpp/src/arrow/types/list.cc | 12 - cpp/src/arrow/types/list.h | 51 +-- cpp/src/arrow/types/primitive-test.cc | 64 ++- cpp/src/arrow/types/primitive.h | 22 +- cpp/src/arrow/types/string-test.cc | 11 +- cpp/src/arrow/types/string.h | 41 +- cpp/src/arrow/types/struct-test.cc | 19 +- cpp/src/arrow/types/struct.cc | 18 - cpp/src/arrow/types/struct.h | 21 +- cpp/src/arrow/util/CMakeLists.txt | 20 +- cpp/src/arrow/util/buffer.cc | 8 + cpp/src/arrow/util/buffer.h | 2 + cpp/src/arrow/util/status.cc | 40 ++ python/CMakeLists.txt | 21 +- python/arrow/__init__.py | 34 ++ python/arrow/array.pxd | 85 ++++ python/arrow/array.pyx | 179 ++++++++ python/arrow/config.pyx | 2 +- python/arrow/error.pxd | 20 + python/arrow/error.pyx | 30 ++ python/arrow/includes/arrow.pxd | 75 +++- python/arrow/includes/common.pxd | 4 +- python/arrow/includes/pyarrow.pxd | 24 +- python/arrow/scalar.pxd | 47 ++ python/arrow/scalar.pyx | 28 ++ python/arrow/schema.pxd | 39 ++ python/arrow/schema.pyx | 150 +++++++ python/arrow/tests/test_array.py | 26 ++ python/arrow/tests/test_convert_builtin.py | 85 ++++ python/arrow/tests/test_schema.py | 51 +++ python/setup.py | 7 +- python/src/pyarrow/adapters/builtin.cc | 415 ++++++++++++++++++ python/src/pyarrow/adapters/builtin.h | 40 ++ .../src/pyarrow/adapters/pandas.h | 17 +- python/src/pyarrow/api.h | 7 + python/src/pyarrow/common.cc | 71 +++ python/src/pyarrow/common.h | 95 ++++ python/src/pyarrow/helpers.cc | 57 +++ .../null.h => python/src/pyarrow/helpers.h | 22 +- python/src/pyarrow/init.cc | 8 +- python/src/pyarrow/init.h | 8 +- python/src/pyarrow/status.cc | 92 ++++ python/src/pyarrow/status.h | 144 ++++++ 66 files changed, 2246 insertions(+), 453 deletions(-) delete mode 100644 cpp/src/arrow/field.h create mode 100644 python/arrow/array.pxd create mode 100644 python/arrow/array.pyx create mode 100644 python/arrow/error.pxd create mode 100644 python/arrow/error.pyx create mode 100644 python/arrow/scalar.pxd create mode 100644 python/arrow/scalar.pyx create mode 100644 python/arrow/schema.pxd create mode 100644 python/arrow/schema.pyx create mode 100644 python/arrow/tests/test_array.py create mode 100644 python/arrow/tests/test_convert_builtin.py create mode 100644 python/arrow/tests/test_schema.py create mode 100644 python/src/pyarrow/adapters/builtin.cc create mode 100644 python/src/pyarrow/adapters/builtin.h rename cpp/src/arrow/field.cc => python/src/pyarrow/adapters/pandas.h (76%) create mode 100644 python/src/pyarrow/common.cc create mode 100644 python/src/pyarrow/common.h create mode 100644 python/src/pyarrow/helpers.cc rename 
cpp/src/arrow/types/null.h => python/src/pyarrow/helpers.h (72%) create mode 100644 python/src/pyarrow/status.cc create mode 100644 python/src/pyarrow/status.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8042661533e..e8cb88c0b4d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,18 +37,17 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -# Enable using a custom GCC toolchain to build Arrow -if (NOT "$ENV{ARROW_GCC_ROOT}" STREQUAL "") - set(GCC_ROOT $ENV{ARROW_GCC_ROOT}) - set(CMAKE_C_COMPILER ${GCC_ROOT}/bin/gcc) - set(CMAKE_CXX_COMPILER ${GCC_ROOT}/bin/g++) -endif() - if(APPLE) # In newer versions of CMake, this is the default setting set(CMAKE_MACOSX_RPATH 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + # ---------------------------------------------------------------------- # cmake options @@ -126,38 +125,16 @@ endif () # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") -# Required to avoid static linking errors with dependencies -add_definitions(-fPIC) - # Determine compiler version include(CompilerInfo) if ("${COMPILER_FAMILY}" STREQUAL "clang") - # Clang helpfully provides a few extensions from C++11 such as the 'override' - # keyword on methods. This doesn't change behavior, and we selectively enable - # it in src/gutil/port.h only on clang. So, we can safely use it, and don't want - # to trigger warnings when we do so. - # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++11-extensions") - # Using Clang with ccache causes a bunch of spurious warnings that are # purportedly fixed in the next version of ccache. See the following for details: # # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") - - # Only hardcode -fcolor-diagnostics if stderr is opened on a terminal. Otherwise - # the color codes show up as noisy artifacts. - # - # This test is imperfect because 'cmake' and 'make' can be run independently - # (with different terminal options), and we're testing during the former. - execute_process(COMMAND test -t 2 RESULT_VARIABLE ARROW_IS_TTY) - if ((${ARROW_IS_TTY} EQUAL 0) AND (NOT ("$ENV{TERM}" STREQUAL "dumb"))) - message("Running in a controlling terminal") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fcolor-diagnostics") - else() - message("Running without a controlling terminal or in a dumb terminal") - endif() endif() # Sanity check linking option. @@ -278,12 +255,6 @@ set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) -############################################################ -# Visibility -############################################################ -# For generate_export_header() and add_compiler_export_flags(). 
-include(GenerateExportHeader) - ############################################################ # Testing ############################################################ @@ -456,21 +427,32 @@ endif() # Subdirectories ############################################################ -add_subdirectory(src/arrow) -add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/table) -add_subdirectory(src/arrow/types) - -set(LINK_LIBS - arrow_util - arrow_table - arrow_types) +set(LIBARROW_LINK_LIBS +) set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc - src/arrow/field.cc src/arrow/type.cc + + src/arrow/table/column.cc + src/arrow/table/schema.cc + src/arrow/table/table.cc + + src/arrow/types/construct.cc + src/arrow/types/floating.cc + src/arrow/types/integer.cc + src/arrow/types/json.cc + src/arrow/types/list.cc + src/arrow/types/primitive.cc + src/arrow/types/string.cc + src/arrow/types/struct.cc + src/arrow/types/union.cc + + src/arrow/util/bit-util.cc + src/arrow/util/buffer.cc + src/arrow/util/memory-pool.cc + src/arrow/util/status.cc ) set(LIBARROW_LINKAGE "SHARED") @@ -479,8 +461,15 @@ add_library(arrow ${LIBARROW_LINKAGE} ${ARROW_SRCS} ) -target_link_libraries(arrow ${LINK_LIBS}) -set_target_properties(arrow PROPERTIES LINKER_LANGUAGE CXX) +set_target_properties(arrow + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") +target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) + +add_subdirectory(src/arrow) +add_subdirectory(src/arrow/util) +add_subdirectory(src/arrow/table) +add_subdirectory(src/arrow/types) install(TARGETS arrow LIBRARY DESTINATION lib diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 102a8a1853f..77326ce38d7 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -20,7 +20,6 @@ install(FILES api.h array.h builder.h - field.h type.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 899e8aae19c..c73d4b386cf 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -15,7 +15,28 @@ // specific language governing permissions and limitations // under the License. 
+// Coarse public API while the library is in development
+
 #ifndef ARROW_API_H
 #define ARROW_API_H
 
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/type.h"
+
+#include "arrow/table/column.h"
+#include "arrow/table/schema.h"
+#include "arrow/table/table.h"
+
+#include "arrow/types/boolean.h"
+#include "arrow/types/construct.h"
+#include "arrow/types/floating.h"
+#include "arrow/types/integer.h"
+#include "arrow/types/list.h"
+#include "arrow/types/string.h"
+#include "arrow/types/struct.h"
+
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
 #endif  // ARROW_API_H
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 491b9133d2c..8cc689c3e81 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -32,7 +32,7 @@ class Array;
 class MemoryPool;
 class PoolBuffer;
 
-static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 8;
+static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5;
 
 // Base class for all data array builders
 class ArrayBuilder {
@@ -78,12 +78,16 @@ class ArrayBuilder {
 
   // Creates new array object to hold the contents of the builder and transfers
   // ownership of the data
-  virtual Status ToArray(Array** out) = 0;
+  virtual std::shared_ptr<Array> Finish() = 0;
+
+  const std::shared_ptr<DataType>& type() const {
+    return type_;
+  }
 
  protected:
   MemoryPool* pool_;
 
-  TypePtr type_;
+  std::shared_ptr<DataType> type_;
 
   // When nulls are first appended to the builder, the null bitmap is allocated
   std::shared_ptr<PoolBuffer> nulls_;
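
Editorial aside: a minimal sketch, not part of the patch, of the ownership model the new Finish() contract implies. MakeBuilder, Int32Builder, and the INT32 singleton are taken from the diffs in this commit; default_memory_pool() is assumed to be the pool factory exposed by arrow/util/memory-pool.h.

    #include <memory>

    #include "arrow/api.h"

    arrow::Status BuildInt32Example(std::shared_ptr<arrow::Array>* out) {
      // Assumption: default_memory_pool() returns the process-wide pool.
      arrow::MemoryPool* pool = arrow::default_memory_pool();

      std::shared_ptr<arrow::ArrayBuilder> tmp;
      RETURN_NOT_OK(arrow::MakeBuilder(pool, arrow::INT32, &tmp));
      auto builder = std::dynamic_pointer_cast<arrow::Int32Builder>(tmp);

      RETURN_NOT_OK(builder->Append(42));
      RETURN_NOT_OK(builder->AppendNull());

      // Finish() hands back a shared_ptr and resets the builder, replacing
      // the old ToArray(Array**) out-parameter and its manual delete.
      *out = builder->Finish();
      return arrow::Status::OK();
    }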
diff --git a/cpp/src/arrow/field.h b/cpp/src/arrow/field.h
deleted file mode 100644
index 89a450c66f2..00000000000
--- a/cpp/src/arrow/field.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_FIELD_H
-#define ARROW_FIELD_H
-
-#include <string>
-
-#include "arrow/type.h"
-
-namespace arrow {
-
-// A field is a piece of metadata that includes (for now) a name and a data
-// type
-
-struct Field {
-  // Field name
-  std::string name;
-
-  // The field's data type
-  TypePtr type;
-
-  Field(const std::string& name, const TypePtr& type) :
-      name(name),
-      type(type) {}
-
-  bool operator==(const Field& other) const {
-    return this->Equals(other);
-  }
-
-  bool operator!=(const Field& other) const {
-    return !this->Equals(other);
-  }
-
-  bool Equals(const Field& other) const {
-    return (this == &other) || (this->name == other.name &&
-        this->type->Equals(other.type.get()));
-  }
-
-  bool nullable() const {
-    return this->type->nullable;
-  }
-
-  std::string ToString() const;
-};
-
-}  // namespace arrow
-
-#endif  // ARROW_FIELD_H
diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt
index 68bf3148a98..26d843d853b 100644
--- a/cpp/src/arrow/table/CMakeLists.txt
+++ b/cpp/src/arrow/table/CMakeLists.txt
@@ -19,21 +19,6 @@
 # arrow_table
 #######################################
 
-set(TABLE_SRCS
-  column.cc
-  schema.cc
-  table.cc
-)
-
-set(TABLE_LIBS
-)
-
-add_library(arrow_table STATIC
-  ${TABLE_SRCS}
-)
-target_link_libraries(arrow_table ${TABLE_LIBS})
-SET_TARGET_PROPERTIES(arrow_table PROPERTIES LINKER_LANGUAGE CXX)
-
 # Headers: top level
 install(FILES
   column.h
diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc
index 4959b82c6e2..bf95932916c 100644
--- a/cpp/src/arrow/table/column-test.cc
+++ b/cpp/src/arrow/table/column-test.cc
@@ -21,7 +21,6 @@
 #include <memory>
 #include <string>
 
-#include "arrow/field.h"
 #include "arrow/table/column.h"
 #include "arrow/table/schema.h"
 #include "arrow/table/test-common.h"
diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/table/column.cc
index d68b491fb99..573e6508759 100644
--- a/cpp/src/arrow/table/column.cc
+++ b/cpp/src/arrow/table/column.cc
@@ -20,7 +20,7 @@
 #include <memory>
 #include <sstream>
 
-#include "arrow/field.h"
+#include "arrow/type.h"
 #include "arrow/util/status.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/table/column.h
index 64423bf9561..dfc7516e26a 100644
--- a/cpp/src/arrow/table/column.h
+++ b/cpp/src/arrow/table/column.h
@@ -23,7 +23,7 @@
 #include <memory>
 #include <string>
 
 #include "arrow/array.h"
-#include "arrow/field.h"
+#include "arrow/type.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc
index 0cf1b3c5f9a..d6725cc08c0 100644
--- a/cpp/src/arrow/table/schema-test.cc
+++ b/cpp/src/arrow/table/schema-test.cc
@@ -20,7 +20,6 @@
 #include <memory>
 #include <vector>
 
-#include "arrow/field.h"
 #include "arrow/table/schema.h"
 #include "arrow/type.h"
 #include "arrow/types/string.h"
@@ -97,10 +96,10 @@ TEST_F(TestSchema, ToString) {
   auto schema = std::make_shared<Schema>(fields);
 
   std::string result = schema->ToString();
-  std::string expected = R"(f0 ?int32
-f1 uint8
-f2 ?string
-f3 ?list<uint8>
+  std::string expected = R"(f0 int32
+f1 uint8 not null
+f2 string
+f3 list<uint8>
 )";
 
   ASSERT_EQ(expected, result);
diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/table/schema.cc
index fb3b4d6f292..d49d0a713e7 100644
--- a/cpp/src/arrow/table/schema.cc
+++ b/cpp/src/arrow/table/schema.cc
@@ -22,7 +22,7 @@
 #include <sstream>
 #include <string>
 
-#include "arrow/field.h"
+#include "arrow/type.h"
 
 namespace arrow {
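
Editorial aside: a small sketch of the ToString() convention the schema test above now expects. Nullability is rendered as a trailing " not null" instead of a leading "?"; Field and the type classes are the ones consolidated into arrow/type.h by this patch.

    #include <memory>
    #include <string>

    #include "arrow/type.h"

    std::string DescribeField() {
      // Int32Type(false) marks the type non-nullable via the shared
      // PrimitiveType constructor.
      auto not_null_int = std::make_shared<arrow::Int32Type>(false);
      arrow::Field f1("f1", not_null_int);
      return f1.ToString();  // "f1 int32 not null", as in the expected string
    }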
diff --git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/table/schema.h
index d04e3f628c1..103f01b26e3 100644
--- a/cpp/src/arrow/table/schema.h
+++ b/cpp/src/arrow/table/schema.h
@@ -22,7 +22,6 @@
 #include <memory>
 #include <vector>
 
-#include "arrow/field.h"
 #include "arrow/type.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc
index dd4f74cd16f..c4fdb062db8 100644
--- a/cpp/src/arrow/table/table-test.cc
+++ b/cpp/src/arrow/table/table-test.cc
@@ -21,7 +21,6 @@
 #include <memory>
 #include <string>
 
-#include "arrow/field.h"
 #include "arrow/table/column.h"
 #include "arrow/table/schema.h"
 #include "arrow/table/table.h"
diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table/table.cc
index 4cefc924ed3..0c788b8fe3f 100644
--- a/cpp/src/arrow/table/table.cc
+++ b/cpp/src/arrow/table/table.cc
@@ -20,9 +20,9 @@
 #include <memory>
 #include <sstream>
 
-#include "arrow/field.h"
 #include "arrow/table/column.h"
 #include "arrow/table/schema.h"
+#include "arrow/type.h"
 #include "arrow/util/status.h"
 
 namespace arrow {
diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h
index efe2f228cd0..50a5f6a2f50 100644
--- a/cpp/src/arrow/table/test-common.h
+++ b/cpp/src/arrow/table/test-common.h
@@ -21,7 +21,6 @@
 #include <memory>
 #include <vector>
 
-#include "arrow/field.h"
 #include "arrow/table/column.h"
 #include "arrow/table/schema.h"
 #include "arrow/table/table.h"
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index ff145e2c1e3..265770822ce 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -17,8 +17,56 @@
 
 #include "arrow/type.h"
 
+#include <sstream>
+#include <string>
+
 namespace arrow {
 
+std::string Field::ToString() const {
+  std::stringstream ss;
+  ss << this->name << " " << this->type->ToString();
+  return ss.str();
+}
+
+DataType::~DataType() {}
+
+StringType::StringType(bool nullable)
+    : DataType(LogicalType::STRING, nullable) {}
+
+StringType::StringType(const StringType& other)
+    : StringType(other.nullable) {}
+
+std::string StringType::ToString() const {
+  std::string result(name());
+  if (!nullable) {
+    result.append(" not null");
+  }
+  return result;
+}
+
+std::string ListType::ToString() const {
+  std::stringstream s;
+  s << "list<" << value_type->ToString() << ">";
+  if (!this->nullable) {
+    s << " not null";
+  }
+  return s.str();
+}
+
+std::string StructType::ToString() const {
+  std::stringstream s;
+  s << "struct<";
+  for (size_t i = 0; i < fields_.size(); ++i) {
+    if (i > 0) s << ", ";
+    const std::shared_ptr<Field>& field = fields_[i];
+    s << field->name << ": " << field->type->ToString();
+  }
+  s << ">";
+  if (!nullable) s << " not null";
+  return s.str();
+}
+
+const std::shared_ptr<NullType> NA = std::make_shared<NullType>();
 const std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>();
 const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>();
 const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>();
@@ -30,5 +78,6 @@ const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>();
 const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>();
 const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>();
 const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>();
+const std::shared_ptr<StringType> STRING = std::make_shared<StringType>();
 
 }  // namespace arrow
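
Editorial aside: a short sketch of what the cached type instances added in type.cc above buy callers. NA, BOOL, UINT8, ..., STRING are shared singletons, so code can reuse one DataType object per primitive type instead of allocating new ones.

    #include "arrow/type.h"

    bool CheckSingletons() {
      // Equals() compares by pointer first, then by (type, nullable).
      return arrow::INT32->Equals(arrow::INT32.get()) &&
             arrow::STRING->ToString() == "string";
    }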
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 4193a0e8bc8..e78e4949119 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -20,6 +20,7 @@
 
 #include <memory>
 #include <string>
+#include <vector>
 
 namespace arrow {
 
@@ -71,49 +72,46 @@ struct LogicalType {
     UINT64 = 7,
     INT64 = 8,
 
-    // A boolean value represented as 1 byte
-    BOOL = 9,
-
     // A boolean value represented as 1 bit
-    BIT = 10,
+    BOOL = 9,
 
     // 4-byte floating point value
-    FLOAT = 11,
+    FLOAT = 10,
 
     // 8-byte floating point value
-    DOUBLE = 12,
+    DOUBLE = 11,
 
     // CHAR(N): fixed-length UTF8 string with length N
-    CHAR = 13,
+    CHAR = 12,
 
     // UTF8 variable-length string as List<Char>
-    STRING = 14,
+    STRING = 13,
 
     // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1)
-    VARCHAR = 15,
+    VARCHAR = 14,
 
     // Variable-length bytes (no guarantee of UTF8-ness)
-    BINARY = 16,
+    BINARY = 15,
 
     // By default, int32 days since the UNIX epoch
-    DATE = 17,
+    DATE = 16,
 
     // Exact timestamp encoded with int64 since UNIX epoch
     // Default unit millisecond
-    TIMESTAMP = 18,
+    TIMESTAMP = 17,
 
     // Timestamp as double seconds since the UNIX epoch
-    TIMESTAMP_DOUBLE = 19,
+    TIMESTAMP_DOUBLE = 18,
 
     // Exact time encoded with int64, default unit millisecond
-    TIME = 20,
+    TIME = 19,
 
     // Precision- and scale-based decimal type. Storage type depends on the
     // parameters.
-    DECIMAL = 21,
+    DECIMAL = 20,
 
     // Decimal value encoded as a text string
-    DECIMAL_TEXT = 22,
+    DECIMAL_TEXT = 21,
 
     // A list of some logical data type
     LIST = 30,
@@ -141,7 +139,9 @@ struct DataType {
       type(type),
       nullable(nullable) {}
 
-  virtual bool Equals(const DataType* other) {
+  virtual ~DataType();
+
+  bool Equals(const DataType* other) {
     // Call with a pointer so more friendly to subclasses
     return this == other ||
       (this->type == other->type &&
       this->nullable == other->nullable);
@@ -154,10 +154,45 @@ struct DataType {
   virtual std::string ToString() const = 0;
 };
 
-
 typedef std::shared_ptr<LayoutType> LayoutPtr;
 typedef std::shared_ptr<DataType> TypePtr;
 
+// A field is a piece of metadata that includes (for now) a name and a data
+// type
+struct Field {
+  // Field name
+  std::string name;
+
+  // The field's data type
+  TypePtr type;
+
+  Field(const std::string& name, const TypePtr& type) :
+      name(name),
+      type(type) {}
+
+  bool operator==(const Field& other) const {
+    return this->Equals(other);
+  }
+
+  bool operator!=(const Field& other) const {
+    return !this->Equals(other);
+  }
+
+  bool Equals(const Field& other) const {
+    return (this == &other) || (this->name == other.name &&
+        this->type->Equals(other.type.get()));
+  }
+
+  bool Equals(const std::shared_ptr<Field>& other) const {
+    return Equals(*other.get());
+  }
+
+  bool nullable() const {
+    return this->type->nullable;
+  }
+
+  std::string ToString() const;
+};
 
 struct BytesType : public LayoutType {
   int size;
@@ -183,16 +218,18 @@ struct PrimitiveType : public DataType {
   explicit PrimitiveType(bool nullable = true)
      : DataType(Derived::type_enum, nullable) {}
 
-  virtual std::string ToString() const {
-    std::string result;
-    if (nullable) {
-      result.append("?");
-    }
-    result.append(static_cast<const Derived*>(this)->name());
-    return result;
-  }
+  std::string ToString() const override;
 };
 
+template <typename Derived>
+inline std::string PrimitiveType<Derived>::ToString() const {
+  std::string result(static_cast<const Derived*>(this)->name());
+  if (!nullable) {
+    result.append(" not null");
+  }
+  return result;
+}
+
 #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME)     \
   typedef C_TYPE c_type;                                       \
   static constexpr LogicalType::type type_enum = LogicalType::ENUM; \
@@ -205,6 +242,10 @@ struct PrimitiveType : public DataType {
     return NAME;                            \
   }
 
+struct NullType : public PrimitiveType<NullType> {
+  PRIMITIVE_DECL(NullType, void, NA, 0, "null");
+};
+
 struct BooleanType : public PrimitiveType<BooleanType> {
   PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool");
 };
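// (Editorial sketch, not part of the diff.) PRIMITIVE_DECL plus the CRTP base
// PrimitiveType<Derived> above is what keeps each primitive declaration down
// to a few lines. A purely hypothetical new type would slot in the same way;
// HALF_FLOAT below does not exist in LogicalType and would have to be added
// there first:
//
//   struct HalfFloatType : public PrimitiveType<HalfFloatType> {
//     PRIMITIVE_DECL(HalfFloatType, uint16_t, HALF_FLOAT, 2, "half_float");
//   };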
return "list"; + } + + std::string ToString() const override; +}; + +// String is a logical type consisting of a physical list of 1-byte values +struct StringType : public DataType { + explicit StringType(bool nullable = true); + + StringType(const StringType& other); + + static char const *name() { + return "string"; + } + + std::string ToString() const override; +}; + +struct StructType : public DataType { + std::vector > fields_; + + explicit StructType(const std::vector >& fields, + bool nullable = true) + : DataType(LogicalType::STRUCT, nullable) { + fields_ = fields; + } + + const std::shared_ptr& field(int i) const { + return fields_[i]; + } + + int num_children() const { + return fields_.size(); + } + + std::string ToString() const override; +}; + +extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; extern const std::shared_ptr UINT8; extern const std::shared_ptr UINT16; @@ -260,6 +350,7 @@ extern const std::shared_ptr INT32; extern const std::shared_ptr INT64; extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; +extern const std::shared_ptr STRING; } // namespace arrow diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index e090aead1f8..57cabdefd25 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -19,31 +19,11 @@ # arrow_types ####################################### -set(TYPES_SRCS - construct.cc - floating.cc - integer.cc - json.cc - list.cc - primitive.cc - string.cc - struct.cc - union.cc -) - -set(TYPES_LIBS -) - -add_library(arrow_types STATIC - ${TYPES_SRCS} -) -target_link_libraries(arrow_types ${TYPES_LIBS}) -SET_TARGET_PROPERTIES(arrow_types PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES boolean.h collection.h + construct.h datetime.h decimal.h floating.h diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index 8fc9cfd19c0..a5023d7b368 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -24,7 +24,8 @@ namespace arrow { typedef PrimitiveArrayImpl BooleanArray; -// typedef PrimitiveBuilder BooleanBuilder; +class BooleanBuilder : public ArrayBuilder { +}; } // namespace arrow diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 05d6b270fc3..43f01a30513 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -32,13 +32,13 @@ class ArrayBuilder; // Initially looked at doing this with vtables, but shared pointers makes it // difficult -#define BUILDER_CASE(ENUM, BuilderType) \ - case LogicalType::ENUM: \ - *out = static_cast(new BuilderType(pool, type)); \ +#define BUILDER_CASE(ENUM, BuilderType) \ + case LogicalType::ENUM: \ + out->reset(new BuilderType(pool, type)); \ return Status::OK(); -Status make_builder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder** out) { +Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, + std::shared_ptr* out) { switch (type->type) { BUILDER_CASE(UINT8, UInt8Builder); BUILDER_CASE(INT8, Int8Builder); @@ -58,13 +58,12 @@ Status make_builder(MemoryPool* pool, const TypePtr& type, case LogicalType::LIST: { - ListType* list_type = static_cast(type.get()); - ArrayBuilder* value_builder; - RETURN_NOT_OK(make_builder(pool, list_type->value_type, &value_builder)); + std::shared_ptr value_builder; - // The ListBuilder takes ownership of the value_builder - ListBuilder* builder = new ListBuilder(pool, type, value_builder); - *out = static_cast(builder); + const std::shared_ptr& 
diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h
index b5ba436f787..59ebe1acddc 100644
--- a/cpp/src/arrow/types/construct.h
+++ b/cpp/src/arrow/types/construct.h
@@ -18,6 +18,8 @@
 #ifndef ARROW_TYPES_CONSTRUCT_H
 #define ARROW_TYPES_CONSTRUCT_H
 
+#include <memory>
+
 #include "arrow/type.h"
 
 namespace arrow {
@@ -26,8 +28,8 @@ class ArrayBuilder;
 class MemoryPool;
 class Status;
 
-Status make_builder(MemoryPool* pool, const TypePtr& type,
-    ArrayBuilder** out);
+Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+    std::shared_ptr<ArrayBuilder>* out);
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc
index b29b95715fe..168e370d51a 100644
--- a/cpp/src/arrow/types/json.cc
+++ b/cpp/src/arrow/types/json.cc
@@ -19,10 +19,7 @@
 
 #include <vector>
 
-#include "arrow/types/boolean.h"
-#include "arrow/types/integer.h"
-#include "arrow/types/floating.h"
-#include "arrow/types/null.h"
+#include "arrow/type.h"
 #include "arrow/types/string.h"
 #include "arrow/types/union.h"
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
index b4bbd2841a8..02991de2648 100644
--- a/cpp/src/arrow/types/list-test.cc
+++ b/cpp/src/arrow/types/list-test.cc
@@ -32,6 +32,7 @@
 #include "arrow/types/test-common.h"
 #include "arrow/util/status.h"
 
+using std::shared_ptr;
 using std::string;
 using std::unique_ptr;
 using std::vector;
@@ -47,17 +48,18 @@ TEST(TypesTest, TestListType) {
   ASSERT_EQ(list_type.type, LogicalType::LIST);
 
   ASSERT_EQ(list_type.name(), string("list"));
-  ASSERT_EQ(list_type.ToString(), string("?list<int32>"));
+  ASSERT_EQ(list_type.ToString(), string("list<int32>"));
 
   ASSERT_EQ(list_type.value_type->type, vt->type);
   ASSERT_EQ(list_type.value_type->type, vt->type);
 
   std::shared_ptr<DataType> st = std::make_shared<StringType>(false);
   std::shared_ptr<ListType> lt = std::make_shared<ListType>(st, false);
-  ASSERT_EQ(lt->ToString(), string("list<string>"));
+  ASSERT_EQ(lt->ToString(), string("list<string not null> not null"));
 
   ListType lt2(lt, false);
-  ASSERT_EQ(lt2.ToString(), string("list<list<string>>"));
+  ASSERT_EQ(lt2.ToString(),
+      string("list<list<string not null> not null> not null"));
 }
 
 // ----------------------------------------------------------------------
@@ -71,23 +73,21 @@ class TestListBuilder : public TestBuilder {
     value_type_ = TypePtr(new Int32Type());
     type_ = TypePtr(new ListType(value_type_));
 
-    ArrayBuilder* tmp;
-    ASSERT_OK(make_builder(pool_, type_, &tmp));
-    builder_.reset(static_cast<ListBuilder*>(tmp));
+    std::shared_ptr<ArrayBuilder> tmp;
+    ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
+    builder_ = std::dynamic_pointer_cast<ListBuilder>(tmp);
   }
 
   void Done() {
-    Array* out;
-    ASSERT_OK(builder_->ToArray(&out));
-    result_.reset(static_cast<ListArray*>(out));
+    result_ = std::dynamic_pointer_cast<ListArray>(builder_->Finish());
   }
 
 protected:
   TypePtr value_type_;
   TypePtr type_;
 
-  unique_ptr<ListBuilder> builder_;
-  unique_ptr<ListArray> result_;
+  shared_ptr<ListBuilder> builder_;
+  shared_ptr<ListArray> result_;
 };
 
 
@@ -116,7 +116,7 @@ TEST_F(TestListBuilder, TestBasics) {
   vector<int> lengths = {3, 0, 4};
   vector<uint8_t> is_null = {0, 1, 0};
 
-  Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder());
+  Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());
 
   int pos = 0;
   for (size_t i = 0; i < lengths.size(); ++i) {
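
Editorial aside: a sketch of the builder choreography the list test above exercises, with one Append() per list slot and child values pushed through value_builder(). Signatures follow the ListBuilder and Int32Builder definitions in this patch.

    #include <memory>

    #include "arrow/api.h"

    std::shared_ptr<arrow::Array> BuildListExample(arrow::MemoryPool* pool) {
      auto type = std::make_shared<arrow::ListType>(arrow::INT32);
      std::shared_ptr<arrow::ArrayBuilder> tmp;
      if (!arrow::MakeBuilder(pool, type, &tmp).ok()) { return nullptr; }
      auto builder = std::dynamic_pointer_cast<arrow::ListBuilder>(tmp);

      auto values = static_cast<arrow::Int32Builder*>(
          builder->value_builder().get());
      builder->Append();      // start the slot [7, 8]
      values->Append(7);
      values->Append(8);
      builder->AppendNull();  // a null list slot
      return builder->Finish();  // yields [[7, 8], null]
    }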
"arrow/types/list.h" -#include -#include - namespace arrow { -std::string ListType::ToString() const { - std::stringstream s; - if (this->nullable) { - s << "?"; - } - s << "list<" << value_type->ToString() << ">"; - return s.str(); -} - } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f39fe5c4d81..f40a8245362 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -36,21 +36,6 @@ namespace arrow { class MemoryPool; -struct ListType : public DataType { - // List can contain any other logical value type - TypePtr value_type; - - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} - - static char const *name() { - return "list"; - } - - virtual std::string ToString() const; -}; - class ListArray : public Array { public: ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} @@ -106,10 +91,9 @@ class ListArray : public Array { class ListBuilder : public Int32Builder { public: ListBuilder(MemoryPool* pool, const TypePtr& type, - ArrayBuilder* value_builder) - : Int32Builder(pool, type) { - value_builder_.reset(value_builder); - } + std::shared_ptr value_builder) + : Int32Builder(pool, type), + value_builder_(value_builder) {} Status Init(int32_t elements) { // One more than requested. @@ -147,30 +131,27 @@ class ListBuilder : public Int32Builder { return Status::OK(); } - // Initialize an array type instance with the results of this builder - // Transfers ownership of all buffers template - Status Transfer(Container* out) { - Array* child_values; - RETURN_NOT_OK(value_builder_->ToArray(&child_values)); + std::shared_ptr Transfer() { + auto result = std::make_shared(); + + std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero if (length_) { - raw_buffer()[length_] = child_values->length(); + raw_buffer()[length_] = items->length(); } - out->Init(type_, length_, values_, ArrayPtr(child_values), + result->Init(type_, length_, values_, items, null_count_, nulls_); values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; - return Status::OK(); + + return result; } - virtual Status ToArray(Array** out) { - ListArray* result = new ListArray(); - RETURN_NOT_OK(Transfer(result)); - *out = static_cast(result); - return Status::OK(); + std::shared_ptr Finish() override { + return Transfer(); } // Start a new variable-length list slot @@ -198,10 +179,12 @@ class ListBuilder : public Int32Builder { return Append(true); } - ArrayBuilder* value_builder() const { return value_builder_.get();} + const std::shared_ptr& value_builder() const { + return value_builder_; + } protected: - std::unique_ptr value_builder_; + std::shared_ptr value_builder_; }; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 02eaaa7542b..f35a258e2cb 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -37,6 +37,7 @@ #include "arrow/util/status.h" using std::string; +using std::shared_ptr; using std::unique_ptr; using std::vector; @@ -98,12 +99,12 @@ class TestPrimitiveBuilder : public TestBuilder { type_ = Attrs::type(); - ArrayBuilder* tmp; - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_.reset(static_cast(tmp)); + std::shared_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + builder_ = std::dynamic_pointer_cast(tmp); - ASSERT_OK(make_builder(pool_, type_, &tmp)); - builder_nn_.reset(static_cast(tmp)); + 
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index 02eaaa7542b..f35a258e2cb 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -37,6 +37,7 @@
 #include "arrow/util/status.h"
 
 using std::string;
+using std::shared_ptr;
 using std::unique_ptr;
 using std::vector;
 
@@ -98,12 +99,12 @@ class TestPrimitiveBuilder : public TestBuilder {
 
     type_ = Attrs::type();
 
-    ArrayBuilder* tmp;
-    ASSERT_OK(make_builder(pool_, type_, &tmp));
-    builder_.reset(static_cast<BuilderType*>(tmp));
+    std::shared_ptr<ArrayBuilder> tmp;
+    ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
+    builder_ = std::dynamic_pointer_cast<BuilderType>(tmp);
 
-    ASSERT_OK(make_builder(pool_, type_, &tmp));
-    builder_nn_.reset(static_cast<BuilderType*>(tmp));
+    ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
+    builder_nn_ = std::dynamic_pointer_cast<BuilderType>(tmp);
   }
 
   void RandomData(int N, double pct_null = 0.1) {
@@ -112,7 +113,6 @@ class TestPrimitiveBuilder : public TestBuilder {
   }
 
   void CheckNullable() {
-    ArrayType result;
     ArrayType expected;
     int size = builder_->length();
@@ -125,7 +125,9 @@ class TestPrimitiveBuilder : public TestBuilder {
     int32_t ex_null_count = null_count(nulls_);
 
     expected.Init(size, ex_data, ex_null_count, ex_nulls);
-    ASSERT_OK(builder_->Transfer(&result));
+
+    std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(
+        builder_->Finish());
 
     // Builder is now reset
     ASSERT_EQ(0, builder_->length());
@@ -133,12 +135,11 @@ class TestPrimitiveBuilder : public TestBuilder {
     ASSERT_EQ(0, builder_->null_count());
     ASSERT_EQ(nullptr, builder_->buffer());
 
-    ASSERT_TRUE(result.Equals(expected));
-    ASSERT_EQ(ex_null_count, result.null_count());
+    ASSERT_TRUE(result->Equals(expected));
+    ASSERT_EQ(ex_null_count, result->null_count());
   }
 
   void CheckNonNullable() {
-    ArrayType result;
     ArrayType expected;
     int size = builder_nn_->length();
@@ -146,22 +147,24 @@ class TestPrimitiveBuilder : public TestBuilder {
         size * sizeof(T));
 
     expected.Init(size, ex_data);
-    ASSERT_OK(builder_nn_->Transfer(&result));
+
+    std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(
+        builder_nn_->Finish());
 
     // Builder is now reset
     ASSERT_EQ(0, builder_nn_->length());
     ASSERT_EQ(0, builder_nn_->capacity());
     ASSERT_EQ(nullptr, builder_nn_->buffer());
 
-    ASSERT_TRUE(result.Equals(expected));
-    ASSERT_EQ(0, result.null_count());
+    ASSERT_TRUE(result->Equals(expected));
+    ASSERT_EQ(0, result->null_count());
   }
 
 protected:
   TypePtr type_;
   TypePtr type_nn_;
-  unique_ptr<BuilderType> builder_;
-  unique_ptr<BuilderType> builder_nn_;
+  shared_ptr<BuilderType> builder_;
+  shared_ptr<BuilderType> builder_nn_;
 
   vector<T> draws_;
   vector<uint8_t> nulls_;
@@ -225,15 +228,36 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
     ASSERT_OK(this->builder_->AppendNull());
   }
 
-  Array* result;
-  ASSERT_OK(this->builder_->ToArray(&result));
-  unique_ptr<Array> holder(result);
+  auto result = this->builder_->Finish();
 
   for (int i = 0; i < size; ++i) {
     ASSERT_TRUE(result->IsNull(i));
  }
 }
 
+TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
+  DECL_T();
+
+  int size = 10000;
+
+  vector<T>& draws = this->draws_;
+  vector<uint8_t>& nulls = this->nulls_;
+
+  int64_t memory_before = this->pool_->bytes_allocated();
+
+  this->RandomData(size);
+
+  int i;
+  for (i = 0; i < size; ++i) {
+    ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0));
+  }
+
+  do {
+    std::shared_ptr<Array> result = this->builder_->Finish();
+  } while (false);
+
+  ASSERT_EQ(memory_before, this->pool_->bytes_allocated());
+}
 
 TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
   DECL_T();
@@ -331,11 +355,11 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) {
 }
 
 TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
-  int n = 100;
-  ASSERT_OK(this->builder_->Reserve(n));
+  ASSERT_OK(this->builder_->Reserve(10));
 
   ASSERT_EQ(0, this->builder_->length());
   ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity());
 
+  ASSERT_OK(this->builder_->Reserve(90));
   ASSERT_OK(this->builder_->Advance(100));
   ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY));
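
Editorial aside: a sketch of the accounting contract the new TestArrayDtorDealloc test above relies on. All allocations flow through a MemoryPool, so once the last shared_ptr to a finished Array drops, bytes_allocated() must return to its starting value.

    #include <cstdint>

    #include "arrow/util/memory-pool.h"

    // True when every byte taken from the pool since `bytes_before` was
    // sampled has been returned to it.
    bool PoolIsBalanced(arrow::MemoryPool* pool, int64_t bytes_before) {
      return pool->bytes_allocated() == bytes_before;
    }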
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index 09d43e7ec8b..1073bb6e1c3 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -64,6 +64,8 @@ class PrimitiveArrayImpl : public PrimitiveArray {
 
   PrimitiveArrayImpl() : PrimitiveArray() {}
 
+  virtual ~PrimitiveArrayImpl() {}
+
   PrimitiveArrayImpl(int32_t length, const std::shared_ptr<Buffer>& data,
      int32_t null_count = 0,
      const std::shared_ptr<Buffer>& nulls = nullptr) {
@@ -197,24 +199,12 @@ class PrimitiveBuilder : public ArrayBuilder {
     return Status::OK();
   }
 
-  // Initialize an array type instance with the results of this builder
-  // Transfers ownership of all buffers
-  Status Transfer(PrimitiveArray* out) {
-    out->Init(type_, length_, values_, null_count_, nulls_);
+  std::shared_ptr<Array> Finish() override {
+    std::shared_ptr<ArrayType> result = std::make_shared<ArrayType>();
+    result->PrimitiveArray::Init(type_, length_, values_, null_count_, nulls_);
     values_ = nulls_ = nullptr;
     capacity_ = length_ = null_count_ = 0;
-    return Status::OK();
-  }
-
-  Status Transfer(ArrayType* out) {
-    return Transfer(static_cast<PrimitiveArray*>(out));
-  }
-
-  virtual Status ToArray(Array** out) {
-    ArrayType* result = new ArrayType();
-    RETURN_NOT_OK(Transfer(result));
-    *out = static_cast<Array*>(result);
-    return Status::OK();
+    return result;
   }
 
   value_type* raw_buffer() {
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
index 9af66729502..8e82fd95dd8 100644
--- a/cpp/src/arrow/types/string-test.cc
+++ b/cpp/src/arrow/types/string-test.cc
@@ -166,23 +166,18 @@ class TestStringBuilder : public TestBuilder {
   void SetUp() {
     TestBuilder::SetUp();
     type_ = TypePtr(new StringType());
-
-    ArrayBuilder* tmp;
-    ASSERT_OK(make_builder(pool_, type_, &tmp));
-    builder_.reset(static_cast<StringBuilder*>(tmp));
+    builder_.reset(new StringBuilder(pool_, type_));
   }
 
   void Done() {
-    Array* out;
-    ASSERT_OK(builder_->ToArray(&out));
-    result_.reset(static_cast<StringArray*>(out));
+    result_ = std::dynamic_pointer_cast<StringArray>(builder_->Finish());
   }
 
 protected:
   TypePtr type_;
 
   std::unique_ptr<StringBuilder> builder_;
-  std::unique_ptr<StringArray> result_;
+  std::shared_ptr<StringArray> result_;
 };
 
 TEST_F(TestStringBuilder, TestScalarAppend) {
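
Editorial aside: a sketch of the string append path changed below. A StringArray is physically a list<uint8>, so each Append() adds one list slot plus the raw bytes through the shared UInt8Builder; the constructor and overloads come from the string.h diff that follows.

    #include <memory>
    #include <string>

    #include "arrow/types/string.h"

    std::shared_ptr<arrow::Array> BuildStrings(arrow::MemoryPool* pool) {
      auto type = std::make_shared<arrow::StringType>();
      arrow::StringBuilder builder(pool, type);
      builder.Append(std::string("ab"));
      builder.Append("c", 1);  // the new (const char*, length) overload
      return builder.Finish();
    }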
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index 5795cfed577..8ccc0a9698a 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -71,28 +71,6 @@ struct VarcharType : public DataType {
 static const LayoutPtr byte1(new BytesType(1));
 static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1));
 
-// String is a logical type consisting of a physical list of 1-byte values
-struct StringType : public DataType {
-  explicit StringType(bool nullable = true)
-      : DataType(LogicalType::STRING, nullable) {}
-
-  StringType(const StringType& other)
-      : StringType() {}
-
-  static char const *name() {
-    return "string";
-  }
-
-  virtual std::string ToString() const {
-    std::string result;
-    if (nullable) {
-      result.append("?");
-    }
-    result.append(name());
-    return result;
-  }
-};
-
 // TODO: add a BinaryArray layer in between
 class StringArray : public ListArray {
 public:
@@ -153,26 +131,23 @@ class StringArray : public ListArray {
 class StringBuilder : public ListBuilder {
 public:
   explicit StringBuilder(MemoryPool* pool, const TypePtr& type) :
-      ListBuilder(pool, type,
-          static_cast<ArrayBuilder*>(new UInt8Builder(pool, value_type_))) {
+      ListBuilder(pool, type, std::make_shared<UInt8Builder>(pool, value_type_)) {
     byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
   }
 
   Status Append(const std::string& value) {
-    RETURN_NOT_OK(ListBuilder::Append());
-    return byte_builder_->Append(reinterpret_cast<const uint8_t*>(value.c_str()),
-        value.size());
+    return Append(value.c_str(), value.size());
   }
 
-  Status Append(const uint8_t* value, int32_t length);
+  Status Append(const char* value, int32_t length) {
+    RETURN_NOT_OK(ListBuilder::Append());
+    return byte_builder_->Append(reinterpret_cast<const uint8_t*>(value), length);
+  }
 
   Status Append(const std::vector<std::string>& values,
      uint8_t* null_bytes);
 
-  virtual Status ToArray(Array** out) {
-    StringArray* result = new StringArray();
-    RETURN_NOT_OK(ListBuilder::Transfer(result));
-    *out = static_cast<Array*>(result);
-    return Status::OK();
+  std::shared_ptr<Array> Finish() override {
+    return ListBuilder::Transfer<StringArray>();
   }
 
 protected:
diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc
index df615710479..9a4777e8b98 100644
--- a/cpp/src/arrow/types/struct-test.cc
+++ b/cpp/src/arrow/types/struct-test.cc
@@ -17,15 +17,16 @@
 
 #include <gtest/gtest.h>
 
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "arrow/field.h"
 #include "arrow/type.h"
 #include "arrow/types/integer.h"
 #include "arrow/types/string.h"
 #include "arrow/types/struct.h"
 
+using std::shared_ptr;
 using std::string;
 using std::vector;
 
@@ -33,23 +34,23 @@ namespace arrow {
 
 TEST(TestStructType, Basics) {
   TypePtr f0_type = TypePtr(new Int32Type());
-  Field f0("f0", f0_type);
+  auto f0 = std::make_shared<Field>("f0", f0_type);
 
   TypePtr f1_type = TypePtr(new StringType());
-  Field f1("f1", f1_type);
+  auto f1 = std::make_shared<Field>("f1", f1_type);
 
   TypePtr f2_type = TypePtr(new UInt8Type());
-  Field f2("f2", f2_type);
+  auto f2 = std::make_shared<Field>("f2", f2_type);
 
-  vector<Field> fields = {f0, f1, f2};
+  vector<shared_ptr<Field> > fields = {f0, f1, f2};
 
   StructType struct_type(fields);
 
-  ASSERT_TRUE(struct_type.field(0).Equals(f0));
-  ASSERT_TRUE(struct_type.field(1).Equals(f1));
-  ASSERT_TRUE(struct_type.field(2).Equals(f2));
+  ASSERT_TRUE(struct_type.field(0)->Equals(f0));
+  ASSERT_TRUE(struct_type.field(1)->Equals(f1));
+  ASSERT_TRUE(struct_type.field(2)->Equals(f2));
 
-  ASSERT_EQ(struct_type.ToString(), "?struct<f0: ?int32, f1: ?string, f2: ?uint8>");
+  ASSERT_EQ(struct_type.ToString(), "struct<f0: int32, f1: string, f2: uint8>");
 
   // TODO: out of bounds for field(...)
 }
diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc
index 6b233bc372a..02af600b017 100644
--- a/cpp/src/arrow/types/struct.cc
+++ b/cpp/src/arrow/types/struct.cc
@@ -17,24 +17,6 @@
 
 #include "arrow/types/struct.h"
 
-#include <memory>
-#include <sstream>
-#include <string>
-#include <vector>
-
 namespace arrow {
 
-std::string StructType::ToString() const {
-  std::stringstream s;
-  if (nullable) s << "?";
-  s << "struct<";
-  for (size_t i = 0; i < fields_.size(); ++i) {
-    if (i > 0) s << ", ";
-    const Field& field = fields_[i];
-    s << field.name << ": " << field.type->ToString();
-  }
-  s << ">";
-  return s.str();
-}
-
 }  // namespace arrow
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index e575c31287c..5842534d35b 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -18,33 +18,14 @@
 #ifndef ARROW_TYPES_STRUCT_H
 #define ARROW_TYPES_STRUCT_H
 
+#include <memory>
 #include <string>
 #include <vector>
 
-#include "arrow/field.h"
 #include "arrow/type.h"
 
 namespace arrow {
 
-struct StructType : public DataType {
-  std::vector<Field> fields_;
-
-  explicit StructType(const std::vector<Field>& fields, bool nullable = true)
-      : DataType(LogicalType::STRUCT, nullable) {
-    fields_ = fields;
-  }
-
-  const Field& field(int i) const {
-    return fields_[i];
-  }
-
-  int num_children() const {
-    return fields_.size();
-  }
-
-  virtual std::string ToString() const;
-};
-
 }  // namespace arrow
 
 #endif  // ARROW_TYPES_STRUCT_H
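
Editorial aside: a sketch of the shared_ptr<Field>-based StructType construction the updated test above exercises; ToString() renders the result as "struct<f0: int32, f1: string>".

    #include <memory>
    #include <vector>

    #include "arrow/type.h"

    std::shared_ptr<arrow::StructType> MakeStructExample() {
      auto f0 = std::make_shared<arrow::Field>("f0", arrow::INT32);
      auto f1 = std::make_shared<arrow::Field>("f1", arrow::STRING);
      std::vector<std::shared_ptr<arrow::Field> > fields = {f0, f1};
      return std::make_shared<arrow::StructType>(fields);
    }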
-SET_TARGET_PROPERTIES(arrow_util PROPERTIES LINKER_LANGUAGE CXX) - # Headers: top level install(FILES bit-util.h @@ -50,7 +34,7 @@ install(FILES add_library(arrow_test_util) target_link_libraries(arrow_test_util - arrow_util) +) SET_TARGET_PROPERTIES(arrow_test_util PROPERTIES LINKER_LANGUAGE CXX) @@ -64,7 +48,6 @@ add_library(arrow_test_main if (APPLE) target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util dl) set_target_properties(arrow_test_main @@ -72,7 +55,6 @@ if (APPLE) else() target_link_libraries(arrow_test_main gtest - arrow_util arrow_test_util pthread dl diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 3f3807d4e20..50f4716769d 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -31,6 +31,8 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, parent_ = parent; } +Buffer::~Buffer() {} + std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -43,6 +45,12 @@ PoolBuffer::PoolBuffer(MemoryPool* pool) : pool_ = pool; } +PoolBuffer::~PoolBuffer() { + if (mutable_data_ != nullptr) { + pool_->Free(mutable_data_, capacity_); + } +} + Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 8704723eb0a..0c3e210abd9 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -39,6 +39,7 @@ class Buffer : public std::enable_shared_from_this { Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} + virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be // able to retain a valid pointer to it even after other shared_ptr's to the @@ -136,6 +137,7 @@ class ResizableBuffer : public MutableBuffer { class PoolBuffer : public ResizableBuffer { public: explicit PoolBuffer(MemoryPool* pool = nullptr); + virtual ~PoolBuffer(); virtual Status Resize(int64_t new_size); virtual Status Reserve(int64_t new_capacity); diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index c64b8a3d5f8..c6e113ebea5 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -35,4 +35,44 @@ const char* Status::CopyState(const char* state) { return result; } +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::Invalid: + type = "Invalid"; + break; + case StatusCode::NotImplemented: + type = "NotImplemented"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(reinterpret_cast(state_ + 7), length); + return result; +} + } // namespace arrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index df55bfac9eb..8fdd829010e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,6 +45,12 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() +find_program(CCACHE_FOUND ccache) +if(CCACHE_FOUND) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) + set_property(GLOBAL 
PROPERTY RULE_LAUNCH_LINK ccache) +endif(CCACHE_FOUND) + ############################################################ # Compiler flags ############################################################ @@ -389,7 +395,12 @@ add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS + src/pyarrow/common.cc + src/pyarrow/helpers.cc src/pyarrow/init.cc + src/pyarrow/status.cc + + src/pyarrow/adapters/builtin.cc ) set(LINK_LIBS @@ -410,18 +421,16 @@ endif() # Setup and build Cython modules ############################################################ -foreach(pyx_api_file - arrow/config.pyx - arrow/parquet.pyx) - set_source_files_properties(${pyx_api_file} PROPERTIES CYTHON_API 1) -endforeach(pyx_api_file) - set(USE_RELATIVE_RPATH ON) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CYTHON_EXTENSIONS + array config + error parquet + scalar + schema ) foreach(module ${CYTHON_EXTENSIONS}) diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py index e69de29bb2d..3c049b85e8c 100644 --- a/python/arrow/__init__.py +++ b/python/arrow/__init__.py @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +from arrow.array import (Array, from_pylist, total_allocated_bytes, + BooleanArray, NumericArray, + Int8Array, UInt8Array, + ListArray, StringArray) + +from arrow.error import ArrowException + +from arrow.scalar import ArrayValue, NA, Scalar + +from arrow.schema import (null, bool_, + int8, int16, int32, int64, + uint8, uint16, uint32, uint64, + float_, double, string, + list_, struct, field, + DataType, Field, Schema) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd new file mode 100644 index 00000000000..e32d27769b5 --- /dev/null +++ b/python/arrow/array.pxd @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
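The new `arrow/__init__.py` above pulls the array, error, scalar, and schema entry points into one namespace. A minimal smoke test of that surface (a sketch, assuming the extension modules introduced in this patch have been built and are importable):

```python
import arrow

# The element type is inferred from the values; see the builtin
# conversion adapter and its tests later in this patch.
arr = arrow.from_pylist([1, None, 3])

assert len(arr) == 3
assert arr.null_count == 1
assert arr.type == arrow.int64()
assert arr[1] is arrow.NA  # null slots come back as the NA singleton
```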
+ +from arrow.includes.common cimport shared_ptr +from arrow.includes.arrow cimport CArray, LogicalType + +from arrow.scalar import NA + +from arrow.schema cimport DataType + +cdef extern from "Python.h": + int PySlice_Check(object) + +cdef class Array: + cdef: + shared_ptr[CArray] sp_array + CArray* ap + + cdef readonly: + DataType type + + cdef init(self, const shared_ptr[CArray]& sp_array) + cdef _getitem(self, int i) + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass + + +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class ListArray(Array): + pass + + +cdef class StringArray(Array): + pass diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx new file mode 100644 index 00000000000..3a3210d6cc1 --- /dev/null +++ b/python/arrow/array.pyx @@ -0,0 +1,179 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from arrow.includes.arrow cimport * +cimport arrow.includes.pyarrow as pyarrow + +from arrow.compat import frombytes, tobytes +from arrow.error cimport check_status + +from arrow.scalar import NA + +def total_allocated_bytes(): + cdef MemoryPool* pool = pyarrow.GetMemoryPool() + return pool.bytes_allocated() + + +cdef class Array: + + cdef init(self, const shared_ptr[CArray]& sp_array): + self.sp_array = sp_array + self.ap = sp_array.get() + self.type = DataType() + self.type.init(self.sp_array.get().type()) + + property null_count: + + def __get__(self): + return self.sp_array.get().null_count() + + def __len__(self): + return self.sp_array.get().length() + + def isnull(self): + raise NotImplemented + + def __getitem__(self, key): + cdef: + Py_ssize_t n = len(self) + + if PySlice_Check(key): + start = key.start or 0 + while start < 0: + start += n + + stop = key.stop if key.stop is not None else n + while stop < 0: + stop += n + + step = key.step or 1 + if step != 1: + raise NotImplementedError + else: + return self.slice(start, stop) + + while key < 0: + key += len(self) + + if self.ap.IsNull(key): + return NA + else: + return self._getitem(key) + + cdef _getitem(self, int i): + raise NotImplementedError + + def slice(self, start, end): + pass + + +cdef class NullArray(Array): + pass + + +cdef class BooleanArray(Array): + pass + + +cdef class NumericArray(Array): + pass + + +cdef class Int8Array(NumericArray): + pass + + +cdef class UInt8Array(NumericArray): + pass + + +cdef class Int16Array(NumericArray): + pass + + +cdef class UInt16Array(NumericArray): + pass + + +cdef class Int32Array(NumericArray): + pass + + +cdef class UInt32Array(NumericArray): + pass + + +cdef class Int64Array(NumericArray): + pass + + +cdef class UInt64Array(NumericArray): + pass + + +cdef class FloatArray(NumericArray): + pass + + +cdef class DoubleArray(NumericArray): + pass + + +cdef class ListArray(Array): + pass + + +cdef class StringArray(Array): + pass + + +cdef dict _array_classes = { + LogicalType_NA: NullArray, + LogicalType_BOOL: BooleanArray, + LogicalType_INT64: Int64Array, + LogicalType_DOUBLE: DoubleArray, + LogicalType_LIST: ListArray, + LogicalType_STRING: StringArray, +} + +cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): + if sp_array.get() == NULL: + raise ValueError('Array was NULL') + + cdef CDataType* data_type = sp_array.get().type().get() + + if data_type == NULL: + raise ValueError('Array data type was NULL') + + cdef Array arr = _array_classes[data_type.type]() + arr.init(sp_array) + return arr + + +def from_pylist(object list_obj, type=None): + """ + Convert Python list to Arrow array + """ + cdef: + shared_ptr[CArray] sp_array + + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + return box_arrow_array(sp_array) diff --git a/python/arrow/config.pyx b/python/arrow/config.pyx index 8f10beb3a2e..521bc066cd4 100644 --- a/python/arrow/config.pyx +++ b/python/arrow/config.pyx @@ -2,7 +2,7 @@ # distutils: language = c++ # cython: embedsignature = True -cdef extern from 'pyarrow/init.h' namespace 'arrow::py': +cdef extern from 'pyarrow/init.h' namespace 'pyarrow': void pyarrow_init() pyarrow_init() diff --git a/python/arrow/error.pxd b/python/arrow/error.pxd new file mode 100644 index 00000000000..c18cb3efffc --- /dev/null +++ b/python/arrow/error.pxd @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.pyarrow cimport * + +cdef check_status(const Status& status) diff --git a/python/arrow/error.pyx b/python/arrow/error.pyx new file mode 100644 index 00000000000..f1d51635881 --- /dev/null +++ b/python/arrow/error.pyx @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport c_string + +from arrow.compat import frombytes + +class ArrowException(Exception): + pass + +cdef check_status(const Status& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + raise ArrowException(frombytes(c_message)) diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index 3635ceb8685..fde5de91091 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -20,4 +20,77 @@ from arrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: - pass + + enum LogicalType" arrow::LogicalType::type": + LogicalType_NA" arrow::LogicalType::NA" + + LogicalType_BOOL" arrow::LogicalType::BOOL" + + LogicalType_UINT8" arrow::LogicalType::UINT8" + LogicalType_INT8" arrow::LogicalType::INT8" + LogicalType_UINT16" arrow::LogicalType::UINT16" + LogicalType_INT16" arrow::LogicalType::INT16" + LogicalType_UINT32" arrow::LogicalType::UINT32" + LogicalType_INT32" arrow::LogicalType::INT32" + LogicalType_UINT64" arrow::LogicalType::UINT64" + LogicalType_INT64" arrow::LogicalType::INT64" + + LogicalType_FLOAT" arrow::LogicalType::FLOAT" + LogicalType_DOUBLE" arrow::LogicalType::DOUBLE" + + LogicalType_STRING" arrow::LogicalType::STRING" + + LogicalType_LIST" arrow::LogicalType::LIST" + LogicalType_STRUCT" arrow::LogicalType::STRUCT" + + cdef cppclass CDataType" arrow::DataType": + LogicalType type + c_bool nullable + + c_bool Equals(const CDataType* other) + + c_string ToString() + + cdef cppclass MemoryPool" arrow::MemoryPool": + int64_t bytes_allocated() + + cdef cppclass CListType" arrow::ListType"(CDataType): + CListType(const shared_ptr[CDataType]& value_type, + c_bool nullable) + + cdef cppclass CStringType" arrow::StringType"(CDataType): + pass + 
+ cdef cppclass CField" arrow::Field": + c_string name + shared_ptr[CDataType] type + + CField(const c_string& name, const shared_ptr[CDataType]& type) + + cdef cppclass CStructType" arrow::StructType"(CDataType): + CStructType(const vector[shared_ptr[CField]]& fields, + c_bool nullable) + + cdef cppclass CSchema" arrow::Schema": + CSchema(const shared_ptr[CField]& fields) + + cdef cppclass CArray" arrow::Array": + const shared_ptr[CDataType]& type() + + int32_t length() + int32_t null_count() + LogicalType logical_type() + + c_bool IsNull(int i) + + cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): + pass + + cdef cppclass CInt8Array" arrow::Int8Array"(CArray): + pass + + cdef cppclass CListArray" arrow::ListArray"(CArray): + pass + + cdef cppclass CStringArray" arrow::StringArray"(CListArray): + pass diff --git a/python/arrow/includes/common.pxd b/python/arrow/includes/common.pxd index f2fc826625e..839427a6990 100644 --- a/python/arrow/includes/common.pxd +++ b/python/arrow/includes/common.pxd @@ -19,7 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool -from libcpp.string cimport string +from libcpp.string cimport string as c_string from libcpp.vector cimport vector # This must be included for cerr and other things to work @@ -29,6 +29,8 @@ cdef extern from "": cdef extern from "" namespace "std" nogil: cdef cppclass shared_ptr[T]: + shared_ptr() + shared_ptr(T*) T* get() void reset() void reset(T* p) diff --git a/python/arrow/includes/pyarrow.pxd b/python/arrow/includes/pyarrow.pxd index dcef663f389..3eed5b85424 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/arrow/includes/pyarrow.pxd @@ -18,6 +18,28 @@ # distutils: language = c++ from arrow.includes.common cimport * +from arrow.includes.arrow cimport (CArray, CDataType, LogicalType, + MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: - pass + # We can later add more of the common status factory methods as needed + cdef Status Status_OK "Status::OK"() + + cdef cppclass Status: + Status() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsTypeError() + c_bool IsIOError() + c_bool IsValueError() + c_bool IsNotImplemented() + c_bool IsArrowError() + + shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) + Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + + MemoryPool* GetMemoryPool() diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd new file mode 100644 index 00000000000..e193c09cd69 --- /dev/null +++ b/python/arrow/scalar.pxd @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
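`pyarrow.pxd` above exposes the C++ `Status` wrapper to Cython; combined with `check_status` from `error.pyx` earlier in the patch, any non-OK status surfaces as an `ArrowException`. A sketch of that flow from the user's side (the exact message text is illustrative):

```python
import arrow

try:
    # Mixed nesting levels fail SeqVisitor::Validate during type inference
    arrow.from_pylist([1, 2, [3]])
except arrow.ArrowException as exc:
    print(exc)  # e.g. "Value error: Mixed nesting levels not supported"
```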
+ +from arrow.includes.common cimport * +from arrow.includes.arrow cimport CArray, CListArray + +from arrow.schema cimport DataType + +cdef class Scalar: + cdef readonly: + DataType type + + +cdef class NAType(Scalar): + pass + + +cdef class ArrayValue(Scalar): + cdef: + shared_ptr[CArray] array + int index + + +cdef class Int8Value(ArrayValue): + pass + + +cdef class ListValue(ArrayValue): + pass + + +cdef class StringValue(ArrayValue): + pass diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx new file mode 100644 index 00000000000..78dadecf9b4 --- /dev/null +++ b/python/arrow/scalar.pyx @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import arrow.schema as schema + +cdef class NAType(Scalar): + + def __cinit__(self): + self.type = schema.null() + + def __repr__(self): + return 'NA' + +NA = NAType() diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd new file mode 100644 index 00000000000..487c246f44a --- /dev/null +++ b/python/arrow/schema.pxd @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.includes.common cimport shared_ptr +from arrow.includes.arrow cimport CDataType, CField, CSchema + +cdef class DataType: + cdef: + shared_ptr[CDataType] sp_type + CDataType* type + + cdef init(self, const shared_ptr[CDataType]& type) + +cdef class Field: + cdef: + shared_ptr[CField] sp_field + CField* field + + cdef readonly: + DataType type + +cdef class Schema: + cdef: + shared_ptr[CSchema] sp_schema + CSchema* schema diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx new file mode 100644 index 00000000000..63cd6e888ab --- /dev/null +++ b/python/arrow/schema.pyx @@ -0,0 +1,150 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +######################################## +# Data types, fields, schemas, and so forth + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from arrow.compat import frombytes, tobytes +from arrow.includes.arrow cimport * +cimport arrow.includes.pyarrow as pyarrow + +cimport cpython + +cdef class DataType: + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CDataType]& type): + self.sp_type = type + self.type = type.get() + + def __str__(self): + return frombytes(self.type.ToString()) + + def __repr__(self): + return 'DataType({0})'.format(str(self)) + + def __richcmp__(DataType self, DataType other, int op): + if op == cpython.Py_EQ: + return self.type.Equals(other.type) + elif op == cpython.Py_NE: + return not self.type.Equals(other.type) + else: + raise TypeError('Invalid comparison') + + +cdef class Field: + + def __cinit__(self, object name, DataType type): + self.type = type + self.sp_field.reset(new CField(tobytes(name), type.sp_type)) + self.field = self.sp_field.get() + + def __repr__(self): + return 'Field({0!r}, type={1})'.format(self.name, str(self.type)) + + property name: + + def __get__(self): + return frombytes(self.field.name) + +cdef dict _type_cache = {} + +cdef DataType primitive_type(LogicalType type, bint nullable=True): + if (type, nullable) in _type_cache: + return _type_cache[type, nullable] + + cdef DataType out = DataType() + out.init(pyarrow.GetPrimitiveType(type, nullable)) + + _type_cache[type, nullable] = out + return out + +#------------------------------------------------------------ +# Type factory functions + +def field(name, type): + return Field(name, type) + +def null(): + return primitive_type(LogicalType_NA) + +def bool_(c_bool nullable=True): + return primitive_type(LogicalType_BOOL, nullable) + +def uint8(c_bool nullable=True): + return primitive_type(LogicalType_UINT8, nullable) + +def int8(c_bool nullable=True): + return primitive_type(LogicalType_INT8, nullable) + +def uint16(c_bool nullable=True): + return primitive_type(LogicalType_UINT16, nullable) + +def int16(c_bool nullable=True): + return primitive_type(LogicalType_INT16, nullable) + +def uint32(c_bool nullable=True): + return primitive_type(LogicalType_UINT32, nullable) + +def int32(c_bool nullable=True): + return primitive_type(LogicalType_INT32, nullable) + +def uint64(c_bool nullable=True): + return primitive_type(LogicalType_UINT64, nullable) + +def int64(c_bool nullable=True): + return primitive_type(LogicalType_INT64, nullable) + +def float_(c_bool nullable=True): + return primitive_type(LogicalType_FLOAT, nullable) + +def double(c_bool nullable=True): + return primitive_type(LogicalType_DOUBLE, nullable) + +def string(c_bool nullable=True): + """ + UTF8 string + """ + return primitive_type(LogicalType_STRING, nullable) + +def list_(DataType value_type, c_bool nullable=True): + cdef DataType out = DataType() + out.init(shared_ptr[CDataType]( + new 
CListType(value_type.sp_type, nullable))) + return out + +def struct(fields, c_bool nullable=True): + """ + + """ + cdef: + DataType out = DataType() + Field field + vector[shared_ptr[CField]] c_fields + + for field in fields: + c_fields.push_back(field.sp_field) + + out.init(shared_ptr[CDataType]( + new CStructType(c_fields, nullable))) + return out diff --git a/python/arrow/tests/test_array.py b/python/arrow/tests/test_array.py new file mode 100644 index 00000000000..8eaa5335206 --- /dev/null +++ b/python/arrow/tests/test_array.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from arrow.compat import unittest +import arrow + + +class TestArrayAPI(unittest.TestCase): + + def test_getitem_NA(self): + arr = arrow.from_pylist([1, None, 2]) + assert arr[1] is arrow.NA diff --git a/python/arrow/tests/test_convert_builtin.py b/python/arrow/tests/test_convert_builtin.py new file mode 100644 index 00000000000..57e6ab9f0e7 --- /dev/null +++ b/python/arrow/tests/test_convert_builtin.py @@ -0,0 +1,85 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
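The factory functions in `schema.pyx` above mirror the C++ type constructors, and `primitive_type` memoizes boxed types on the `(type, nullable)` pair. A short usage sketch, consistent with the assertions in `test_schema.py` later in this patch:

```python
import arrow

# Memoization: repeated calls hand back the same boxed DataType
assert arrow.int32() is arrow.int32()

# Non-nullable types render with a "not null" suffix
assert str(arrow.int8(False)) == 'int8 not null'

# Composite types build up from fields
f0 = arrow.field('f0', arrow.string())
f1 = arrow.field('f1', arrow.list_(arrow.int64()))
struct_type = arrow.struct([f0, f1])
```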
+ +from arrow.compat import unittest +import arrow + + +class TestConvertList(unittest.TestCase): + + def test_boolean(self): + pass + + def test_empty_list(self): + arr = arrow.from_pylist([]) + assert len(arr) == 0 + assert arr.null_count == 0 + assert arr.type == arrow.null() + + def test_all_none(self): + arr = arrow.from_pylist([None, None]) + assert len(arr) == 2 + assert arr.null_count == 2 + assert arr.type == arrow.null() + + def test_integer(self): + arr = arrow.from_pylist([1, None, 3, None]) + assert len(arr) == 4 + assert arr.null_count == 2 + assert arr.type == arrow.int64() + + def test_garbage_collection(self): + import gc + bytes_before = arrow.total_allocated_bytes() + arrow.from_pylist([1, None, 3, None]) + gc.collect() + assert arrow.total_allocated_bytes() == bytes_before + + def test_double(self): + data = [1.5, 1, None, 2.5, None, None] + arr = arrow.from_pylist(data) + assert len(arr) == 6 + assert arr.null_count == 3 + assert arr.type == arrow.double() + + def test_string(self): + data = ['foo', b'bar', None, 'arrow'] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.string() + + def test_mixed_nesting_levels(self): + arrow.from_pylist([1, 2, None]) + arrow.from_pylist([[1], [2], None]) + arrow.from_pylist([[1], [2], [None]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, [1]]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([1, 2, []]) + + with self.assertRaises(arrow.ArrowException): + arrow.from_pylist([[1], [2], [None, [1]]]) + + def test_list_of_int(self): + data = [[1, 2, 3], [], None, [1, 2]] + arr = arrow.from_pylist(data) + assert len(arr) == 4 + assert arr.null_count == 1 + assert arr.type == arrow.list_(arrow.int64()) diff --git a/python/arrow/tests/test_schema.py b/python/arrow/tests/test_schema.py new file mode 100644 index 00000000000..a89edd74a0a --- /dev/null +++ b/python/arrow/tests/test_schema.py @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
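The `TestConvertList` cases above pin down the deliberately coarse inference rules in `ScalarVisitor::GetType`: any float promotes the whole sequence to double, otherwise integers widen to int64, strings map to string, and all-None input yields the null type. Condensed (assuming a built extension):

```python
import arrow

assert arrow.from_pylist([1, 2, None]).type == arrow.int64()
assert arrow.from_pylist([1, 2.5, None]).type == arrow.double()  # float wins
assert arrow.from_pylist(['foo', b'bar']).type == arrow.string()
assert arrow.from_pylist([None, None]).type == arrow.null()
```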
+ +from arrow.compat import unittest +import arrow + + +class TestTypes(unittest.TestCase): + + def test_integers(self): + dtypes = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64'] + + for name in dtypes: + factory = getattr(arrow, name) + t = factory() + t_required = factory(False) + + assert str(t) == name + assert str(t_required) == '{0} not null'.format(name) + + def test_list(self): + value_type = arrow.int32() + list_type = arrow.list_(value_type) + assert str(list_type) == 'list' + + def test_string(self): + t = arrow.string() + assert str(t) == 'string' + + def test_field(self): + t = arrow.string() + f = arrow.field('foo', t) + + assert f.name == 'foo' + assert f.type is t + assert repr(f) == "Field('foo', type=string)" diff --git a/python/setup.py b/python/setup.py index f6b0a4bee83..9a0de071a9c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -124,7 +124,10 @@ def _run_cmake(self): static_lib_option, source] self.spawn(cmake_command) - self.spawn(['make']) + args = ['make'] + if 'PYARROW_PARALLEL' in os.environ: + args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL'])) + self.spawn(args) else: import shlex cmake_generator = 'Visual Studio 14 2015' @@ -207,7 +210,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['config', 'parquet'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc new file mode 100644 index 00000000000..ae84fa12b0d --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.cc @@ -0,0 +1,415 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
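The `setup.py` change above threads an optional parallelism knob through to `make`; the intended invocation is along the lines of `PYARROW_PARALLEL=8 python setup.py build_ext --inplace`. The argument construction reduces to:

```python
import os

args = ['make']
if 'PYARROW_PARALLEL' in os.environ:
    args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL']))
# PYARROW_PARALLEL=8 -> ['make', '-j8']; unset -> plain ['make']
```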
+ +#include +#include + +#include "pyarrow/adapters/builtin.h" + +#include + +#include "pyarrow/status.h" + +using arrow::ArrayBuilder; +using arrow::DataType; +using arrow::LogicalType; + +namespace pyarrow { + +static inline bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + +static inline bool IsPyBaseString(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyString_Check(obj) || PyUnicode_Check(obj); +#else + return PyUnicode_Check(obj); +#endif +} + +class ScalarVisitor { + public: + ScalarVisitor() : + total_count_(0), + none_count_(0), + bool_count_(0), + int_count_(0), + float_count_(0), + string_count_(0) {} + + void Visit(PyObject* obj) { + ++total_count_; + if (obj == Py_None) { + ++none_count_; + } else if (PyFloat_Check(obj)) { + ++float_count_; + } else if (IsPyInteger(obj)) { + ++int_count_; + } else if (IsPyBaseString(obj)) { + ++string_count_; + } else { + // TODO(wesm): accumulate error information somewhere + } + } + + std::shared_ptr GetType() { + // TODO(wesm): handling mixed-type cases + if (float_count_) { + return arrow::DOUBLE; + } else if (int_count_) { + // TODO(wesm): tighter type later + return arrow::INT64; + } else if (bool_count_) { + return arrow::BOOL; + } else if (string_count_) { + return arrow::STRING; + } else { + return arrow::NA; + } + } + + int64_t total_count() const { + return total_count_; + } + + private: + int64_t total_count_; + int64_t none_count_; + int64_t bool_count_; + int64_t int_count_; + int64_t float_count_; + int64_t string_count_; + + // Place to accumulate errors + // std::vector errors_; +}; + +static constexpr int MAX_NESTING_LEVELS = 32; + +class SeqVisitor { + public: + SeqVisitor() : + max_nesting_level_(0) { + memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); + } + + Status Visit(PyObject* obj, int level=0) { + Py_ssize_t size = PySequence_Size(obj); + + if (level > max_nesting_level_) { + max_nesting_level_ = level; + } + + for (int64_t i = 0; i < size; ++i) { + // TODO(wesm): Error checking? + // TODO(wesm): Specialize for PyList_GET_ITEM? 
+ OwnedRef item_ref(PySequence_GetItem(obj, i)); + PyObject* item = item_ref.obj(); + + if (PyList_Check(item)) { + PY_RETURN_NOT_OK(Visit(item, level + 1)); + } else if (PyDict_Check(item)) { + return Status::NotImplemented("No type inference for dicts"); + } else { + // We permit nulls at any level of nesting + if (item == Py_None) { + // TODO + } else { + ++nesting_histogram_[level]; + scalars_.Visit(item); + } + } + } + return Status::OK(); + } + + std::shared_ptr GetType() { + if (scalars_.total_count() == 0) { + if (max_nesting_level_ == 0) { + return arrow::NA; + } else { + return nullptr; + } + } else { + std::shared_ptr result = scalars_.GetType(); + for (int i = 0; i < max_nesting_level_; ++i) { + result = std::make_shared(result); + } + return result; + } + } + + Status Validate() const { + if (scalars_.total_count() > 0) { + if (num_nesting_levels() > 1) { + return Status::ValueError("Mixed nesting levels not supported"); + } else if (max_observed_level() < max_nesting_level_) { + return Status::ValueError("Mixed nesting levels not supported"); + } + } + return Status::OK(); + } + + int max_observed_level() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + result = i; + } + } + return result; + } + + int num_nesting_levels() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { + ++result; + } + } + return result; + } + + private: + ScalarVisitor scalars_; + + // Track observed + int max_nesting_level_; + int nesting_histogram_[MAX_NESTING_LEVELS]; +}; + +// Non-exhaustive type inference +static Status InferArrowType(PyObject* obj, int64_t* size, + std::shared_ptr* out_type) { + *size = PySequence_Size(obj); + if (PyErr_Occurred()) { + // Not a sequence + PyErr_Clear(); + return Status::TypeError("Object is not a sequence"); + } + + // For 0-length sequences, refuse to guess + if (*size == 0) { + *out_type = arrow::NA; + } + + SeqVisitor seq_visitor; + PY_RETURN_NOT_OK(seq_visitor.Visit(obj)); + PY_RETURN_NOT_OK(seq_visitor.Validate()); + + *out_type = seq_visitor.GetType(); + return Status::OK(); +} + +// Marshal Python sequence (list, tuple, etc.) 
to Arrow array
+class SeqConverter {
+ public:
+  virtual Status Init(const std::shared_ptr<ArrayBuilder>& builder) {
+    builder_ = builder;
+    return Status::OK();
+  }
+
+  virtual Status AppendData(PyObject* seq) = 0;
+
+ protected:
+  std::shared_ptr<ArrayBuilder> builder_;
+};
+
+template <typename BuilderType>
+class TypedConverter : public SeqConverter {
+ public:
+  Status Init(const std::shared_ptr<ArrayBuilder>& builder) override {
+    builder_ = builder;
+    typed_builder_ = static_cast<BuilderType*>(builder.get());
+    return Status::OK();
+  }
+
+ protected:
+  BuilderType* typed_builder_;
+};
+
+class BoolConverter : public TypedConverter<arrow::BooleanBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {
+    return Status::OK();
+  }
+};
+
+class Int64Converter : public TypedConverter<arrow::Int64Builder> {
+ public:
+  Status AppendData(PyObject* seq) override {
+    int64_t val;
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        val = PyLong_AsLongLong(item.obj());
+        RETURN_IF_PYERROR();
+        RETURN_ARROW_NOT_OK(typed_builder_->Append(val));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {
+    double val;
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        val = PyFloat_AsDouble(item.obj());
+        RETURN_IF_PYERROR();
+        RETURN_ARROW_NOT_OK(typed_builder_->Append(val));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+class StringConverter : public TypedConverter<arrow::StringBuilder> {
+ public:
+  Status AppendData(PyObject* seq) override {
+    PyObject* item;
+    PyObject* bytes_obj;
+    OwnedRef tmp;
+    const char* bytes;
+    int32_t length;
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      item = PySequence_GetItem(seq, i);
+      OwnedRef holder(item);
+
+      if (item == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+        continue;
+      } else if (PyUnicode_Check(item)) {
+        tmp.reset(PyUnicode_AsUTF8String(item));
+        RETURN_IF_PYERROR();
+        bytes_obj = tmp.obj();
+      } else if (PyBytes_Check(item)) {
+        bytes_obj = item;
+      } else {
+        return Status::TypeError("Non-string value encountered");
+      }
+      // No error checking
+      length = PyBytes_GET_SIZE(bytes_obj);
+      bytes = PyBytes_AS_STRING(bytes_obj);
+      RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length));
+    }
+    return Status::OK();
+  }
+};
+
+class ListConverter : public TypedConverter<arrow::ListBuilder> {
+ public:
+  Status Init(const std::shared_ptr<ArrayBuilder>& builder) override;
+
+  Status AppendData(PyObject* seq) override {
+    Py_ssize_t size = PySequence_Size(seq);
+    for (int64_t i = 0; i < size; ++i) {
+      OwnedRef item(PySequence_GetItem(seq, i));
+      if (item.obj() == Py_None) {
+        RETURN_ARROW_NOT_OK(typed_builder_->AppendNull());
+      } else {
+        typed_builder_->Append();
+        PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj()));
+      }
+    }
+    return Status::OK();
+  }
+ protected:
+  std::shared_ptr<SeqConverter> value_converter_;
+};
+
+// Dynamic constructor for sequence converters
+std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) {
+  switch (type->type) {
+    case LogicalType::BOOL:
+      return std::make_shared<BoolConverter>();
+    case LogicalType::INT64:
+      return std::make_shared<Int64Converter>();
+    case LogicalType::DOUBLE:
+      return std::make_shared<DoubleConverter>();
+    case LogicalType::STRING:
+      return std::make_shared<StringConverter>();
+    case LogicalType::LIST:
+      return std::make_shared<ListConverter>();
+    case LogicalType::STRUCT:
+
default: + return nullptr; + break; + } +} + +Status ListConverter::Init(const std::shared_ptr& builder) { + builder_ = builder; + typed_builder_ = static_cast(builder.get()); + + value_converter_ = GetConverter(static_cast( + builder->type().get())->value_type); + if (value_converter_ == nullptr) { + return Status::NotImplemented("value type not implemented"); + } + + value_converter_->Init(typed_builder_->value_builder()); + return Status::OK(); +} + +Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { + std::shared_ptr type; + int64_t size; + PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + + // Handle NA / NullType case + if (type->type == LogicalType::NA) { + out->reset(new arrow::Array(type, size, size)); + return Status::OK(); + } + + std::shared_ptr converter = GetConverter(type); + if (converter == nullptr) { + std::stringstream ss; + ss << "No type converter implemented for " + << type->ToString(); + return Status::NotImplemented(ss.str()); + } + + // Give the sequence converter an array builder + std::shared_ptr builder; + RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); + converter->Init(builder); + + PY_RETURN_NOT_OK(converter->AppendData(obj)); + + *out = builder->Finish(); + + return Status::OK(); +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h new file mode 100644 index 00000000000..24886f4970d --- /dev/null +++ b/python/src/pyarrow/adapters/builtin.h @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef PYARROW_ADAPTERS_BUILTIN_H +#define PYARROW_ADAPTERS_BUILTIN_H + +#include + +#include + +#include "pyarrow/common.h" + +namespace arrow { class Array; } + +namespace pyarrow { + +class Status; + +Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); + +} // namespace pyarrow + +#endif // PYARROW_ADAPTERS_BUILTIN_H diff --git a/cpp/src/arrow/field.cc b/python/src/pyarrow/adapters/pandas.h similarity index 76% rename from cpp/src/arrow/field.cc rename to python/src/pyarrow/adapters/pandas.h index 4568d905c29..a4f41638087 100644 --- a/cpp/src/arrow/field.cc +++ b/python/src/pyarrow/adapters/pandas.h @@ -15,17 +15,14 @@ // specific language governing permissions and limitations // under the License. 
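`ConvertPySequence` above is the driver behind `arrow.from_pylist`: infer a type, special-case the all-null array, otherwise pair a converter with an `ArrayBuilder` and finish. Nested input exercises the recursive `ListConverter` path; a sketch matching `test_list_of_int`:

```python
import arrow

arr = arrow.from_pylist([[1, 2, 3], [], None, [1, 2]])

assert arr.type == arrow.list_(arrow.int64())  # value type inferred recursively
assert len(arr) == 4
assert arr.null_count == 1  # only the top-level None counts as null
```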
-#include "arrow/field.h" +// Functions for converting between pandas's NumPy-based data representation +// and Arrow data structures -#include -#include +#ifndef PYARROW_ADAPTERS_PANDAS_H +#define PYARROW_ADAPTERS_PANDAS_H -namespace arrow { +namespace pyarrow { -std::string Field::ToString() const { - std::stringstream ss; - ss << this->name << " " << this->type->ToString(); - return ss.str(); -} +} // namespace pyarrow -} // namespace arrow +#endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index c2285de77bf..72be6afe02c 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -18,4 +18,11 @@ #ifndef PYARROW_API_H #define PYARROW_API_H +#include "pyarrow/status.h" + +#include "pyarrow/helpers.h" + +#include "pyarrow/adapters/builtin.h" +#include "pyarrow/adapters/pandas.h" + #endif // PYARROW_API_H diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc new file mode 100644 index 00000000000..a2748f99b67 --- /dev/null +++ b/python/src/pyarrow/common.cc @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/common.h" + +#include +#include +#include + +#include +#include + +#include "pyarrow/status.h" + +namespace pyarrow { + +class PyArrowMemoryPool : public arrow::MemoryPool { + public: + PyArrowMemoryPool() : bytes_allocated_(0) {} + virtual ~PyArrowMemoryPool() {} + + arrow::Status Allocate(int64_t size, uint8_t** out) override { + std::lock_guard guard(pool_lock_); + *out = static_cast(std::malloc(size)); + if (*out == nullptr) { + std::stringstream ss; + ss << "malloc of size " << size << " failed"; + return arrow::Status::OutOfMemory(ss.str()); + } + + bytes_allocated_ += size; + + return arrow::Status::OK(); + } + + int64_t bytes_allocated() const override { + std::lock_guard guard(pool_lock_); + return bytes_allocated_; + } + + void Free(uint8_t* buffer, int64_t size) override { + std::lock_guard guard(pool_lock_); + std::free(buffer); + bytes_allocated_ -= size; + } + + private: + mutable std::mutex pool_lock_; + int64_t bytes_allocated_; +}; + +arrow::MemoryPool* GetMemoryPool() { + static PyArrowMemoryPool memory_pool; + return &memory_pool; +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h new file mode 100644 index 00000000000..a43e4d28c89 --- /dev/null +++ b/python/src/pyarrow/common.h @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_COMMON_H +#define PYARROW_COMMON_H + +#include + +namespace arrow { class MemoryPool; } + +namespace pyarrow { + +#define PYARROW_IS_PY2 PY_MAJOR_VERSION < 2 + +#define RETURN_ARROW_NOT_OK(s) do { \ + arrow::Status _s = (s); \ + if (!_s.ok()) { \ + return Status::ArrowError(s.ToString()); \ + } \ + } while (0); + +class OwnedRef { + public: + OwnedRef() : obj_(nullptr) {} + + OwnedRef(PyObject* obj) : + obj_(obj) {} + + ~OwnedRef() { + Py_XDECREF(obj_); + } + + void reset(PyObject* obj) { + if (obj_ != nullptr) { + Py_XDECREF(obj_); + } + obj_ = obj; + } + + PyObject* obj() const{ + return obj_; + } + + private: + PyObject* obj_; +}; + +struct PyObjectStringify { + OwnedRef tmp_obj; + const char* bytes; + + PyObjectStringify(PyObject* obj) { + PyObject* bytes_obj; + if (PyUnicode_Check(obj)) { + bytes_obj = PyUnicode_AsUTF8String(obj); + tmp_obj.reset(bytes_obj); + } else { + bytes_obj = obj; + } + bytes = PyBytes_AsString(bytes_obj); + } +}; + +// TODO(wesm): We can just let errors pass through. To be explored later +#define RETURN_IF_PYERROR() \ + if (PyErr_Occurred()) { \ + PyObject *exc_type, *exc_value, *traceback; \ + PyErr_Fetch(&exc_type, &exc_value, &traceback); \ + PyObjectStringify stringified(exc_value); \ + std::string message(stringified.bytes); \ + Py_DECREF(exc_type); \ + Py_DECREF(exc_value); \ + Py_DECREF(traceback); \ + return Status::UnknownError(message); \ + } + +arrow::MemoryPool* GetMemoryPool(); + +} // namespace pyarrow + +#endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc new file mode 100644 index 00000000000..d0969dacc21 --- /dev/null +++ b/python/src/pyarrow/helpers.cc @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
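`PyArrowMemoryPool` above guards a running byte count with a mutex, and `arrow.total_allocated_bytes()` reads it back, which makes allocator-level leak checks in tests cheap. The pattern used by `test_garbage_collection`:

```python
import gc
import arrow

before = arrow.total_allocated_bytes()
arrow.from_pylist([1, None, 3, None])  # result dropped immediately
gc.collect()
# PoolBuffer::~PoolBuffer returns its allocation to the pool
assert arrow.total_allocated_bytes() == before
```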
+ +#include "pyarrow/helpers.h" + +#include + +using namespace arrow; + +namespace pyarrow { + +#define GET_PRIMITIVE_TYPE(NAME, Type) \ + case LogicalType::NAME: \ + if (nullable) { \ + return NAME; \ + } else { \ + return std::make_shared(nullable); \ + } \ + break; + +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable) { + switch (type) { + case LogicalType::NA: + return NA; + GET_PRIMITIVE_TYPE(UINT8, UInt8Type); + GET_PRIMITIVE_TYPE(INT8, Int8Type); + GET_PRIMITIVE_TYPE(UINT16, UInt16Type); + GET_PRIMITIVE_TYPE(INT16, Int16Type); + GET_PRIMITIVE_TYPE(UINT32, UInt32Type); + GET_PRIMITIVE_TYPE(INT32, Int32Type); + GET_PRIMITIVE_TYPE(UINT64, UInt64Type); + GET_PRIMITIVE_TYPE(INT64, Int64Type); + GET_PRIMITIVE_TYPE(BOOL, BooleanType); + GET_PRIMITIVE_TYPE(FLOAT, FloatType); + GET_PRIMITIVE_TYPE(DOUBLE, DoubleType); + GET_PRIMITIVE_TYPE(STRING, StringType); + default: + return nullptr; + } +} + +} // namespace pyarrow diff --git a/cpp/src/arrow/types/null.h b/python/src/pyarrow/helpers.h similarity index 72% rename from cpp/src/arrow/types/null.h rename to python/src/pyarrow/helpers.h index c67f752d409..1a24f056feb 100644 --- a/cpp/src/arrow/types/null.h +++ b/python/src/pyarrow/helpers.h @@ -15,20 +15,20 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TYPES_NULL_H -#define ARROW_TYPES_NULL_H +#ifndef PYARROW_HELPERS_H +#define PYARROW_HELPERS_H -#include -#include +#include +#include -#include "arrow/type.h" +namespace pyarrow { -namespace arrow { +using arrow::DataType; +using arrow::LogicalType; -struct NullType : public PrimitiveType { - PRIMITIVE_DECL(NullType, void, NA, 0, "null"); -}; +std::shared_ptr GetPrimitiveType(LogicalType::type type, + bool nullable); -} // namespace arrow +} // namespace pyarrow -#endif // ARROW_TYPES_NULL_H +#endif // PYARROW_HELPERS_H diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/init.cc index c36f4137255..acd851e1687 100644 --- a/python/src/pyarrow/init.cc +++ b/python/src/pyarrow/init.cc @@ -17,13 +17,9 @@ #include "pyarrow/init.h" -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init() { } -} // namespace py - -} // namespace arrow +} // namespace pyarrow diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/init.h index 1fc9f101026..71e67a20c1c 100644 --- a/python/src/pyarrow/init.h +++ b/python/src/pyarrow/init.h @@ -18,14 +18,10 @@ #ifndef PYARROW_INIT_H #define PYARROW_INIT_H -namespace arrow { - -namespace py { +namespace pyarrow { void pyarrow_init(); -} // namespace py - -} // namespace arrow +} // namespace pyarrow #endif // PYARROW_INIT_H diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc new file mode 100644 index 00000000000..1cd54f6a785 --- /dev/null +++ b/python/src/pyarrow/status.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. 
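The `pyarrow::Status` implementation that follows packs its state into a single heap allocation: a 4-byte message length, a code byte at offset 4, a 2-byte POSIX code, then the message bytes. A Python sketch of that layout (an illustration only, assuming native endianness to match the raw `memcpy` calls):

```python
import struct

def pack_state(code, msg, posix_code=-1):
    # state_[0..3] = length, state_[4] = code,
    # state_[5..6] = posix code, state_[7..] = message
    data = msg.encode('utf-8')
    return struct.pack('=Ibh', len(data), code, posix_code) + data

def unpack_state(state):
    length, code, posix_code = struct.unpack_from('=Ibh', state)
    return code, posix_code, state[7:7 + length].decode('utf-8')

state = pack_state(6, 'value type not implemented')  # 6 = NotImplemented
assert unpack_state(state) == (6, -1, 'value type not implemented')
```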
+ +#include "pyarrow/status.h" + +#include +#include +#include + +namespace pyarrow { + +Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) { + assert(code != StatusCode::OK); + const uint32_t size = msg.size(); + char* result = new char[size + 7]; + memcpy(result, &size, sizeof(size)); + result[4] = static_cast(code); + memcpy(result + 5, &posix_code, sizeof(posix_code)); + memcpy(result + 7, msg.c_str(), msg.size()); + state_ = result; +} + +const char* Status::CopyState(const char* state) { + uint32_t size; + memcpy(&size, state, sizeof(size)); + char* result = new char[size + 7]; + memcpy(result, state, size + 7); + return result; +} + +std::string Status::CodeAsString() const { + if (state_ == NULL) { + return "OK"; + } + + const char* type; + switch (code()) { + case StatusCode::OK: + type = "OK"; + break; + case StatusCode::OutOfMemory: + type = "Out of memory"; + break; + case StatusCode::KeyError: + type = "Key error"; + break; + case StatusCode::TypeError: + type = "Value error"; + break; + case StatusCode::ValueError: + type = "Value error"; + break; + case StatusCode::IOError: + type = "IO error"; + break; + case StatusCode::NotImplemented: + type = "Not implemented"; + break; + case StatusCode::ArrowError: + type = "Arrow C++ error"; + break; + case StatusCode::UnknownError: + type = "Unknown error"; + break; + } + return std::string(type); +} + +std::string Status::ToString() const { + std::string result(CodeAsString()); + if (state_ == NULL) { + return result; + } + + result.append(": "); + + uint32_t length; + memcpy(&length, state_, sizeof(length)); + result.append(reinterpret_cast(state_ + 7), length); + return result; +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/status.h b/python/src/pyarrow/status.h new file mode 100644 index 00000000000..cb8c8add210 --- /dev/null +++ b/python/src/pyarrow/status.h @@ -0,0 +1,144 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A Status encapsulates the result of an operation. It may indicate success, +// or it may indicate an error with an associated error message. +// +// Multiple threads can invoke const methods on a Status without +// external synchronization, but if any of the threads may call a +// non-const method, all threads accessing the same Status must use +// external synchronization. + +#ifndef PYARROW_STATUS_H_ +#define PYARROW_STATUS_H_ + +#include +#include +#include + +namespace pyarrow { + +#define PY_RETURN_NOT_OK(s) do { \ + Status _s = (s); \ + if (!_s.ok()) return _s; \ + } while (0); + +enum class StatusCode: char { + OK = 0, + OutOfMemory = 1, + KeyError = 2, + TypeError = 3, + ValueError = 4, + IOError = 5, + NotImplemented = 6, + + ArrowError = 7, + + UnknownError = 10 +}; + +class Status { + public: + // Create a success status. + Status() : state_(NULL) { } + ~Status() { delete[] state_; } + + // Copy the specified status. + Status(const Status& s); + void operator=(const Status& s); + + // Return a success status. + static Status OK() { return Status(); } + + // Return error status of an appropriate type. 
+  static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) {
+    return Status(StatusCode::OutOfMemory, msg, posix_code);
+  }
+
+  static Status KeyError(const std::string& msg) {
+    return Status(StatusCode::KeyError, msg, -1);
+  }
+
+  static Status TypeError(const std::string& msg) {
+    return Status(StatusCode::TypeError, msg, -1);
+  }
+
+  static Status IOError(const std::string& msg) {
+    return Status(StatusCode::IOError, msg, -1);
+  }
+
+  static Status ValueError(const std::string& msg) {
+    return Status(StatusCode::ValueError, msg, -1);
+  }
+
+  static Status NotImplemented(const std::string& msg) {
+    return Status(StatusCode::NotImplemented, msg, -1);
+  }
+
+  static Status UnknownError(const std::string& msg) {
+    return Status(StatusCode::UnknownError, msg, -1);
+  }
+
+  static Status ArrowError(const std::string& msg) {
+    return Status(StatusCode::ArrowError, msg, -1);
+  }
+
+  // Returns true iff the status indicates success.
+  bool ok() const { return (state_ == NULL); }
+
+  bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
+  bool IsKeyError() const { return code() == StatusCode::KeyError; }
+  bool IsIOError() const { return code() == StatusCode::IOError; }
+  bool IsTypeError() const { return code() == StatusCode::TypeError; }
+  bool IsValueError() const { return code() == StatusCode::ValueError; }
+
+  bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
+
+  bool IsArrowError() const { return code() == StatusCode::ArrowError; }
+
+  // Return a string representation of this status suitable for printing.
+  // Returns the string "OK" for success.
+  std::string ToString() const;
+
+  // Return a string representation of the status code, without the message
+  // text or posix code information.
+  std::string CodeAsString() const;
+
+  // Get the POSIX code associated with this Status, or -1 if there is none.
+  int16_t posix_code() const;
+
+ private:
+  // OK status has a NULL state_. Otherwise, state_ is a new[] array
+  // of the following form:
+  //    state_[0..3] == length of message
+  //    state_[4]    == code
+  //    state_[5..6] == posix_code
+  //    state_[7..]  == message
+  const char* state_;
+
+  StatusCode code() const {
+    return ((state_ == NULL) ?
+        StatusCode::OK : static_cast<StatusCode>(state_[4]));
+  }
+
+  Status(StatusCode code, const std::string& msg, int16_t posix_code);
+  static const char* CopyState(const char* s);
+};
+
+inline Status::Status(const Status& s) {
+  state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
+}
+
+inline void Status::operator=(const Status& s) {
+  // The following condition catches both aliasing (when this == &s),
+  // and the common case where both s and *this are ok.
+  if (state_ != s.state_) {
+    delete[] state_;
+    state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_);
+  }
+}
+
+} // namespace pyarrow
+
+#endif  // PYARROW_STATUS_H_

From ae95dbd189477442d39e55fb0a1aede206906cd9 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 7 Mar 2016 22:39:07 -0800
Subject: [PATCH 024/210] ARROW-44: Python: prototype object model for array slot values ("scalars")

Non-exhaustive, but this will facilitate inspecting Arrow data while the
library is in development.
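Before the interactive session, one editorial note on how the Status machinery above reaches Python: arrow.error.check_status (cimported in array.pyx later in this patch) presumably converts a non-OK Status into a raised ArrowException. A rough pure-Python sketch of that contract; the PyStatus stand-in is hypothetical:

```python
class ArrowException(Exception):
    pass

class PyStatus:
    # Hypothetical stand-in for the wrapped C++ Status.
    def __init__(self, code=0, message=''):
        self.code = code
        self.message = message

    def ok(self):
        return self.code == 0

    def to_string(self):
        return 'OK' if self.ok() else self.message

def check_status(status):
    # Counterpart of PY_RETURN_NOT_OK: stop and raise instead of returning.
    if not status.ok():
        raise ArrowException(status.to_string())
```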
```python
In [2]: arr = arrow.from_pylist([['foo', None], None, [], ['qux']])

In [3]: arr
Out[3]: <arrow.array.ListArray at 0x...>

In [4]: arr[0]
Out[4]: ['foo', None]

In [5]: type(arr[0])
Out[5]: arrow.scalar.ListValue

In [6]: arr[0][0]
Out[6]: 'foo'

In [7]: arr[0][1]
Out[7]: NA

In [8]: arr[1]
Out[8]: NA

In [9]: arr[2]
Out[9]: []

In [10]: len(arr[2])
Out[10]: 0

In [11]: arr.type
Out[11]: DataType(list)
```

Author: Wes McKinney

Closes #20 from wesm/ARROW-44 and squashes the following commits:

df06ba1 [Wes McKinney] Add tests for scalars proxying implemented Python list type conversions, fix associated bugs
20fbdc1 [Wes McKinney] Draft scalar box types, no tests yet
---
 cpp/src/arrow/types/list.h             |   6 +-
 python/arrow/__init__.py               |   6 +-
 python/arrow/array.pxd                 |   1 -
 python/arrow/array.pyx                 |  17 ++-
 python/arrow/compat.py                 |   6 +
 python/arrow/includes/arrow.pxd        |  36 +++++-
 python/arrow/scalar.pxd                |  25 +++-
 python/arrow/scalar.pyx                | 165 +++++++++++++++++++++++++
 python/arrow/schema.pxd                |   2 +
 python/arrow/schema.pyx                |  14 +++
 python/arrow/tests/test_scalars.py     |  82 ++++++++++++
 python/src/pyarrow/adapters/builtin.cc |   2 +-
 12 files changed, 342 insertions(+), 20 deletions(-)
 create mode 100644 python/arrow/tests/test_scalars.py

diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index f40a8245362..210c76a046c 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -63,7 +63,11 @@ class ListArray : public Array {
 
   // Return a shared pointer in case the requestor desires to share ownership
   // with this array.
-  const ArrayPtr& values() const {return values_;}
+  const std::shared_ptr<Array>& values() const {return values_;}
+
+  const std::shared_ptr<DataType>& value_type() const {
+    return values_->type();
+  }
 
   const int32_t* offsets() const { return offsets_;}
 
diff --git a/python/arrow/__init__.py b/python/arrow/__init__.py
index 3c049b85e8c..3507ea0235a 100644
--- a/python/arrow/__init__.py
+++ b/python/arrow/__init__.py
@@ -24,7 +24,11 @@
 
 from arrow.error import ArrowException
 
-from arrow.scalar import ArrayValue, NA, Scalar
+from arrow.scalar import (ArrayValue, Scalar, NA, NAType,
+                          BooleanValue,
+                          Int8Value, Int16Value, Int32Value, Int64Value,
+                          UInt8Value, UInt16Value, UInt32Value, UInt64Value,
+                          FloatValue, DoubleValue, ListValue, StringValue)
 
 from arrow.schema import (null, bool_,
                           int8, int16, int32, int64,
diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd
index e32d27769b5..04dd8d182bc 100644
--- a/python/arrow/array.pxd
+++ b/python/arrow/array.pxd
@@ -34,7 +34,6 @@ cdef class Array:
         DataType type
 
     cdef init(self, const shared_ptr[CArray]& sp_array)
-    cdef _getitem(self, int i)
 
 
 cdef class BooleanArray(Array):
diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx
index 3a3210d6cc1..8ebd01d1dbe 100644
--- a/python/arrow/array.pyx
+++ b/python/arrow/array.pyx
@@ -25,6 +25,7 @@ cimport arrow.includes.pyarrow as pyarrow
 
 from arrow.compat import frombytes, tobytes
 from arrow.error cimport check_status
+cimport arrow.scalar as scalar
 from arrow.scalar import NA
 
 def total_allocated_bytes():
@@ -73,13 +74,7 @@ cdef class Array:
         while key < 0:
             key += len(self)
 
-        if self.ap.IsNull(key):
-            return NA
-        else:
-            return self._getitem(key)
-
-    cdef _getitem(self, int i):
-        raise NotImplementedError
+        return scalar.box_arrow_scalar(self.type, self.sp_array, key)
 
     def slice(self, start, end):
         pass
@@ -168,12 +163,16 @@ cdef object box_arrow_array(const shared_ptr[CArray]& sp_array):
     return arr
 
 
-def from_pylist(object list_obj, type=None):
+def from_pylist(object list_obj, DataType type=None):
     """
     Convert
Python list to Arrow array """ cdef: shared_ptr[CArray] sp_array - check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + if type is None: + check_status(pyarrow.ConvertPySequence(list_obj, &sp_array)) + else: + raise NotImplementedError + return box_arrow_array(sp_array) diff --git a/python/arrow/compat.py b/python/arrow/compat.py index 2ac41ac8abf..08f0f237967 100644 --- a/python/arrow/compat.py +++ b/python/arrow/compat.py @@ -54,6 +54,9 @@ def dict_values(x): range = xrange long = long + def u(s): + return unicode(s, "unicode_escape") + def tobytes(o): if isinstance(o, unicode): return o.encode('utf8') @@ -73,6 +76,9 @@ def dict_values(x): from decimal import Decimal range = range + def u(s): + return s + def tobytes(o): if isinstance(o, str): return o.encode('utf8') diff --git a/python/arrow/includes/arrow.pxd b/python/arrow/includes/arrow.pxd index fde5de91091..0cc44c06cb6 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/arrow/includes/arrow.pxd @@ -84,13 +84,41 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsNull(int i) cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): - pass + uint8_t Value(int i) cdef cppclass CInt8Array" arrow::Int8Array"(CArray): - pass + int8_t Value(int i) + + cdef cppclass CUInt16Array" arrow::UInt16Array"(CArray): + uint16_t Value(int i) + + cdef cppclass CInt16Array" arrow::Int16Array"(CArray): + int16_t Value(int i) + + cdef cppclass CUInt32Array" arrow::UInt32Array"(CArray): + uint32_t Value(int i) + + cdef cppclass CInt32Array" arrow::Int32Array"(CArray): + int32_t Value(int i) + + cdef cppclass CUInt64Array" arrow::UInt64Array"(CArray): + uint64_t Value(int i) + + cdef cppclass CInt64Array" arrow::Int64Array"(CArray): + int64_t Value(int i) + + cdef cppclass CFloatArray" arrow::FloatArray"(CArray): + float Value(int i) + + cdef cppclass CDoubleArray" arrow::DoubleArray"(CArray): + double Value(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): - pass + const int32_t* offsets() + int32_t offset(int i) + int32_t value_length(int i) + const shared_ptr[CArray]& values() + const shared_ptr[CDataType]& value_type() cdef cppclass CStringArray" arrow::StringArray"(CListArray): - pass + c_string GetString(int i) diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd index e193c09cd69..15cdc956a25 100644 --- a/python/arrow/scalar.pxd +++ b/python/arrow/scalar.pxd @@ -16,7 +16,7 @@ # under the License. from arrow.includes.common cimport * -from arrow.includes.arrow cimport CArray, CListArray +from arrow.includes.arrow cimport * from arrow.schema cimport DataType @@ -31,17 +31,36 @@ cdef class NAType(Scalar): cdef class ArrayValue(Scalar): cdef: - shared_ptr[CArray] array + shared_ptr[CArray] sp_array int index + cdef void init(self, DataType type, + const shared_ptr[CArray]& sp_array, int index) + + cdef void _set_array(self, const shared_ptr[CArray]& sp_array) + cdef class Int8Value(ArrayValue): pass -cdef class ListValue(ArrayValue): +cdef class Int64Value(ArrayValue): pass +cdef class ListValue(ArrayValue): + cdef readonly: + DataType value_type + + cdef: + CListArray* ap + + cdef _getitem(self, int i) + + cdef class StringValue(ArrayValue): pass + +cdef object box_arrow_scalar(DataType type, + const shared_ptr[CArray]& sp_array, + int index) diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx index 78dadecf9b4..951ede28776 100644 --- a/python/arrow/scalar.pyx +++ b/python/arrow/scalar.pyx @@ -15,14 +15,179 @@ # specific language governing permissions and limitations # under the License. 
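The scalar.pyx body that follows guards its NA singleton so a second NAType can never be constructed; the same idiom in plain Python, as a stdlib-only sketch:

```python
NA = None

class NAType(object):
    def __new__(cls):
        # Refuse construction once the module-level singleton exists.
        if NA is not None:
            raise Exception('Cannot create multiple NAType instances')
        return object.__new__(cls)

    def __repr__(self):
        return 'NA'

    def as_py(self):
        return None

NA = NAType()
```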
+from arrow.schema cimport DataType, box_data_type
+
+from arrow.compat import frombytes
 import arrow.schema as schema
 
+NA = None
+
 cdef class NAType(Scalar):
 
     def __cinit__(self):
+        global NA
+        if NA is not None:
+            raise Exception('Cannot create multiple NAType instances')
+
         self.type = schema.null()
 
     def __repr__(self):
         return 'NA'
 
+    def as_py(self):
+        return None
+
 NA = NAType()
+
+
+cdef class ArrayValue(Scalar):
+
+    cdef void init(self, DataType type, const shared_ptr[CArray]& sp_array,
+                   int index):
+        self.type = type
+        self.index = index
+        self._set_array(sp_array)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+
+    def __repr__(self):
+        if hasattr(self, 'as_py'):
+            return repr(self.as_py())
+        else:
+            return Scalar.__repr__(self)
+
+
+cdef class BooleanValue(ArrayValue):
+    pass
+
+
+cdef class Int8Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt8Array* ap = <CInt8Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt8Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt8Array* ap = <CUInt8Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int16Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt16Array* ap = <CInt16Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt16Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt16Array* ap = <CUInt16Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int32Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt32Array* ap = <CInt32Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt32Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt32Array* ap = <CUInt32Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class Int64Value(ArrayValue):
+
+    def as_py(self):
+        cdef CInt64Array* ap = <CInt64Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class UInt64Value(ArrayValue):
+
+    def as_py(self):
+        cdef CUInt64Array* ap = <CUInt64Array*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class FloatValue(ArrayValue):
+
+    def as_py(self):
+        cdef CFloatArray* ap = <CFloatArray*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class DoubleValue(ArrayValue):
+
+    def as_py(self):
+        cdef CDoubleArray* ap = <CDoubleArray*> self.sp_array.get()
+        return ap.Value(self.index)
+
+
+cdef class StringValue(ArrayValue):
+
+    def as_py(self):
+        cdef CStringArray* ap = <CStringArray*> self.sp_array.get()
+        return frombytes(ap.GetString(self.index))
+
+
+cdef class ListValue(ArrayValue):
+
+    def __len__(self):
+        return self.ap.value_length(self.index)
+
+    def __getitem__(self, i):
+        return self._getitem(i)
+
+    cdef void _set_array(self, const shared_ptr[CArray]& sp_array):
+        self.sp_array = sp_array
+        self.ap = <CListArray*> sp_array.get()
+        self.value_type = box_data_type(self.ap.value_type())
+
+    cdef _getitem(self, int i):
+        cdef int j = self.ap.offset(self.index) + i
+        return box_arrow_scalar(self.value_type, self.ap.values(), j)
+
+    def as_py(self):
+        cdef:
+            int j
+            list result = []
+
+        for j in range(len(self)):
+            result.append(self._getitem(j).as_py())
+
+        return result
+
+
+cdef dict _scalar_classes = {
+    LogicalType_UINT8: UInt8Value,
+    LogicalType_UINT16: UInt16Value,
+    LogicalType_UINT32: UInt32Value,
+    LogicalType_UINT64: UInt64Value,
+    LogicalType_INT8: Int8Value,
+    LogicalType_INT16: Int16Value,
+    LogicalType_INT32: Int32Value,
+    LogicalType_INT64: Int64Value,
+    LogicalType_FLOAT: FloatValue,
+    LogicalType_DOUBLE: DoubleValue,
+    LogicalType_LIST: ListValue,
+    LogicalType_STRING: StringValue
+}
+
+cdef object box_arrow_scalar(DataType type,
+                             const shared_ptr[CArray]& sp_array,
+                             int index):
+    cdef ArrayValue val
+    if sp_array.get().IsNull(index):
+        return NA
+    else:
+        val = _scalar_classes[type.type.type]()
+        val.init(type, sp_array, index)
+        return val
diff --git a/python/arrow/schema.pxd b/python/arrow/schema.pxd
index 487c246f44a..8cc244aaba3 100644
--- a/python/arrow/schema.pxd
+++ b/python/arrow/schema.pxd
@@ -37,3 +37,5 @@ cdef class Schema:
     cdef:
         shared_ptr[CSchema] sp_schema
         CSchema* schema
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type)
diff --git a/python/arrow/schema.pyx b/python/arrow/schema.pyx
index 63cd6e888ab..3001531eb60 100644
--- a/python/arrow/schema.pyx
+++ b/python/arrow/schema.pyx
@@ -85,6 +85,14 @@ cdef DataType primitive_type(LogicalType type, bint nullable=True):
 def field(name, type):
     return Field(name, type)
 
+cdef set PRIMITIVE_TYPES = set([
+    LogicalType_NA, LogicalType_BOOL,
+    LogicalType_UINT8, LogicalType_INT8,
+    LogicalType_UINT16, LogicalType_INT16,
+    LogicalType_UINT32, LogicalType_INT32,
+    LogicalType_UINT64, LogicalType_INT64,
+    LogicalType_FLOAT, LogicalType_DOUBLE])
+
 def null():
     return primitive_type(LogicalType_NA)
 
@@ -148,3 +156,9 @@ def struct(fields, c_bool nullable=True):
     out.init(shared_ptr[CDataType](
         new CStructType(c_fields, nullable)))
     return out
+
+
+cdef DataType box_data_type(const shared_ptr[CDataType]& type):
+    cdef DataType out = DataType()
+    out.init(type)
+    return out
diff --git a/python/arrow/tests/test_scalars.py b/python/arrow/tests/test_scalars.py
new file mode 100644
index 00000000000..951380bd981
--- /dev/null
+++ b/python/arrow/tests/test_scalars.py
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +from arrow.compat import unittest, u +import arrow + + +class TestScalars(unittest.TestCase): + + def test_null_singleton(self): + with self.assertRaises(Exception): + arrow.NAType() + + def test_bool(self): + pass + + def test_int64(self): + arr = arrow.from_pylist([1, 2, None]) + + v = arr[0] + assert isinstance(v, arrow.Int64Value) + assert repr(v) == "1" + assert v.as_py() == 1 + + assert arr[2] is arrow.NA + + def test_double(self): + arr = arrow.from_pylist([1.5, None, 3]) + + v = arr[0] + assert isinstance(v, arrow.DoubleValue) + assert repr(v) == "1.5" + assert v.as_py() == 1.5 + + assert arr[1] is arrow.NA + + v = arr[2] + assert v.as_py() == 3.0 + + def test_string(self): + arr = arrow.from_pylist(['foo', None, u('bar')]) + + v = arr[0] + assert isinstance(v, arrow.StringValue) + assert repr(v) == "'foo'" + assert v.as_py() == 'foo' + + assert arr[1] is arrow.NA + + v = arr[2].as_py() + assert v == 'bar' + assert isinstance(v, str) + + def test_list(self): + arr = arrow.from_pylist([['foo', None], None, ['bar'], []]) + + v = arr[0] + assert len(v) == 2 + assert isinstance(v, arrow.ListValue) + assert repr(v) == "['foo', None]" + assert v.as_py() == ['foo', None] + assert v[0].as_py() == 'foo' + assert v[1] is arrow.NA + + assert arr[1] is arrow.NA + + v = arr[3] + assert len(v) == 0 diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index ae84fa12b0d..60d6248842e 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -276,7 +276,7 @@ class Int64Converter : public TypedConverter { class DoubleConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { - int64_t val; + double val; Py_ssize_t size = PySequence_Size(seq); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); From 45cd9fd8ddc75f5c8a558024c705ab8d37bbc5b5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Mar 2016 12:48:42 -0800 Subject: [PATCH 025/210] ARROW-43: Python: format array values to in __repr__ for interactive computing Author: Wes McKinney Closes #21 from wesm/ARROW-43 and squashes the following commits: dee6ba2 [Wes McKinney] Basic array formatter, not tweaking too much for now --- python/arrow/array.pxd | 1 + python/arrow/array.pyx | 16 +++++++++++++- python/arrow/scalar.pxd | 2 +- python/arrow/scalar.pyx | 11 +++++++--- python/arrow/tests/test_array.py | 37 ++++++++++++++++++++++++++++++++ 5 files changed, 62 insertions(+), 5 deletions(-) diff --git a/python/arrow/array.pxd b/python/arrow/array.pxd index 04dd8d182bc..482f8f796dd 100644 --- a/python/arrow/array.pxd +++ b/python/arrow/array.pxd @@ -34,6 +34,7 @@ cdef class Array: DataType type cdef init(self, const shared_ptr[CArray]& sp_array) + cdef getitem(self, int i) cdef class BooleanArray(Array): diff --git a/python/arrow/array.pyx b/python/arrow/array.pyx index 8ebd01d1dbe..b367e3b84a8 100644 --- a/python/arrow/array.pyx +++ b/python/arrow/array.pyx @@ -46,6 +46,17 @@ cdef class Array: def __get__(self): return self.sp_array.get().null_count() + def __iter__(self): + for i in range(len(self)): + yield self.getitem(i) + raise StopIteration + + def __repr__(self): + from arrow.formatting import array_format + type_format = object.__repr__(self) + values = array_format(self, window=10) + return '{0}\n{1}'.format(type_format, values) + def __len__(self): return self.sp_array.get().length() @@ -74,7 +85,10 @@ cdef class Array: while key < 0: key += len(self) - return scalar.box_arrow_scalar(self.type, 
self.sp_array, key) + return self.getitem(key) + + cdef getitem(self, int i): + return scalar.box_arrow_scalar(self.type, self.sp_array, i) def slice(self, start, end): pass diff --git a/python/arrow/scalar.pxd b/python/arrow/scalar.pxd index 15cdc956a25..4e0a3647155 100644 --- a/python/arrow/scalar.pxd +++ b/python/arrow/scalar.pxd @@ -55,7 +55,7 @@ cdef class ListValue(ArrayValue): cdef: CListArray* ap - cdef _getitem(self, int i) + cdef getitem(self, int i) cdef class StringValue(ArrayValue): diff --git a/python/arrow/scalar.pyx b/python/arrow/scalar.pyx index 951ede28776..72a280e334f 100644 --- a/python/arrow/scalar.pyx +++ b/python/arrow/scalar.pyx @@ -144,14 +144,19 @@ cdef class ListValue(ArrayValue): return self.ap.value_length(self.index) def __getitem__(self, i): - return self._getitem(i) + return self.getitem(i) + + def __iter__(self): + for i in range(len(self)): + yield self.getitem(i) + raise StopIteration cdef void _set_array(self, const shared_ptr[CArray]& sp_array): self.sp_array = sp_array self.ap = sp_array.get() self.value_type = box_data_type(self.ap.value_type()) - cdef _getitem(self, int i): + cdef getitem(self, int i): cdef int j = self.ap.offset(self.index) + i return box_arrow_scalar(self.value_type, self.ap.values(), j) @@ -161,7 +166,7 @@ cdef class ListValue(ArrayValue): list result = [] for j in range(len(self)): - result.append(self._getitem(j).as_py()) + result.append(self.getitem(j).as_py()) return result diff --git a/python/arrow/tests/test_array.py b/python/arrow/tests/test_array.py index 8eaa5335206..ebd872c744e 100644 --- a/python/arrow/tests/test_array.py +++ b/python/arrow/tests/test_array.py @@ -17,6 +17,7 @@ from arrow.compat import unittest import arrow +import arrow.formatting as fmt class TestArrayAPI(unittest.TestCase): @@ -24,3 +25,39 @@ class TestArrayAPI(unittest.TestCase): def test_getitem_NA(self): arr = arrow.from_pylist([1, None, 2]) assert arr[1] is arrow.NA + + def test_list_format(self): + arr = arrow.from_pylist([[1], None, [2, 3]]) + result = fmt.array_format(arr) + expected = """\ +[ + [1], + NA, + [2, + 3] +]""" + assert result == expected + + def test_string_format(self): + arr = arrow.from_pylist(['foo', None, 'bar']) + result = fmt.array_format(arr) + expected = """\ +[ + 'foo', + NA, + 'bar' +]""" + assert result == expected + + def test_long_array_format(self): + arr = arrow.from_pylist(range(100)) + result = fmt.array_format(arr, window=2) + expected = """\ +[ + 0, + 1, + ... 
+ 98, + 99 +]""" + assert result == expected From 1650026285bea52288c7f24720c3caf7cd3ce2a8 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Mon, 29 Feb 2016 19:32:12 -0800 Subject: [PATCH 026/210] ARROW-17: set some vector fields to package level access for Drill compatibility --- .../codegen/templates/BasicTypeHelper.java | 1 + .../templates/NullableValueVectors.java | 6 ++- .../templates/RepeatedValueVectors.java | 2 +- .../main/codegen/templates/UnionVector.java | 4 +- .../templates/VariableLengthVectors.java | 2 +- .../org/apache/arrow/vector/BitVector.java | 4 +- .../arrow/vector/complex/ListVector.java | 4 +- .../arrow/vector/complex/MapVector.java | 2 +- .../vector/complex/RepeatedListVector.java | 3 +- .../vector/complex/RepeatedMapVector.java | 2 +- .../org/apache/arrow/vector/types/Types.java | 54 +++++++++++++++---- 11 files changed, 60 insertions(+), 24 deletions(-) diff --git a/java/vector/src/main/codegen/templates/BasicTypeHelper.java b/java/vector/src/main/codegen/templates/BasicTypeHelper.java index bb6446e8d6b..0bae715e352 100644 --- a/java/vector/src/main/codegen/templates/BasicTypeHelper.java +++ b/java/vector/src/main/codegen/templates/BasicTypeHelper.java @@ -231,6 +231,7 @@ public static ValueVector getNewVector(MaterializedField field, BufferAllocator return getNewVector(field, allocator, null); } public static ValueVector getNewVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ + field = field.clone(); MajorType type = field.getType(); switch (type.getMinorType()) { diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 6893a25efbe..b0029f7ad4c 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -45,8 +45,10 @@ public final class ${className} extends BaseDataValueVector implements <#if type private final FieldReader reader = new Nullable${minor.class}ReaderImpl(Nullable${minor.class}Vector.this); private final MaterializedField bitsField = MaterializedField.create("$bits$", new MajorType(MinorType.UINT1, DataMode.REQUIRED)); - private final UInt1Vector bits = new UInt1Vector(bitsField, allocator); - private final ${valuesName} values = new ${minor.class}Vector(field, allocator); + private final MaterializedField valuesField = MaterializedField.create("$values$", new MajorType(field.getType().getMinorType(), DataMode.REQUIRED, field.getPrecision(), field.getScale())); + + final UInt1Vector bits = new UInt1Vector(bitsField, allocator); + final ${valuesName} values = new ${minor.class}Vector(valuesField, allocator); private final Mutator mutator = new Mutator(); private final Accessor accessor = new Accessor(); diff --git a/java/vector/src/main/codegen/templates/RepeatedValueVectors.java b/java/vector/src/main/codegen/templates/RepeatedValueVectors.java index 5ac80f57737..ceae53bbf58 100644 --- a/java/vector/src/main/codegen/templates/RepeatedValueVectors.java +++ b/java/vector/src/main/codegen/templates/RepeatedValueVectors.java @@ -42,7 +42,7 @@ public final class Repeated${minor.class}Vector extends BaseRepeatedValueVector //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Repeated${minor.class}Vector.class); // we maintain local reference to concrete vector type for performance reasons. 
- private ${minor.class}Vector values; + ${minor.class}Vector values; private final FieldReader reader = new Repeated${minor.class}ReaderImpl(Repeated${minor.class}Vector.this); private final Mutator mutator = new Mutator(); private final Accessor accessor = new Accessor(); diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index ba94ac22a05..6042a5bf683 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -51,9 +51,9 @@ public class UnionVector implements ValueVector { private BufferAllocator allocator; private Accessor accessor = new Accessor(); private Mutator mutator = new Mutator(); - private int valueCount; + int valueCount; - private MapVector internalMap; + MapVector internalMap; private UInt1Vector typeVector; private MapVector mapVector; diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java index 13d53b8e846..84fb3eb5567 100644 --- a/java/vector/src/main/codegen/templates/VariableLengthVectors.java +++ b/java/vector/src/main/codegen/templates/VariableLengthVectors.java @@ -57,7 +57,7 @@ public final class ${minor.class}Vector extends BaseDataValueVector implements V public final static String OFFSETS_VECTOR_NAME = "$offsets$"; private final MaterializedField offsetsField = MaterializedField.create(OFFSETS_VECTOR_NAME, new MajorType(MinorType.UINT4, DataMode.REQUIRED)); - private final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(offsetsField, allocator); + final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(offsetsField, allocator); private final FieldReader reader = new ${minor.class}ReaderImpl(${minor.class}Vector.this); private final Accessor accessor; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index 952e9028e06..c5bcb2decc4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -41,7 +41,7 @@ public final class BitVector extends BaseDataValueVector implements FixedWidthVe private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); - private int valueCount; + int valueCount; private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; private int allocationMonitor = 0; @@ -64,7 +64,7 @@ public int getBufferSizeFor(final int valueCount) { return getSizeFromCount(valueCount); } - private int getSizeFromCount(int valueCount) { + int getSizeFromCount(int valueCount) { return (int) Math.ceil(valueCount / 8.0); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 8387c9e5ba6..13610c4f03f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -47,8 +47,8 @@ public class ListVector extends BaseRepeatedValueVector { - private UInt4Vector offsets; - private final UInt1Vector bits; + UInt4Vector offsets; + final UInt1Vector bits; private Mutator mutator = new Mutator(); private Accessor accessor = new Accessor(); private UnionListWriter writer; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index 1bbce73d6ff..cc0953a1af8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -52,7 +52,7 @@ public class MapVector extends AbstractMapVector { private final SingleMapReaderImpl reader = new SingleMapReaderImpl(MapVector.this); private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); - private int valueCount; + int valueCount; public MapVector(String path, BufferAllocator allocator, CallBack callBack){ this(MaterializedField.create(path, TYPE), allocator, callBack); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java index 778fe81b5da..f337f9c4a60 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java @@ -49,7 +49,7 @@ public class RepeatedListVector extends AbstractContainerVector public final static MajorType TYPE = new MajorType(MinorType.LIST, DataMode.REPEATED); private final RepeatedListReaderImpl reader = new RepeatedListReaderImpl(null, this); - private final DelegateRepeatedVector delegate; + final DelegateRepeatedVector delegate; protected static class DelegateRepeatedVector extends BaseRepeatedValueVector { @@ -313,7 +313,6 @@ public AddOrGetResult addOrGetVector(VectorDescriptor if (result.isCreated() && callBack != null) { callBack.doWork(); } - this.field = delegate.getField(); return result; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java index e7eacd3c67c..686414e71ca 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java @@ -53,7 +53,7 @@ public class RepeatedMapVector extends AbstractMapVector public final static MajorType TYPE = new MajorType(MinorType.MAP, DataMode.REPEATED); - private final UInt4Vector offsets; // offsets to start of each record (considering record indices are 0-indexed) + final UInt4Vector offsets; // offsets to start of each record (considering record indices are 0-indexed) private final RepeatedMapReaderImpl reader = new RepeatedMapReaderImpl(RepeatedMapVector.this); private final RepeatedMapAccessor accessor = new RepeatedMapAccessor(); private final Mutator mutator = new Mutator(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index cef892ce880..88999cb8f5a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import java.util.Objects; public class Types { public enum MinorType { @@ -73,26 +74,35 @@ public enum DataMode { public static class MajorType { private MinorType minorType; private DataMode mode; - private Integer precision; - private Integer scale; - private Integer timezone; + private int precision; + private int scale; + private int timezone; + private int width; private List subTypes; public MajorType(MinorType 
minorType, DataMode mode) { - this(minorType, mode, null, null, null, null); + this(minorType, mode, 0, 0, 0, 0, null); } - public MajorType(MinorType minorType, DataMode mode, Integer precision, Integer scale) { - this(minorType, mode, precision, scale, null, null); + public MajorType(MinorType minorType, DataMode mode, int precision, int scale) { + this(minorType, mode, precision, scale, 0, 0, null); } - public MajorType(MinorType minorType, DataMode mode, Integer precision, Integer scale, Integer timezone, List subTypes) { + public MajorType(MinorType minorType, DataMode mode, int precision, int scale, int timezone, List subTypes) { + this(minorType, mode, precision, scale, timezone, 0, subTypes); + } + + public MajorType(MinorType minorType, DataMode mode, int precision, int scale, int timezone, int width, List subTypes) { this.minorType = minorType; this.mode = mode; this.precision = precision; this.scale = scale; this.timezone = timezone; + this.width = width; this.subTypes = subTypes; + if (subTypes == null) { + this.subTypes = new ArrayList<>(); + } } public MinorType getMinorType() { @@ -103,21 +113,45 @@ public DataMode getMode() { return mode; } - public Integer getPrecision() { + public int getPrecision() { return precision; } - public Integer getScale() { + public int getScale() { return scale; } - public Integer getTimezone() { + public int getTimezone() { return timezone; } public List getSubTypes() { return subTypes; } + + public int getWidth() { + return width; + } + + + @Override + public boolean equals(Object other) { + if (other == null) { + return false; + } + if (!(other instanceof MajorType)) { + return false; + } + MajorType that = (MajorType) other; + return this.minorType == that.minorType && + this.mode == that.mode && + this.precision == that.precision && + this.scale == that.scale && + this.timezone == that.timezone && + this.width == that.width && + Objects.equals(this.subTypes, that.subTypes); + } + } public static MajorType required(MinorType minorType) { From 243ed4e91d5ed922b205f7ac5fa8f9f821a07fbb Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Mon, 29 Feb 2016 19:33:44 -0800 Subject: [PATCH 027/210] ARROW-18: Fix decimal precision and scale in MapWriters --- java/vector/src/main/codegen/templates/MapWriters.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 7001367bb37..42f39820393 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -206,7 +206,7 @@ public void end() { } public ${minor.class}Writer ${lowerName}(String name, int scale, int precision) { - final MajorType ${upperName}_TYPE = new MajorType(MinorType.${upperName}, DataMode.OPTIONAL, scale, precision, null, null); + final MajorType ${upperName}_TYPE = new MajorType(MinorType.${upperName}, DataMode.OPTIONAL, precision, scale, 0, null); <#else> private static final MajorType ${upperName}_TYPE = Types.optional(MinorType.${upperName}); @Override From 31def7d81a094dd051d2f4bbead78edaae25755a Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Tue, 8 Mar 2016 14:11:29 -0800 Subject: [PATCH 028/210] ARROW-51: Add simple ValueVector tests --- .../apache/arrow/vector/TestValueVector.java | 521 ++++++++++++++++++ 1 file changed, 521 insertions(+) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java new file mode 100644 index 00000000000..4488d750284 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -0,0 +1,521 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.Charset; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.RepeatedListVector; +import org.apache.arrow.vector.complex.RepeatedMapVector; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.holders.BitHolder; +import org.apache.arrow.vector.holders.IntHolder; +import org.apache.arrow.vector.holders.NullableFloat4Holder; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.holders.NullableVar16CharHolder; +import org.apache.arrow.vector.holders.NullableVarCharHolder; +import org.apache.arrow.vector.holders.RepeatedFloat4Holder; +import org.apache.arrow.vector.holders.RepeatedIntHolder; +import org.apache.arrow.vector.holders.RepeatedVarBinaryHolder; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.holders.VarCharHolder; +import org.apache.arrow.memory.BufferAllocator; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + + +public class TestValueVector { + //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestValueVector.class); + + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + private final static Charset utf8Charset = Charset.forName("UTF-8"); + private final static byte[] STR1 = new String("AAAAA1").getBytes(utf8Charset); + private final static byte[] STR2 = new String("BBBBBBBBB2").getBytes(utf8Charset); + private final static byte[] STR3 = new String("CCCC3").getBytes(utf8Charset); + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test(expected = OversizedAllocationException.class) + public void testFixedVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final UInt4Vector vector = new UInt4Vector(field, allocator); + // edge case 1: buffer size = max value capacity + 
final int expectedValueCapacity = BaseValueVector.MAX_ALLOCATION_SIZE / 4; + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common case: value count < max value capacity + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 8); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this should throw an IOOB + } finally { + vector.close(); + } + } + + @Test(expected = OversizedAllocationException.class) + public void testBitVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final BitVector vector = new BitVector(field, allocator); + // edge case 1: buffer size ~ max value capacity + final int expectedValueCapacity = 1 << 29; + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(expectedValueCapacity); + for (int i=0; i<3;i++) { + vector.reAlloc(); // expand buffer size + } + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // buffer size ~ max allocation + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // overflow + } finally { + vector.close(); + } + } + + + @Test(expected = OversizedAllocationException.class) + public void testVariableVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final VarCharVector vector = new VarCharVector(field, allocator); + // edge case 1: value count = MAX_VALUE_ALLOCATION + final int expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; + final int expectedOffsetSize = 10; + try { + vector.allocateNew(expectedAllocationInBytes, 10); + assertTrue(expectedOffsetSize <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes <= vector.getBuffer().capacity()); + vector.reAlloc(); + assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getBuffer().capacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this tests if it overflows + } finally { + vector.close(); + } + } + + @Test + public void testFixedType() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + + // Create a new value vector for 1024 integers. 
+ try (final UInt4Vector vector = new UInt4Vector(field, allocator)) { + final UInt4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + // Put and set a few values + m.setSafe(0, 100); + m.setSafe(1, 101); + m.setSafe(100, 102); + m.setSafe(1022, 103); + m.setSafe(1023, 104); + + final UInt4Vector.Accessor accessor = vector.getAccessor(); + assertEquals(100, accessor.get(0)); + assertEquals(101, accessor.get(1)); + assertEquals(102, accessor.get(100)); + assertEquals(103, accessor.get(1022)); + assertEquals(104, accessor.get(1023)); + } + } + + @Test + public void testNullableVarLen2() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVarCharHolder.TYPE); + + // Create a new value vector for 1024 integers. + try (final NullableVarCharVector vector = new NullableVarCharVector(field, allocator)) { + final NullableVarCharVector.Mutator m = vector.getMutator(); + vector.allocateNew(1024 * 10, 1024); + + m.set(0, STR1); + m.set(1, STR2); + m.set(2, STR3); + + // Check the sample strings. + final NullableVarCharVector.Accessor accessor = vector.getAccessor(); + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(1)); + assertArrayEquals(STR3, accessor.get(2)); + + // Ensure null value throws. + boolean b = false; + try { + vector.getAccessor().get(3); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + } + + @Test + public void testRepeatedIntVector() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedIntHolder.TYPE); + + // Create a new value vector. + try (final RepeatedIntVector vector1 = new RepeatedIntVector(field, allocator)) { + + // Populate the vector. + final int[] values = {2, 3, 5, 7, 11, 13, 17, 19, 23, 27}; // some tricksy primes + final int nRecords = 7; + final int nElements = values.length; + vector1.allocateNew(nRecords, nRecords * nElements); + final RepeatedIntVector.Mutator mutator = vector1.getMutator(); + for (int recordIndex = 0; recordIndex < nRecords; ++recordIndex) { + mutator.startNewValue(recordIndex); + for (int elementIndex = 0; elementIndex < nElements; ++elementIndex) { + mutator.add(recordIndex, recordIndex * values[elementIndex]); + } + } + mutator.setValueCount(nRecords); + + // Verify the contents. + final RepeatedIntVector.Accessor accessor1 = vector1.getAccessor(); + assertEquals(nRecords, accessor1.getValueCount()); + for (int recordIndex = 0; recordIndex < nRecords; ++recordIndex) { + for (int elementIndex = 0; elementIndex < nElements; ++elementIndex) { + final int value = accessor1.get(recordIndex, elementIndex); + assertEquals(recordIndex * values[elementIndex], value); + } + } + } + } + + @Test + public void testNullableFixedType() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableUInt4Holder.TYPE); + + // Create a new value vector for 1024 integers. 
+ try (final NullableUInt4Vector vector = new NullableUInt4Vector(field, allocator)) { + final NullableUInt4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + // Put and set a few values + m.set(0, 100); + m.set(1, 101); + m.set(100, 102); + m.set(1022, 103); + m.set(1023, 104); + + final NullableUInt4Vector.Accessor accessor = vector.getAccessor(); + assertEquals(100, accessor.get(0)); + assertEquals(101, accessor.get(1)); + assertEquals(102, accessor.get(100)); + assertEquals(103, accessor.get(1022)); + assertEquals(104, accessor.get(1023)); + + // Ensure null values throw + { + boolean b = false; + try { + accessor.get(3); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + + vector.allocateNew(2048); + { + boolean b = false; + try { + accessor.get(0); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + + m.set(0, 100); + m.set(1, 101); + m.set(100, 102); + m.set(1022, 103); + m.set(1023, 104); + assertEquals(100, accessor.get(0)); + assertEquals(101, accessor.get(1)); + assertEquals(102, accessor.get(100)); + assertEquals(103, accessor.get(1022)); + assertEquals(104, accessor.get(1023)); + + // Ensure null values throw. + { + boolean b = false; + try { + vector.getAccessor().get(3); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + } + } + + @Test + public void testNullableFloat() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableFloat4Holder.TYPE); + + // Create a new value vector for 1024 integers + try (final NullableFloat4Vector vector = (NullableFloat4Vector) BasicTypeHelper.getNewVector(field, allocator)) { + final NullableFloat4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + // Put and set a few values. + m.set(0, 100.1f); + m.set(1, 101.2f); + m.set(100, 102.3f); + m.set(1022, 103.4f); + m.set(1023, 104.5f); + + final NullableFloat4Vector.Accessor accessor = vector.getAccessor(); + assertEquals(100.1f, accessor.get(0), 0); + assertEquals(101.2f, accessor.get(1), 0); + assertEquals(102.3f, accessor.get(100), 0); + assertEquals(103.4f, accessor.get(1022), 0); + assertEquals(104.5f, accessor.get(1023), 0); + + // Ensure null values throw. 
+ { + boolean b = false; + try { + vector.getAccessor().get(3); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + + vector.allocateNew(2048); + { + boolean b = false; + try { + accessor.get(0); + } catch (IllegalStateException e) { + b = true; + } finally { + assertTrue(b); + } + } + } + } + + @Test + public void testBitVector() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, BitHolder.TYPE); + + // Create a new value vector for 1024 integers + try (final BitVector vector = new BitVector(field, allocator)) { + final BitVector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + // Put and set a few values + m.set(0, 1); + m.set(1, 0); + m.set(100, 0); + m.set(1022, 1); + + final BitVector.Accessor accessor = vector.getAccessor(); + assertEquals(1, accessor.get(0)); + assertEquals(0, accessor.get(1)); + assertEquals(0, accessor.get(100)); + assertEquals(1, accessor.get(1022)); + + // test setting the same value twice + m.set(0, 1); + m.set(0, 1); + m.set(1, 0); + m.set(1, 0); + assertEquals(1, accessor.get(0)); + assertEquals(0, accessor.get(1)); + + // test toggling the values + m.set(0, 0); + m.set(1, 1); + assertEquals(0, accessor.get(0)); + assertEquals(1, accessor.get(1)); + + // Ensure unallocated space returns 0 + assertEquals(0, accessor.get(3)); + } + } + + @Test + public void testReAllocNullableFixedWidthVector() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableFloat4Holder.TYPE); + + // Create a new value vector for 1024 integers + try (final NullableFloat4Vector vector = (NullableFloat4Vector) BasicTypeHelper.getNewVector(field, allocator)) { + final NullableFloat4Vector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + assertEquals(1024, vector.getValueCapacity()); + + // Put values in indexes that fall within the initial allocation + m.setSafe(0, 100.1f); + m.setSafe(100, 102.3f); + m.setSafe(1023, 104.5f); + + // Now try to put values in space that falls beyond the initial allocation + m.setSafe(2000, 105.5f); + + // Check valueCapacity is more than initial allocation + assertEquals(1024 * 2, vector.getValueCapacity()); + + final NullableFloat4Vector.Accessor accessor = vector.getAccessor(); + assertEquals(100.1f, accessor.get(0), 0); + assertEquals(102.3f, accessor.get(100), 0); + assertEquals(104.5f, accessor.get(1023), 0); + assertEquals(105.5f, accessor.get(2000), 0); + + // Set the valueCount to be more than valueCapacity of current allocation. 
This is possible for NullableValueVectors + // as we don't call setSafe for null values, but we do call setValueCount when all values are inserted into the + // vector + m.setValueCount(vector.getValueCapacity() + 200); + } + } + + @Test + public void testReAllocNullableVariableWidthVector() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVarCharHolder.TYPE); + + // Create a new value vector for 1024 integers + try (final NullableVarCharVector vector = (NullableVarCharVector) BasicTypeHelper.getNewVector(field, allocator)) { + final NullableVarCharVector.Mutator m = vector.getMutator(); + vector.allocateNew(); + + int initialCapacity = vector.getValueCapacity(); + + // Put values in indexes that fall within the initial allocation + m.setSafe(0, STR1, 0, STR1.length); + m.setSafe(initialCapacity - 1, STR2, 0, STR2.length); + + // Now try to put values in space that falls beyond the initial allocation + m.setSafe(initialCapacity + 200, STR3, 0, STR3.length); + + // Check valueCapacity is more than initial allocation + assertEquals((initialCapacity + 1) * 2 - 1, vector.getValueCapacity()); + + final NullableVarCharVector.Accessor accessor = vector.getAccessor(); + assertArrayEquals(STR1, accessor.get(0)); + assertArrayEquals(STR2, accessor.get(initialCapacity - 1)); + assertArrayEquals(STR3, accessor.get(initialCapacity + 200)); + + // Set the valueCount to be more than valueCapacity of current allocation. This is possible for NullableValueVectors + // as we don't call setSafe for null values, but we do call setValueCount when the current batch is processed. + m.setValueCount(vector.getValueCapacity() + 200); + } + } + + @Test + public void testVVInitialCapacity() throws Exception { + final MaterializedField[] fields = new MaterializedField[9]; + final ValueVector[] valueVectors = new ValueVector[9]; + + fields[0] = MaterializedField.create(EMPTY_SCHEMA_PATH, BitHolder.TYPE); + fields[1] = MaterializedField.create(EMPTY_SCHEMA_PATH, IntHolder.TYPE); + fields[2] = MaterializedField.create(EMPTY_SCHEMA_PATH, VarCharHolder.TYPE); + fields[3] = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVar16CharHolder.TYPE); + fields[4] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedFloat4Holder.TYPE); + fields[5] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedVarBinaryHolder.TYPE); + + fields[6] = MaterializedField.create(EMPTY_SCHEMA_PATH, MapVector.TYPE); + fields[6].addChild(fields[0] /*bit*/); + fields[6].addChild(fields[2] /*varchar*/); + + fields[7] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedMapVector.TYPE); + fields[7].addChild(fields[1] /*int*/); + fields[7].addChild(fields[3] /*optional var16char*/); + + fields[8] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedListVector.TYPE); + fields[8].addChild(fields[1] /*int*/); + + final int initialCapacity = 1024; + + try { + for (int i = 0; i < valueVectors.length; i++) { + valueVectors[i] = BasicTypeHelper.getNewVector(fields[i], allocator); + valueVectors[i].setInitialCapacity(initialCapacity); + valueVectors[i].allocateNew(); + } + + for (int i = 0; i < valueVectors.length; i++) { + final ValueVector vv = valueVectors[i]; + final int vvCapacity = vv.getValueCapacity(); + + // this can't be equality because Nullables will be allocated using power of two sized buffers (thus need 1025 + // spots in one vector > power of two is 2048, available capacity will be 2048 => 2047) + assertTrue(String.format("Incorrect value capacity for %s [%d]", vv.getField(), vvCapacity), + 
initialCapacity <= vvCapacity); + } + } finally { + for (ValueVector v : valueVectors) { + v.close(); + } + } + } + +} From e822ea758dc18ade9d3386acfd1d38e7b05ba3dd Mon Sep 17 00:00:00 2001 From: Minji Kim Date: Mon, 7 Mar 2016 15:23:33 -0800 Subject: [PATCH 029/210] ARROW-46: ListVector should initialize bits in allocateNew --- .../arrow/vector/complex/ListVector.java | 1 + .../apache/arrow/vector/TestValueVector.java | 20 +++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 13610c4f03f..3e60c768023 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -72,6 +72,7 @@ public UnionListWriter getWriter() { @Override public void allocateNew() throws OutOfMemoryException { super.allocateNewSafe(); + bits.allocateNewSafe(); } public void transferTo(ListVector target) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 4488d750284..ac3eebe98ea 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -24,10 +24,13 @@ import java.nio.charset.Charset; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.RepeatedListVector; import org.apache.arrow.vector.complex.RepeatedMapVector; import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.BasicTypeHelper; import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.holders.BitHolder; @@ -518,4 +521,21 @@ public void testVVInitialCapacity() throws Exception { } } + @Test + public void testListVectorShouldNotThrowOversizedAllocationException() throws Exception { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, + Types.optional(MinorType.LIST)); + ListVector vector = new ListVector(field, allocator, null); + ListVector vectorFrom = new ListVector(field, allocator, null); + vectorFrom.allocateNew(); + + for (int i = 0; i < 10000; i++) { + vector.allocateNew(); + vector.copyFromSafe(0, 0, vectorFrom); + vector.clear(); + } + + vectorFrom.clear(); + vector.clear(); + } } From 83675273bd2057552ae64b7d8632a54093a02ed9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Mar 2016 20:28:58 -0800 Subject: [PATCH 030/210] ARROW-42: Add Python tests to Travis CI build Author: Wes McKinney Closes #22 from wesm/ARROW-42 and squashes the following commits: 3b056a1 [Wes McKinney] Modularize Travis CI build and add Python build script. Remove parquet.pyx from Cython build for now, suppress -Wunused-variable in Cython compilation. 
Add missing formatting.py file --- .travis.yml | 23 ++++++++++ ci/travis_before_script_cpp.sh | 26 ++++++++++++ ci/travis_script_cpp.sh | 22 +--------- ci/travis_script_python.sh | 59 ++++++++++++++++++++++++++ cpp/src/arrow/table/column-test.cc | 2 + cpp/src/arrow/table/schema-test.cc | 2 + cpp/src/arrow/table/table-test.cc | 4 ++ cpp/src/arrow/type.cc | 14 ------ cpp/src/arrow/type.h | 14 ------ python/CMakeLists.txt | 2 - python/arrow/formatting.py | 56 ++++++++++++++++++++++++ python/cmake_modules/UseCython.cmake | 5 +++ python/requirements.txt | 4 ++ python/setup.py | 2 +- python/src/pyarrow/adapters/builtin.cc | 20 ++++++--- python/src/pyarrow/adapters/builtin.h | 2 + python/src/pyarrow/helpers.cc | 14 ++++++ python/src/pyarrow/helpers.h | 14 ++++++ python/src/pyarrow/util/CMakeLists.txt | 18 +------- 19 files changed, 228 insertions(+), 75 deletions(-) create mode 100755 ci/travis_before_script_cpp.sh create mode 100755 ci/travis_script_python.sh create mode 100644 python/arrow/formatting.py create mode 100644 python/requirements.txt diff --git a/.travis.yml b/.travis.yml index cb2d5cb1bad..9e858d7d98e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,7 +8,9 @@ addons: packages: - gcc-4.9 # Needed for C++11 - g++-4.9 # Needed for C++11 + - gdb - gcov + - ccache - cmake - valgrind @@ -17,11 +19,32 @@ matrix: - compiler: gcc language: cpp os: linux + before_script: + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: + - export CC="gcc-4.9" + - export CXX="g++-4.9" - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh - compiler: clang language: cpp os: osx addons: + before_script: + - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh + - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh + +before_install: +- ulimit -c unlimited -S +- export CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build +- export ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install +- export LD_LIBRARY_PATH=$ARROW_CPP_INSTALL/lib:$LD_LIBRARY_PATH + +after_script: +- rm -rf $CPP_BUILD_DIR + +after_failure: +- COREFILE=$(find . -maxdepth 2 -name "core*" | head -n 1) +- if [[ -f "$COREFILE" ]]; then gdb -c "$COREFILE" example -ex "thread apply all bt" -ex "set pagination 0" -batch; fi diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh new file mode 100755 index 00000000000..4d5bef8bbdf --- /dev/null +++ b/ci/travis_before_script_cpp.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -e + +: ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} + +mkdir $CPP_BUILD_DIR +pushd $CPP_BUILD_DIR + +CPP_DIR=$TRAVIS_BUILD_DIR/cpp + +# Build an isolated thirdparty +cp -r $CPP_DIR/thirdparty . +cp $CPP_DIR/setup_build_env.sh . + +source setup_build_env.sh + +echo $GTEST_HOME + +: ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} + +cmake -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +make -j4 +make install + +popd diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index 28f16cc021f..3e843dd759e 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -2,28 +2,11 @@ set -e -mkdir $TRAVIS_BUILD_DIR/cpp-build -pushd $TRAVIS_BUILD_DIR/cpp-build +: ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} -CPP_DIR=$TRAVIS_BUILD_DIR/cpp +pushd $CPP_BUILD_DIR -# Build an isolated thirdparty -cp -r $CPP_DIR/thirdparty . -cp $CPP_DIR/setup_build_env.sh . 
- -if [ $TRAVIS_OS_NAME == "linux" ]; then - # Use a C++11 compiler on Linux - export CC="gcc-4.9" - export CXX="g++-4.9" -fi - -source setup_build_env.sh - -echo $GTEST_HOME - -cmake -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR make lint -make -j4 if [ $TRAVIS_OS_NAME == "linux" ]; then valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest @@ -32,4 +15,3 @@ else fi popd -rm -rf cpp-build diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh new file mode 100755 index 00000000000..9b0bd4f54cb --- /dev/null +++ b/ci/travis_script_python.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +set -e + +PYTHON_DIR=$TRAVIS_BUILD_DIR/python + +# Share environment with C++ +pushd $CPP_BUILD_DIR +source setup_build_env.sh +popd + +pushd $PYTHON_DIR + +# Bootstrap a Conda Python environment + +if [ $TRAVIS_OS_NAME == "linux" ]; then + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" +else + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" +fi + +curl $MINICONDA_URL > miniconda.sh +MINICONDA=$TRAVIS_BUILD_DIR/miniconda +bash miniconda.sh -b -p $MINICONDA +export PATH="$MINICONDA/bin:$PATH" +conda update -y -q conda +conda info -a + +PYTHON_VERSION=3.5 +CONDA_ENV_NAME=pyarrow-test + +conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION +source activate $CONDA_ENV_NAME + +python --version +which python + +# faster builds, please +conda install -y nomkl + +# Expensive dependencies install from Continuum package repo +conda install -y pip numpy pandas cython + +# Other stuff pip install +pip install -r requirements.txt + +export ARROW_HOME=$ARROW_CPP_INSTALL + +python setup.py build_ext --inplace + +py.test -vv -r sxX arrow + +# if [ $TRAVIS_OS_NAME == "linux" ]; then +# valgrind --tool=memcheck py.test -vv -r sxX arrow +# else +# py.test -vv -r sxX arrow +# fi + +popd diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/table/column-test.cc index bf95932916c..3b102e48c87 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/table/column-test.cc @@ -33,6 +33,8 @@ using std::vector; namespace arrow { +const auto INT32 = std::make_shared(); + class TestColumn : public TestBase { protected: std::shared_ptr data_; diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/table/schema-test.cc index d6725cc08c0..9dfade26953 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/table/schema-test.cc @@ -29,6 +29,8 @@ using std::vector; namespace arrow { +const auto INT32 = std::make_shared(); + TEST(TestField, Basics) { shared_ptr ftype = INT32; shared_ptr ftype_nn = std::make_shared(false); diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table/table-test.cc index c4fdb062db8..8b354e8503c 100644 --- a/cpp/src/arrow/table/table-test.cc +++ b/cpp/src/arrow/table/table-test.cc @@ -34,6 +34,10 @@ using std::vector; namespace arrow { +const auto INT16 = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); + class TestTable : public TestBase { public: void MakeExample1(int length) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 265770822ce..0a2e817ad30 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -66,18 +66,4 @@ std::string StructType::ToString() const { return s.str(); } -const std::shared_ptr NA = std::make_shared(); -const std::shared_ptr BOOL = std::make_shared(); -const std::shared_ptr UINT8 = std::make_shared(); -const std::shared_ptr UINT16 = std::make_shared(); -const 
std::shared_ptr UINT32 = std::make_shared(); -const std::shared_ptr UINT64 = std::make_shared(); -const std::shared_ptr INT8 = std::make_shared(); -const std::shared_ptr INT16 = std::make_shared(); -const std::shared_ptr INT32 = std::make_shared(); -const std::shared_ptr INT64 = std::make_shared(); -const std::shared_ptr FLOAT = std::make_shared(); -const std::shared_ptr DOUBLE = std::make_shared(); -const std::shared_ptr STRING = std::make_shared(); - } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index e78e4949119..00b01ea86e8 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -338,20 +338,6 @@ struct StructType : public DataType { std::string ToString() const override; }; -extern const std::shared_ptr NA; -extern const std::shared_ptr BOOL; -extern const std::shared_ptr UINT8; -extern const std::shared_ptr UINT16; -extern const std::shared_ptr UINT32; -extern const std::shared_ptr UINT64; -extern const std::shared_ptr INT8; -extern const std::shared_ptr INT16; -extern const std::shared_ptr INT32; -extern const std::shared_ptr INT64; -extern const std::shared_ptr FLOAT; -extern const std::shared_ptr DOUBLE; -extern const std::shared_ptr STRING; - } // namespace arrow #endif // ARROW_TYPE_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8fdd829010e..8f5c27b0f76 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -404,7 +404,6 @@ set(PYARROW_SRCS ) set(LINK_LIBS - pyarrow_util arrow ) @@ -428,7 +427,6 @@ set(CYTHON_EXTENSIONS array config error - parquet scalar schema ) diff --git a/python/arrow/formatting.py b/python/arrow/formatting.py new file mode 100644 index 00000000000..a42d4e4bb57 --- /dev/null +++ b/python/arrow/formatting.py @@ -0,0 +1,56 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Pretty-printing and other formatting utilities for Arrow data structures + +import arrow.scalar as scalar + + +def array_format(arr, window=None): + values = [] + + if window is None or window * 2 >= len(arr): + for x in arr: + values.append(value_format(x, 0)) + contents = _indent(',\n'.join(values), 2) + else: + for i in range(window): + values.append(value_format(arr[i], 0) + ',') + values.append('...') + for i in range(len(arr) - window, len(arr)): + formatted = value_format(arr[i], 0) + if i < len(arr) - 1: + formatted += ',' + values.append(formatted) + contents = _indent('\n'.join(values), 2) + + return '[\n{0}\n]'.format(contents) + + +def value_format(x, indent_level=0): + if isinstance(x, scalar.ListValue): + contents = ',\n'.join(value_format(item) for item in x) + return '[{0}]'.format(_indent(contents, 1).strip()) + else: + return repr(x) + + +def _indent(text, spaces): + if spaces == 0: + return text + block = ' ' * spaces + return '\n'.join(block + x for x in text.split('\n')) diff --git a/python/cmake_modules/UseCython.cmake b/python/cmake_modules/UseCython.cmake index e7034db52f3..3b1c201edff 100644 --- a/python/cmake_modules/UseCython.cmake +++ b/python/cmake_modules/UseCython.cmake @@ -121,6 +121,11 @@ function( compile_pyx _name pyx_target_name generated_files pyx_file) set( _generated_files "${_name}.${extension}") endif() set_source_files_properties( ${_generated_files} PROPERTIES GENERATED TRUE ) + + # Cython creates a lot of compiler warning detritus on clang + set_source_files_properties(${_generated_files} PROPERTIES + COMPILE_FLAGS -Wno-unused-function) + set( ${generated_files} ${_generated_files} PARENT_SCOPE ) # Add the command to run the compiler. diff --git a/python/requirements.txt b/python/requirements.txt new file mode 100644 index 00000000000..a82cb20aab8 --- /dev/null +++ b/python/requirements.txt @@ -0,0 +1,4 @@ +pytest +numpy>=1.7.0 +pandas>=0.12.0 +six diff --git a/python/setup.py b/python/setup.py index 9a0de071a9c..eb3ff2a1547 100644 --- a/python/setup.py +++ b/python/setup.py @@ -210,7 +210,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'parquet', 'scalar', 'schema'] + return ['array', 'config', 'error', 'scalar', 'schema'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 60d6248842e..bb7905236c5 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -22,6 +22,7 @@ #include +#include "pyarrow/helpers.h" #include "pyarrow/status.h" using arrow::ArrayBuilder; @@ -74,16 +75,16 @@ class ScalarVisitor { std::shared_ptr GetType() { // TODO(wesm): handling mixed-type cases if (float_count_) { - return arrow::DOUBLE; + return DOUBLE; } else if (int_count_) { // TODO(wesm): tighter type later - return arrow::INT64; + return INT64; } else if (bool_count_) { - return arrow::BOOL; + return BOOL; } else if (string_count_) { - return arrow::STRING; + return STRING; } else { - return arrow::NA; + return NA; } } @@ -145,7 +146,7 @@ class SeqVisitor { std::shared_ptr GetType() { if (scalars_.total_count() == 0) { if (max_nesting_level_ == 0) { - return arrow::NA; + return NA; } else { return nullptr; } @@ -209,7 +210,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size, // For 0-length sequences, refuse to guess if (*size == 0) { - *out_type = arrow::NA; + *out_type = NA; } SeqVisitor seq_visitor; @@ -217,6 +218,11 @@ static 
Status InferArrowType(PyObject* obj, int64_t* size, PY_RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); + + if (*out_type == nullptr) { + return Status::TypeError("Unable to determine data type"); + } + return Status::OK(); } diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 24886f4970d..88869c20480 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -25,6 +25,8 @@ #include +#include + #include "pyarrow/common.h" namespace arrow { class Array; } diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index d0969dacc21..0921fc49945 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -23,6 +23,20 @@ using namespace arrow; namespace pyarrow { +const std::shared_ptr NA = std::make_shared(); +const std::shared_ptr BOOL = std::make_shared(); +const std::shared_ptr UINT8 = std::make_shared(); +const std::shared_ptr UINT16 = std::make_shared(); +const std::shared_ptr UINT32 = std::make_shared(); +const std::shared_ptr UINT64 = std::make_shared(); +const std::shared_ptr INT8 = std::make_shared(); +const std::shared_ptr INT16 = std::make_shared(); +const std::shared_ptr INT32 = std::make_shared(); +const std::shared_ptr INT64 = std::make_shared(); +const std::shared_ptr FLOAT = std::make_shared(); +const std::shared_ptr DOUBLE = std::make_shared(); +const std::shared_ptr STRING = std::make_shared(); + #define GET_PRIMITIVE_TYPE(NAME, Type) \ case LogicalType::NAME: \ if (nullable) { \ diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index 1a24f056feb..e41568d5881 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -26,6 +26,20 @@ namespace pyarrow { using arrow::DataType; using arrow::LogicalType; +extern const std::shared_ptr NA; +extern const std::shared_ptr BOOL; +extern const std::shared_ptr UINT8; +extern const std::shared_ptr UINT16; +extern const std::shared_ptr UINT32; +extern const std::shared_ptr UINT64; +extern const std::shared_ptr INT8; +extern const std::shared_ptr INT16; +extern const std::shared_ptr INT32; +extern const std::shared_ptr INT64; +extern const std::shared_ptr FLOAT; +extern const std::shared_ptr DOUBLE; +extern const std::shared_ptr STRING; + std::shared_ptr GetPrimitiveType(LogicalType::type type, bool nullable); diff --git a/python/src/pyarrow/util/CMakeLists.txt b/python/src/pyarrow/util/CMakeLists.txt index 60dc80eb38c..3fd8bac3150 100644 --- a/python/src/pyarrow/util/CMakeLists.txt +++ b/python/src/pyarrow/util/CMakeLists.txt @@ -15,22 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-####################################### -# pyarrow_util -####################################### - -set(UTIL_SRCS -) - -set(UTIL_LIBS -) - -add_library(pyarrow_util STATIC - ${UTIL_SRCS} -) -target_link_libraries(pyarrow_util ${UTIL_LIBS}) -SET_TARGET_PROPERTIES(pyarrow_util PROPERTIES LINKER_LANGUAGE CXX) - ####################################### # pyarrow_test_main ####################################### @@ -40,7 +24,7 @@ add_library(pyarrow_test_main if (APPLE) target_link_libraries(pyarrow_test_main - gmock + gtest dl) set_target_properties(pyarrow_test_main PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") From 6fdcd4943ff9a8cc66afbee380217cec40c0cda0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 9 Mar 2016 15:45:05 -0800 Subject: [PATCH 031/210] ARROW-54: [Python] Rename package to "pyarrow" Also fixed rpath issues (at great cost) per ARROW-53 Author: Wes McKinney Closes #23 from wesm/ARROW-54 and squashes the following commits: b8ce0e8 [Wes McKinney] Update installation instructions cae9b39 [Wes McKinney] Fix rpath issues per ARROW-53 7554539 [Wes McKinney] Twiddle rpath stuff, remove empty arrow_test_util module 8cca41a [Wes McKinney] Fix Travis CI script for renamed package 1d37c93 [Wes McKinney] Opt in to building unit tests 60088d0 [Wes McKinney] Rename package to pyarrow e3d0caf [Wes McKinney] Note on other Python interpreters 80d3bac [Wes McKinney] Start installation document --- .travis.yml | 4 +- ci/travis_script_python.sh | 2 +- cpp/CMakeLists.txt | 29 ++++--- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/util/CMakeLists.txt | 44 ++++------ python/CMakeLists.txt | 31 ++++--- python/arrow/__init__.py | 38 -------- python/doc/INSTALL.md | 87 +++++++++++++++++++ python/pyarrow/__init__.py | 38 ++++++++ python/{arrow => pyarrow}/array.pxd | 8 +- python/{arrow => pyarrow}/array.pyx | 14 +-- python/{arrow => pyarrow}/compat.py | 0 python/{arrow => pyarrow}/config.pyx | 0 python/{arrow => pyarrow}/error.pxd | 2 +- python/{arrow => pyarrow}/error.pyx | 5 +- python/{arrow => pyarrow}/formatting.py | 2 +- .../{arrow => pyarrow}/includes/__init__.pxd | 0 python/{arrow => pyarrow}/includes/common.pxd | 0 .../includes/libarrow.pxd} | 2 +- .../{arrow => pyarrow}/includes/parquet.pxd | 2 +- .../{arrow => pyarrow}/includes/pyarrow.pxd | 6 +- python/{arrow => pyarrow}/parquet.pyx | 4 +- python/{arrow => pyarrow}/scalar.pxd | 6 +- python/{arrow => pyarrow}/scalar.pyx | 6 +- python/{arrow => pyarrow}/schema.pxd | 4 +- python/{arrow => pyarrow}/schema.pyx | 6 +- python/{arrow => pyarrow}/tests/__init__.py | 0 python/{arrow => pyarrow}/tests/test_array.py | 16 ++-- .../tests/test_convert_builtin.py | 52 +++++------ .../{arrow => pyarrow}/tests/test_scalars.py | 4 +- .../{arrow => pyarrow}/tests/test_schema.py | 4 +- python/requirements.txt | 1 - python/setup.py | 52 ++++++----- python/src/pyarrow/util/CMakeLists.txt | 30 ++++--- 34 files changed, 300 insertions(+), 201 deletions(-) delete mode 100644 python/arrow/__init__.py create mode 100644 python/doc/INSTALL.md create mode 100644 python/pyarrow/__init__.py rename python/{arrow => pyarrow}/array.pxd (90%) rename python/{arrow => pyarrow}/array.pyx (93%) rename python/{arrow => pyarrow}/compat.py (100%) rename python/{arrow => pyarrow}/config.pyx (100%) rename python/{arrow => pyarrow}/error.pxd (95%) rename python/{arrow => pyarrow}/error.pyx (92%) rename python/{arrow => pyarrow}/formatting.py (98%) rename python/{arrow => pyarrow}/includes/__init__.pxd (100%) rename python/{arrow => pyarrow}/includes/common.pxd (100%) rename 
python/{arrow/includes/arrow.pxd => pyarrow/includes/libarrow.pxd} (99%) rename python/{arrow => pyarrow}/includes/parquet.pxd (97%) rename python/{arrow => pyarrow}/includes/pyarrow.pxd (90%) rename python/{arrow => pyarrow}/parquet.pyx (91%) rename python/{arrow => pyarrow}/scalar.pxd (93%) rename python/{arrow => pyarrow}/scalar.pyx (97%) rename python/{arrow => pyarrow}/schema.pxd (91%) rename python/{arrow => pyarrow}/schema.pyx (97%) rename python/{arrow => pyarrow}/tests/__init__.py (100%) rename python/{arrow => pyarrow}/tests/test_array.py (80%) rename python/{arrow => pyarrow}/tests/test_convert_builtin.py (58%) rename python/{arrow => pyarrow}/tests/test_scalars.py (97%) rename python/{arrow => pyarrow}/tests/test_schema.py (96%) diff --git a/.travis.yml b/.travis.yml index 9e858d7d98e..49a956ead3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,7 +27,8 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh - compiler: clang - language: cpp + language: objective-c + osx_image: xcode6.4 os: osx addons: before_script: @@ -40,7 +41,6 @@ before_install: - ulimit -c unlimited -S - export CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build - export ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install -- export LD_LIBRARY_PATH=$ARROW_CPP_INSTALL/lib:$LD_LIBRARY_PATH after_script: - rm -rf $CPP_BUILD_DIR diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 9b0bd4f54cb..14d66b44ff8 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -48,7 +48,7 @@ export ARROW_HOME=$ARROW_CPP_INSTALL python setup.py build_ext --inplace -py.test -vv -r sxX arrow +py.test -vv -r sxX pyarrow # if [ $TRAVIS_OS_NAME == "linux" ]; then # valgrind --tool=memcheck py.test -vv -r sxX arrow diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e8cb88c0b4d..f5f60380311 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -37,11 +37,6 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() -if(APPLE) - # In newer versions of CMake, this is the default setting - set(CMAKE_MACOSX_RPATH 1) -endif() - find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) @@ -339,10 +334,13 @@ endfunction() if ("$ENV{GTEST_HOME}" STREQUAL "") set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) endif() -find_package(GTest REQUIRED) -include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) -ADD_THIRDPARTY_LIB(gtest - STATIC_LIB ${GTEST_STATIC_LIB}) + +if(ARROW_BUILD_TESTS) + find_package(GTest REQUIRED) + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) +endif() ## Google PerfTools ## @@ -366,7 +364,7 @@ ADD_THIRDPARTY_LIB(gtest ############################################################ # Linker setup ############################################################ -set(ARROW_MIN_TEST_LIBS arrow arrow_test_main arrow_test_util ${ARROW_BASE_LIBS}) +set(ARROW_MIN_TEST_LIBS arrow arrow_test_main ${ARROW_BASE_LIBS}) set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) ############################################################ @@ -461,9 +459,18 @@ add_library(arrow ${LIBARROW_LINKAGE} ${ARROW_SRCS} ) + +if (APPLE) + set_target_properties(arrow + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + set_target_properties(arrow PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" +) 
target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) add_subdirectory(src/arrow) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 77326ce38d7..73e6a9b22c9 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -27,6 +27,6 @@ install(FILES # Unit tests ####################################### -set(ARROW_TEST_LINK_LIBS arrow_test_util ${ARROW_MIN_TEST_LIBS}) +set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4272ce42854..d8e2f98f2c8 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -28,37 +28,27 @@ install(FILES status.h DESTINATION include/arrow/util) -####################################### -# arrow_test_util -####################################### - -add_library(arrow_test_util) -target_link_libraries(arrow_test_util -) - -SET_TARGET_PROPERTIES(arrow_test_util PROPERTIES LINKER_LANGUAGE CXX) - ####################################### # arrow_test_main ####################################### -add_library(arrow_test_main - test_main.cc) - -if (APPLE) - target_link_libraries(arrow_test_main - gtest - arrow_test_util - dl) - set_target_properties(arrow_test_main - PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") -else() - target_link_libraries(arrow_test_main - gtest - arrow_test_util - pthread - dl - ) +if (ARROW_BUILD_TESTS) + add_library(arrow_test_main + test_main.cc) + + if (APPLE) + target_link_libraries(arrow_test_main + gtest + dl) + set_target_properties(arrow_test_main + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + else() + target_link_libraries(arrow_test_main + gtest + pthread + dl + ) + endif() endif() ADD_ARROW_TEST(bit-util-test) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8f5c27b0f76..0ecafc7202e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -45,6 +45,13 @@ if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() +# Top level cmake dir +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + option(PYARROW_BUILD_TESTS + "Build the PyArrow C++ googletest unit tests" + OFF) +endif() + find_program(CCACHE_FOUND ccache) if(CCACHE_FOUND) set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) @@ -322,10 +329,12 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) endfunction() ## GMock -find_package(GTest REQUIRED) -include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) -ADD_THIRDPARTY_LIB(gtest - STATIC_LIB ${GTEST_STATIC_LIB}) +if (PYARROW_BUILD_TESTS) + find_package(GTest REQUIRED) + include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(gtest + STATIC_LIB ${GTEST_STATIC_LIB}) +endif() ## Arrow find_package(Arrow REQUIRED) @@ -391,6 +400,10 @@ endif (UNIX) # Subdirectories ############################################################ +if (UNIX) + set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +endif() + add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) @@ -407,10 +420,11 @@ set(LINK_LIBS arrow ) +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_library(pyarrow SHARED ${PYARROW_SRCS}) target_link_libraries(pyarrow ${LINK_LIBS}) -set_target_properties(pyarrow PROPERTIES LINKER_LANGUAGE CXX) if(APPLE) set_target_properties(pyarrow PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") @@ -420,9 +434,6 @@ endif() # Setup and build Cython modules ############################################################ -set(USE_RELATIVE_RPATH ON) -set(CMAKE_BUILD_WITH_INSTALL_RPATH 
TRUE) - set(CYTHON_EXTENSIONS array config @@ -437,7 +448,7 @@ foreach(module ${CYTHON_EXTENSIONS}) list(REMOVE_AT directories -1) string(REPLACE "." "/" module_root "${module}") - set(module_SRC arrow/${module_root}.pyx) + set(module_SRC pyarrow/${module_root}.pyx) set_source_files_properties(${module_SRC} PROPERTIES CYTHON_IS_CXX 1) cython_add_module(${module_name} @@ -463,7 +474,7 @@ foreach(module ${CYTHON_EXTENSIONS}) endwhile(${i} GREATER 0) # for inplace development for now - set(module_install_rpath "${CMAKE_SOURCE_DIR}/arrow/") + #set(module_install_rpath "${CMAKE_SOURCE_DIR}/pyarrow/") set_target_properties(${module_name} PROPERTIES INSTALL_RPATH ${module_install_rpath}) diff --git a/python/arrow/__init__.py b/python/pyarrow/__init__.py deleted file mode 100644 index 3507ea0235a..00000000000 --- a/python/arrow/__init__.py +++ /dev/null @@ -1,38 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# flake8: noqa - -from arrow.array import (Array, from_pylist, total_allocated_bytes, - BooleanArray, NumericArray, - Int8Array, UInt8Array, - ListArray, StringArray) - -from arrow.error import ArrowException - -from arrow.scalar import (ArrayValue, Scalar, NA, NAType, - BooleanValue, - Int8Value, Int16Value, Int32Value, Int64Value, - UInt8Value, UInt16Value, UInt32Value, UInt64Value, - FloatValue, DoubleValue, ListValue, StringValue) - -from arrow.schema import (null, bool_, - int8, int16, int32, int64, - uint8, uint16, uint32, uint64, - float_, double, string, - list_, struct, field, - DataType, Field, Schema) diff --git a/python/doc/INSTALL.md b/python/doc/INSTALL.md new file mode 100644 index 00000000000..d30a03046ed --- /dev/null +++ b/python/doc/INSTALL.md @@ -0,0 +1,87 @@ +## Building pyarrow (Apache Arrow Python library) + +First, clone the master git repository: + +```bash +git clone https://github.com/apache/arrow.git arrow +``` + +#### System requirements + +Building pyarrow requires: + +* A C++11 compiler + + * Linux: gcc >= 4.8 or clang >= 3.5 + * OS X: XCode 6.4 or higher preferred + +* [cmake][1] + +#### Python requirements + +You will need Python (CPython) 2.7, 3.4, or 3.5 installed. Earlier releases +are not being targeted. + +> This library targets CPython only due to an emphasis on interoperability with +> pandas and NumPy, which are only available for CPython. + +The build requires NumPy, Cython, and a few other Python dependencies: + +```bash +pip install cython +cd arrow/python +pip install -r requirements.txt +``` + +#### Installing Arrow C++ library + +First, you should choose an installation location for Arrow C++.
In the future +using the default system install location will work, but for now we are being +explicit: + +```bash +export ARROW_HOME=$HOME/local +``` + +Now, we build Arrow: + +```bash +cd arrow/cpp + +mkdir dev-build +cd dev-build + +cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. + +make + +# Use sudo here if $ARROW_HOME requires it +make install +``` + +#### Install `pyarrow` + +```bash +cd arrow/python + +python setup.py install +``` + +> On XCode 6 and prior there are some known OS X `@rpath` issues. If you are +> unable to import pyarrow, upgrading XCode may be the solution. + + +```python +In [1]: import pyarrow + +In [2]: pyarrow.from_pylist([1,2,3]) +Out[2]: + +[ + 1, + 2, + 3 +] +``` + +[1]: https://cmake.org/ \ No newline at end of file diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py new file mode 100644 index 00000000000..8d93a156bcc --- /dev/null +++ b/python/pyarrow/__init__.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# flake8: noqa + +from pyarrow.array import (Array, from_pylist, total_allocated_bytes, + BooleanArray, NumericArray, + Int8Array, UInt8Array, + ListArray, StringArray) + +from pyarrow.error import ArrowException + +from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType, + BooleanValue, + Int8Value, Int16Value, Int32Value, Int64Value, + UInt8Value, UInt16Value, UInt32Value, UInt64Value, + FloatValue, DoubleValue, ListValue, StringValue) + +from pyarrow.schema import (null, bool_, + int8, int16, int32, int64, + uint8, uint16, uint32, uint64, + float_, double, string, + list_, struct, field, + DataType, Field, Schema) diff --git a/python/arrow/array.pxd b/python/pyarrow/array.pxd similarity index 90% rename from python/arrow/array.pxd rename to python/pyarrow/array.pxd index 482f8f796dd..d0d3486c032 100644 --- a/python/arrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -15,12 +15,12 @@ # specific language governing permissions and limitations # under the License. 
-from arrow.includes.common cimport shared_ptr -from arrow.includes.arrow cimport CArray, LogicalType +from pyarrow.includes.common cimport shared_ptr +from pyarrow.includes.libarrow cimport CArray, LogicalType -from arrow.scalar import NA +from pyarrow.scalar import NA -from arrow.schema cimport DataType +from pyarrow.schema cimport DataType cdef extern from "Python.h": int PySlice_Check(object) diff --git a/python/arrow/array.pyx b/python/pyarrow/array.pyx similarity index 93% rename from python/arrow/array.pyx rename to python/pyarrow/array.pyx index b367e3b84a8..bceb333c94e 100644 --- a/python/arrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -19,14 +19,14 @@ # distutils: language = c++ # cython: embedsignature = True -from arrow.includes.arrow cimport * -cimport arrow.includes.pyarrow as pyarrow +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow -from arrow.compat import frombytes, tobytes -from arrow.error cimport check_status +from pyarrow.compat import frombytes, tobytes +from pyarrow.error cimport check_status -cimport arrow.scalar as scalar -from arrow.scalar import NA +cimport pyarrow.scalar as scalar +from pyarrow.scalar import NA def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() @@ -52,7 +52,7 @@ cdef class Array: raise StopIteration def __repr__(self): - from arrow.formatting import array_format + from pyarrow.formatting import array_format type_format = object.__repr__(self) values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) diff --git a/python/arrow/compat.py b/python/pyarrow/compat.py similarity index 100% rename from python/arrow/compat.py rename to python/pyarrow/compat.py diff --git a/python/arrow/config.pyx b/python/pyarrow/config.pyx similarity index 100% rename from python/arrow/config.pyx rename to python/pyarrow/config.pyx diff --git a/python/arrow/error.pxd b/python/pyarrow/error.pxd similarity index 95% rename from python/arrow/error.pxd rename to python/pyarrow/error.pxd index c18cb3efffc..d226abeda04 100644 --- a/python/arrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -from arrow.includes.pyarrow cimport * +from pyarrow.includes.pyarrow cimport * cdef check_status(const Status& status) diff --git a/python/arrow/error.pyx b/python/pyarrow/error.pyx similarity index 92% rename from python/arrow/error.pyx rename to python/pyarrow/error.pyx index f1d51635881..3f8d7dd6460 100644 --- a/python/arrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -15,9 +15,8 @@ # specific language governing permissions and limitations # under the License. 
-from arrow.includes.common cimport c_string - -from arrow.compat import frombytes +from pyarrow.includes.common cimport c_string +from pyarrow.compat import frombytes class ArrowException(Exception): pass diff --git a/python/arrow/formatting.py b/python/pyarrow/formatting.py similarity index 98% rename from python/arrow/formatting.py rename to python/pyarrow/formatting.py index a42d4e4bb57..5fe0611f845 100644 --- a/python/arrow/formatting.py +++ b/python/pyarrow/formatting.py @@ -17,7 +17,7 @@ # Pretty-printing and other formatting utilities for Arrow data structures -import arrow.scalar as scalar +import pyarrow.scalar as scalar def array_format(arr, window=None): diff --git a/python/arrow/includes/__init__.pxd b/python/pyarrow/includes/__init__.pxd similarity index 100% rename from python/arrow/includes/__init__.pxd rename to python/pyarrow/includes/__init__.pxd diff --git a/python/arrow/includes/common.pxd b/python/pyarrow/includes/common.pxd similarity index 100% rename from python/arrow/includes/common.pxd rename to python/pyarrow/includes/common.pxd diff --git a/python/arrow/includes/arrow.pxd b/python/pyarrow/includes/libarrow.pxd similarity index 99% rename from python/arrow/includes/arrow.pxd rename to python/pyarrow/includes/libarrow.pxd index 0cc44c06cb6..baba112833e 100644 --- a/python/arrow/includes/arrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -17,7 +17,7 @@ # distutils: language = c++ -from arrow.includes.common cimport * +from pyarrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: diff --git a/python/arrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd similarity index 97% rename from python/arrow/includes/parquet.pxd rename to python/pyarrow/includes/parquet.pxd index 62342f30669..99a2d423d9c 100644 --- a/python/arrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -17,7 +17,7 @@ # distutils: language = c++ -from arrow.includes.common cimport * +from pyarrow.includes.common cimport * cdef extern from "parquet/api/reader.h" namespace "parquet_cpp" nogil: cdef cppclass ColumnReader: diff --git a/python/arrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd similarity index 90% rename from python/arrow/includes/pyarrow.pxd rename to python/pyarrow/includes/pyarrow.pxd index 3eed5b85424..9a0c004b768 100644 --- a/python/arrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -17,9 +17,9 @@ # distutils: language = c++ -from arrow.includes.common cimport * -from arrow.includes.arrow cimport (CArray, CDataType, LogicalType, - MemoryPool) +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport (CArray, CDataType, LogicalType, + MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed diff --git a/python/arrow/parquet.pyx b/python/pyarrow/parquet.pyx similarity index 91% rename from python/arrow/parquet.pyx rename to python/pyarrow/parquet.pyx index 23c3838bcad..622e7d07724 100644 --- a/python/arrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -19,5 +19,5 @@ # distutils: language = c++ # cython: embedsignature = True -from arrow.compat import frombytes, tobytes -from arrow.includes.parquet cimport * +from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.parquet cimport * diff --git a/python/arrow/scalar.pxd b/python/pyarrow/scalar.pxd similarity index 93% rename from python/arrow/scalar.pxd rename to python/pyarrow/scalar.pxd index 4e0a3647155..b0684571864 
100644 --- a/python/arrow/scalar.pxd +++ b/python/pyarrow/scalar.pxd @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -from arrow.includes.common cimport * -from arrow.includes.arrow cimport * +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * -from arrow.schema cimport DataType +from pyarrow.schema cimport DataType cdef class Scalar: cdef readonly: diff --git a/python/arrow/scalar.pyx b/python/pyarrow/scalar.pyx similarity index 97% rename from python/arrow/scalar.pyx rename to python/pyarrow/scalar.pyx index 72a280e334f..261a38967c4 100644 --- a/python/arrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -from arrow.schema cimport DataType, box_data_type +from pyarrow.schema cimport DataType, box_data_type -from arrow.compat import frombytes -import arrow.schema as schema +from pyarrow.compat import frombytes +import pyarrow.schema as schema NA = None diff --git a/python/arrow/schema.pxd b/python/pyarrow/schema.pxd similarity index 91% rename from python/arrow/schema.pxd rename to python/pyarrow/schema.pxd index 8cc244aaba3..07b9bd04da2 100644 --- a/python/arrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from arrow.includes.common cimport shared_ptr -from arrow.includes.arrow cimport CDataType, CField, CSchema +from pyarrow.includes.common cimport shared_ptr +from pyarrow.includes.libarrow cimport CDataType, CField, CSchema cdef class DataType: cdef: diff --git a/python/arrow/schema.pyx b/python/pyarrow/schema.pyx similarity index 97% rename from python/arrow/schema.pyx rename to python/pyarrow/schema.pyx index 3001531eb60..ea878720d5b 100644 --- a/python/arrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -22,9 +22,9 @@ # distutils: language = c++ # cython: embedsignature = True -from arrow.compat import frombytes, tobytes -from arrow.includes.arrow cimport * -cimport arrow.includes.pyarrow as pyarrow +from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow cimport cpython diff --git a/python/arrow/tests/__init__.py b/python/pyarrow/tests/__init__.py similarity index 100% rename from python/arrow/tests/__init__.py rename to python/pyarrow/tests/__init__.py diff --git a/python/arrow/tests/test_array.py b/python/pyarrow/tests/test_array.py similarity index 80% rename from python/arrow/tests/test_array.py rename to python/pyarrow/tests/test_array.py index ebd872c744e..034c1576551 100644 --- a/python/arrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -15,19 +15,19 @@ # specific language governing permissions and limitations # under the License. 
-from arrow.compat import unittest -import arrow -import arrow.formatting as fmt +from pyarrow.compat import unittest +import pyarrow +import pyarrow.formatting as fmt class TestArrayAPI(unittest.TestCase): def test_getitem_NA(self): - arr = arrow.from_pylist([1, None, 2]) - assert arr[1] is arrow.NA + arr = pyarrow.from_pylist([1, None, 2]) + assert arr[1] is pyarrow.NA def test_list_format(self): - arr = arrow.from_pylist([[1], None, [2, 3]]) + arr = pyarrow.from_pylist([[1], None, [2, 3]]) result = fmt.array_format(arr) expected = """\ [ @@ -39,7 +39,7 @@ def test_list_format(self): assert result == expected def test_string_format(self): - arr = arrow.from_pylist(['foo', None, 'bar']) + arr = pyarrow.from_pylist(['foo', None, 'bar']) result = fmt.array_format(arr) expected = """\ [ @@ -50,7 +50,7 @@ def test_string_format(self): assert result == expected def test_long_array_format(self): - arr = arrow.from_pylist(range(100)) + arr = pyarrow.from_pylist(range(100)) result = fmt.array_format(arr, window=2) expected = """\ [ diff --git a/python/arrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py similarity index 58% rename from python/arrow/tests/test_convert_builtin.py rename to python/pyarrow/tests/test_convert_builtin.py index 57e6ab9f0e7..25f69691210 100644 --- a/python/arrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from arrow.compat import unittest -import arrow +from pyarrow.compat import unittest +import pyarrow class TestConvertList(unittest.TestCase): @@ -25,61 +25,61 @@ def test_boolean(self): pass def test_empty_list(self): - arr = arrow.from_pylist([]) + arr = pyarrow.from_pylist([]) assert len(arr) == 0 assert arr.null_count == 0 - assert arr.type == arrow.null() + assert arr.type == pyarrow.null() def test_all_none(self): - arr = arrow.from_pylist([None, None]) + arr = pyarrow.from_pylist([None, None]) assert len(arr) == 2 assert arr.null_count == 2 - assert arr.type == arrow.null() + assert arr.type == pyarrow.null() def test_integer(self): - arr = arrow.from_pylist([1, None, 3, None]) + arr = pyarrow.from_pylist([1, None, 3, None]) assert len(arr) == 4 assert arr.null_count == 2 - assert arr.type == arrow.int64() + assert arr.type == pyarrow.int64() def test_garbage_collection(self): import gc - bytes_before = arrow.total_allocated_bytes() - arrow.from_pylist([1, None, 3, None]) + bytes_before = pyarrow.total_allocated_bytes() + pyarrow.from_pylist([1, None, 3, None]) gc.collect() - assert arrow.total_allocated_bytes() == bytes_before + assert pyarrow.total_allocated_bytes() == bytes_before def test_double(self): data = [1.5, 1, None, 2.5, None, None] - arr = arrow.from_pylist(data) + arr = pyarrow.from_pylist(data) assert len(arr) == 6 assert arr.null_count == 3 - assert arr.type == arrow.double() + assert arr.type == pyarrow.double() def test_string(self): data = ['foo', b'bar', None, 'arrow'] - arr = arrow.from_pylist(data) + arr = pyarrow.from_pylist(data) assert len(arr) == 4 assert arr.null_count == 1 - assert arr.type == arrow.string() + assert arr.type == pyarrow.string() def test_mixed_nesting_levels(self): - arrow.from_pylist([1, 2, None]) - arrow.from_pylist([[1], [2], None]) - arrow.from_pylist([[1], [2], [None]]) + pyarrow.from_pylist([1, 2, None]) + pyarrow.from_pylist([[1], [2], None]) + pyarrow.from_pylist([[1], [2], [None]]) - with self.assertRaises(arrow.ArrowException): - 
arrow.from_pylist([1, 2, [1]]) + with self.assertRaises(pyarrow.ArrowException): + pyarrow.from_pylist([1, 2, [1]]) - with self.assertRaises(arrow.ArrowException): - arrow.from_pylist([1, 2, []]) + with self.assertRaises(pyarrow.ArrowException): + pyarrow.from_pylist([1, 2, []]) - with self.assertRaises(arrow.ArrowException): - arrow.from_pylist([[1], [2], [None, [1]]]) + with self.assertRaises(pyarrow.ArrowException): + pyarrow.from_pylist([[1], [2], [None, [1]]]) def test_list_of_int(self): data = [[1, 2, 3], [], None, [1, 2]] - arr = arrow.from_pylist(data) + arr = pyarrow.from_pylist(data) assert len(arr) == 4 assert arr.null_count == 1 - assert arr.type == arrow.list_(arrow.int64()) + assert arr.type == pyarrow.list_(pyarrow.int64()) diff --git a/python/arrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py similarity index 97% rename from python/arrow/tests/test_scalars.py rename to python/pyarrow/tests/test_scalars.py index 951380bd981..021737db672 100644 --- a/python/arrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from arrow.compat import unittest, u -import arrow +from pyarrow.compat import unittest, u +import pyarrow as arrow class TestScalars(unittest.TestCase): diff --git a/python/arrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py similarity index 96% rename from python/arrow/tests/test_schema.py rename to python/pyarrow/tests/test_schema.py index a89edd74a0a..0235526198f 100644 --- a/python/arrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -from arrow.compat import unittest -import arrow +from pyarrow.compat import unittest +import pyarrow as arrow class TestTypes(unittest.TestCase): diff --git a/python/requirements.txt b/python/requirements.txt index a82cb20aab8..f42c90c5c9b 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,4 +1,3 @@ pytest numpy>=1.7.0 -pandas>=0.12.0 six diff --git a/python/setup.py b/python/setup.py index eb3ff2a1547..5cc871aba9f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -27,7 +27,7 @@ import sys import pkg_resources -from setuptools import setup +from setuptools import setup, Extension import os @@ -40,10 +40,12 @@ is_64_bit = sys.maxsize > 2**32 # Check if this is a debug build of Python. 
-if hasattr(sys, 'gettotalrefcount'): - build_type = 'Debug' -else: - build_type = 'Release' +# if hasattr(sys, 'gettotalrefcount'): +# build_type = 'Debug' +# else: +# build_type = 'Release' + +build_type = 'Debug' if Cython.__version__ < '0.19.1': raise Exception('Please upgrade to Cython 0.19.1 or newer') @@ -51,7 +53,7 @@ MAJOR = 0 MINOR = 1 MICRO = 0 -VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) +VERSION = '%d.%d.%ddev' % (MAJOR, MINOR, MICRO) class clean(_clean): @@ -70,6 +72,9 @@ class build_ext(_build_ext): def build_extensions(self): numpy_incl = pkg_resources.resource_filename('numpy', 'core/include') + self.extensions = [ext for ext in self.extensions + if ext.name != '__dummy__'] + for ext in self.extensions: if (hasattr(ext, 'include_dirs') and numpy_incl not in ext.include_dirs): @@ -98,6 +103,7 @@ def _run_cmake(self): # The staging directory for the module being built build_temp = pjoin(os.getcwd(), self.build_temp) + build_lib = os.path.join(os.getcwd(), self.build_lib) # Change to the build directory saved_cwd = os.getcwd() @@ -124,7 +130,7 @@ def _run_cmake(self): static_lib_option, source] self.spawn(cmake_command) - args = ['make'] + args = ['make', 'VERBOSE=1'] if 'PYARROW_PARALLEL' in os.environ: args.append('-j{0}'.format(os.environ['PYARROW_PARALLEL'])) self.spawn(args) @@ -150,21 +156,19 @@ def _run_cmake(self): if self.inplace: # a bit hacky build_lib = saved_cwd - else: - build_lib = pjoin(os.getcwd(), self.build_lib) # Move the built libpyarrow library to the place expected by the Python # build if sys.platform != 'win32': name, = glob.glob('libpyarrow.*') try: - os.makedirs(pjoin(build_lib, 'arrow')) + os.makedirs(pjoin(build_lib, 'pyarrow')) except OSError: pass - shutil.move(name, pjoin(build_lib, 'arrow', name)) + shutil.move(name, pjoin(build_lib, 'pyarrow', name)) else: shutil.move(pjoin(build_type, 'pyarrow.dll'), - pjoin(build_lib, 'arrow', 'pyarrow.dll')) + pjoin(build_lib, 'pyarrow', 'pyarrow.dll')) # Move the built C-extension to the place expected by the Python build self._found_names = [] @@ -192,7 +196,7 @@ def _get_inplace_dir(self): def _get_cmake_ext_path(self, name): # Get the package directory from build_py build_py = self.get_finalized_command('build_py') - package_dir = build_py.get_package_dir('arrow') + package_dir = build_py.get_package_dir('pyarrow') # This is the name of the arrow C-extension suffix = sysconfig.get_config_var('EXT_SUFFIX') if suffix is None: @@ -217,23 +221,23 @@ def get_names(self): def get_outputs(self): # Just the C extensions - cmake_exts = [self._get_cmake_ext_path(name) - for name in self.get_names()] - regular_exts = _build_ext.get_outputs(self) - return regular_exts + cmake_exts + # regular_exts = _build_ext.get_outputs(self) + return [self._get_cmake_ext_path(name) + for name in self.get_names()] -extensions = [] - DESC = """\ Python library for Apache Arrow""" setup( - name="arrow", - packages=['arrow', 'arrow.tests'], + name="pyarrow", + packages=['pyarrow', 'pyarrow.tests'], version=VERSION, - package_data={'arrow': ['*.pxd', '*.pyx']}, - ext_modules=extensions, + zip_safe=False, + package_data={'pyarrow': ['*.pxd', '*.pyx']}, + # Dummy extension to trigger build_ext + ext_modules=[Extension('__dummy__', sources=[])], + cmdclass={ 'clean': clean, 'build_ext': build_ext @@ -243,5 +247,5 @@ def get_outputs(self): license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", maintainer_email="dev@arrow.apache.org", - test_suite="arrow.tests" + test_suite="pyarrow.tests" ) diff --git 
a/python/src/pyarrow/util/CMakeLists.txt b/python/src/pyarrow/util/CMakeLists.txt index 3fd8bac3150..4afb4d0f912 100644 --- a/python/src/pyarrow/util/CMakeLists.txt +++ b/python/src/pyarrow/util/CMakeLists.txt @@ -19,19 +19,21 @@ # pyarrow_test_main ####################################### -add_library(pyarrow_test_main - test_main.cc) +if (PYARROW_BUILD_TESTS) + add_library(pyarrow_test_main + test_main.cc) -if (APPLE) - target_link_libraries(pyarrow_test_main - gtest - dl) - set_target_properties(pyarrow_test_main - PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") -else() - target_link_libraries(pyarrow_test_main - gtest - pthread - dl - ) + if (APPLE) + target_link_libraries(pyarrow_test_main + gtest + dl) + set_target_properties(pyarrow_test_main + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + else() + target_link_libraries(pyarrow_test_main + gtest + pthread + dl + ) + endif() endif() From 883c62bddc534df2c0a4ee1e8bef38772aa4a7cd Mon Sep 17 00:00:00 2001 From: Dan Robinson Date: Wed, 16 Mar 2016 15:11:56 -0700 Subject: [PATCH 032/210] ARROW-55: [Python] Fix unit tests in 2.7 Fixing the #define check for Python 2 makes all unit tests pass in Python 2.7. Author: Dan Robinson Closes #25 from danrobinson/ARROW-55 and squashes the following commits: dda4396 [Dan Robinson] ARROW-55: Add Python 2.7 tests to travis-ci b00524b [Dan Robinson] ARROW-55: [Python] Fix unit tests in 2.7 --- ci/travis_script_python.sh | 35 ++++++++++++++++++++--------------- python/src/pyarrow/common.h | 2 +- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 14d66b44ff8..af6b0085724 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -26,29 +26,34 @@ export PATH="$MINICONDA/bin:$PATH" conda update -y -q conda conda info -a -PYTHON_VERSION=3.5 -CONDA_ENV_NAME=pyarrow-test +python_version_tests() { + PYTHON_VERSION=$1 + CONDA_ENV_NAME="pyarrow-test-${PYTHON_VERSION}" + conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION + source activate $CONDA_ENV_NAME -conda create -y -q -n $CONDA_ENV_NAME python=$PYTHON_VERSION -source activate $CONDA_ENV_NAME + python --version + which python -python --version -which python + # faster builds, please + conda install -y nomkl -# faster builds, please -conda install -y nomkl + # Expensive dependencies install from Continuum package repo + conda install -y pip numpy pandas cython -# Expensive dependencies install from Continuum package repo -conda install -y pip numpy pandas cython + # Other stuff pip install + pip install -r requirements.txt -# Other stuff pip install -pip install -r requirements.txt + export ARROW_HOME=$ARROW_CPP_INSTALL -export ARROW_HOME=$ARROW_CPP_INSTALL + python setup.py build_ext --inplace -python setup.py build_ext --inplace + py.test -vv -r sxX pyarrow +} -py.test -vv -r sxX pyarrow +# run tests for python 2.7 and 3.5 +python_version_tests 2.7 +python_version_tests 3.5 # if [ $TRAVIS_OS_NAME == "linux" ]; then # valgrind --tool=memcheck py.test -vv -r sxX arrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index a43e4d28c89..db6361384c1 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -24,7 +24,7 @@ namespace arrow { class MemoryPool; } namespace pyarrow { -#define PYARROW_IS_PY2 PY_MAJOR_VERSION < 2 +#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2 #define RETURN_ARROW_NOT_OK(s) do { \ arrow::Status _s = (s); \ From 5881aacefc577ef8a2c39dc40d8f9cd978d50a88 Mon Sep 17 00:00:00 2001 From: "Uwe 
L. Korn" Date: Wed, 16 Mar 2016 15:13:55 -0700 Subject: [PATCH 033/210] ARROW-64: Add zsh support to C++ build scripts All scripts that have to be sourced during development currently only support bash. This patch adds zsh support. Author: Uwe L. Korn Closes #24 from xhochy/zsh-support and squashes the following commits: d3590aa [Uwe L. Korn] ARROW-64: Add zsh support to C++ build scripts --- cpp/setup_build_env.sh | 2 +- cpp/thirdparty/build_thirdparty.sh | 2 +- cpp/thirdparty/download_thirdparty.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index e9901bdbecd..26a727c87e5 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -1,6 +1,6 @@ #!/bin/bash -SOURCE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) ./thirdparty/download_thirdparty.sh ./thirdparty/build_thirdparty.sh diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 46794def400..8de56a6d086 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -2,7 +2,7 @@ set -x set -e -TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) source $TP_DIR/versions.sh PREFIX=$TP_DIR/installed diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index 8ffb22a93f7..0c801179e8d 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -3,7 +3,7 @@ set -x set -e -TP_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) source $TP_DIR/versions.sh From c99661069c2f1dbd29c3a86e1e0bd5fa3c6c809f Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 17 Mar 2016 15:05:24 -0700 Subject: [PATCH 034/210] ARROW-68: Better error handling for not fully setup systems Author: Micah Kornfield Closes #27 from emkornfield/emk_add_nice_errors_PR and squashes the following commits: c0b9d78 [Micah Kornfield] ARROW-68: Better error handling for systems missing prerequistites --- cpp/setup_build_env.sh | 4 ++-- cpp/thirdparty/build_thirdparty.sh | 9 ++++++--- cpp/thirdparty/download_thirdparty.sh | 1 + 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index 26a727c87e5..1a33fe386f1 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -2,8 +2,8 @@ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -./thirdparty/download_thirdparty.sh -./thirdparty/build_thirdparty.sh +./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } +./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } source thirdparty/versions.sh export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 8de56a6d086..beb24880359 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -44,18 +44,21 @@ ln -sf lib "$PREFIX/lib64" # use the compiled tools export PATH=$PREFIX/bin:$PATH +type cmake >/dev/null 2>&1 || { echo >&2 "cmake not installed. Aborting."; exit 1; } +type make >/dev/null 2>&1 || { echo >&2 "make not installed. Aborting."; exit 1; } # build googletest +GOOGLETEST_ERROR="failed for googletest!" 
if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then cd $TP_DIR/$GTEST_BASEDIR if [[ "$OSTYPE" == "darwin"* ]]; then - CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="-std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" + CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="-std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit 1; } else - CXXFLAGS=-fPIC cmake . + CXXFLAGS=-fPIC cmake . || { echo "cmake $GOOGLETEST_ERROR"; exit 1; } fi - make VERBOSE=1 + make VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } fi echo "---------------------" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index 0c801179e8d..c18dd4d8e80 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -8,6 +8,7 @@ TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) source $TP_DIR/versions.sh download_extract_and_cleanup() { + type curl >/dev/null 2>&1 || { echo >&2 "curl not installed. Aborting."; exit 1; } filename=$TP_DIR/$(basename "$1") curl -#LC - "$1" -o $filename tar xzf $filename -C $TP_DIR From 3a99f39d64d4e0d6556582c0560140c7b06ee21d Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 21 Mar 2016 16:31:21 -0700 Subject: [PATCH 035/210] ARROW-73: Support older CMake versions Author: Uwe L. Korn Closes #31 from xhochy/arrow-73 and squashes the following commits: c92ce5c [Uwe L. Korn] ARROW-73: Support older CMake versions --- cpp/cmake_modules/FindGTest.cmake | 2 +- cpp/cmake_modules/FindParquet.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/FindGTest.cmake b/cpp/cmake_modules/FindGTest.cmake index e47faf0dd89..3c5d2b67e44 100644 --- a/cpp/cmake_modules/FindGTest.cmake +++ b/cpp/cmake_modules/FindGTest.cmake @@ -54,7 +54,7 @@ endif () if (GTEST_INCLUDE_DIR AND GTEST_LIBRARIES) set(GTEST_FOUND TRUE) - get_filename_component( GTEST_LIBS ${GTEST_LIBRARIES} DIRECTORY ) + get_filename_component( GTEST_LIBS ${GTEST_LIBRARIES} PATH ) set(GTEST_LIB_NAME libgtest) set(GTEST_STATIC_LIB ${GTEST_LIBS}/${GTEST_LIB_NAME}.a) set(GTEST_SHARED_LIB ${GTEST_LIBS}/${GTEST_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index 76c2d1dbee9..d16e6c98f8d 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -43,7 +43,7 @@ endif () if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) set(PARQUET_FOUND TRUE) - get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} DIRECTORY ) + get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) set(PARQUET_LIB_NAME libparquet) set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) From 016b92bccf60de480da07acbabe876fb695c45e5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 21 Mar 2016 16:34:07 -0700 Subject: [PATCH 036/210] ARROW-72: Search for alternative parquet-cpp header Author: Uwe L. Korn Closes #30 from xhochy/arrow-72 and squashes the following commits: 5b6b328 [Uwe L. 
Korn] ARROW-72: Search for alternative parquet-cpp header --- cpp/cmake_modules/FindParquet.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index d16e6c98f8d..e3350d6e13d 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -29,14 +29,14 @@ endif() # Try the parameterized roots, if they exist if ( _parquet_roots ) - find_path( PARQUET_INCLUDE_DIR NAMES parquet/parquet.h + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h PATHS ${_parquet_roots} NO_DEFAULT_PATH PATH_SUFFIXES "include" ) find_library( PARQUET_LIBRARIES NAMES parquet PATHS ${_parquet_roots} NO_DEFAULT_PATH PATH_SUFFIXES "lib" ) else () - find_path( PARQUET_INCLUDE_DIR NAMES parquet/parquet.h ) + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) find_library( PARQUET_LIBRARIES NAMES parquet ) endif () From 4ec034bbe18bd961a4bac64f2e25dba0472c28c9 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 22 Mar 2016 08:51:23 -0700 Subject: [PATCH 037/210] ARROW-28: Adding google's benchmark library to the toolchain This isn't yet complete, but before I go further I think its worth asking some questions on peoples' preferences: 1. It seems that the build third-party script is setting up an install directory that it is not making use of. Do we want to keep this functionality and start adding new libraries to be placed there? The gtest component of the tool-chain assumes it is in its own location, and this how I patterned google benchmark integration. 2. Do we want to couple unit test builds with benchmark builds? I am currently aiming for having them decoupled and having benchmarks off by default. 3. I am not familiar with the Darwin/mac build environment and it is not clear if the CXX flags are required universally. (I need to fix it anyways to move -DGTEST_USE_OWN_TR1_TUPLE=1 back to be gtest only). Travis-ci might provide the answer. 4. Any other basic features in the benchmark toolchain people would like to see as part of this PR? Wes mentioned starting to create benchmarking tools lib, but I think that likely belongs in a separate PR. 
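For orientation, a minimal self-contained Google Benchmark using the v1.0-era
API that this toolchain pins (i.e. `KeepRunning()` and `range_x()`, the same
calls used in column-benchmark.cc below) looks roughly like the following
sketch; the benchmark body and size range are illustrative only, not part of
this patch:

    #include <vector>
    #include "benchmark/benchmark.h"

    // The argument supplied by Range() below is read back via state.range_x().
    static void BM_VectorPushBack(benchmark::State& state) {
      while (state.KeepRunning()) {
        std::vector<int> v;
        for (int i = 0; i < state.range_x(); ++i) {
          v.push_back(i);
        }
      }
    }
    BENCHMARK(BM_VectorPushBack)->Range(8, 8 << 10);

    BENCHMARK_MAIN();  // a custom main(), as in benchmark_main.cc below, also works
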
Author: Micah Kornfield Closes #29 from emkornfield/emk_add_benchmark and squashes the following commits: dbd4e71 [Micah Kornfield] only run unittests is travis ab21150 [Micah Kornfield] Enable benchmarks in cpp toolchain 40847ee [Micah Kornfield] WIP-Adding google's benchmark library to the toolchain --- ci/travis_before_script_cpp.sh | 2 +- ci/travis_script_cpp.sh | 4 +- cpp/CMakeLists.txt | 88 ++++++++++++- cpp/README.md | 23 +++- cpp/build-support/run-test.sh | 160 ++++++++++++++---------- cpp/cmake_modules/FindGBenchmark.cmake | 88 +++++++++++++ cpp/setup_build_env.sh | 1 + cpp/src/arrow/table/CMakeLists.txt | 2 + cpp/src/arrow/table/column-benchmark.cc | 55 ++++++++ cpp/src/arrow/util/CMakeLists.txt | 14 +++ cpp/src/arrow/util/benchmark_main.cc | 24 ++++ cpp/thirdparty/build_thirdparty.sh | 20 ++- cpp/thirdparty/download_thirdparty.sh | 6 + cpp/thirdparty/versions.sh | 4 + 14 files changed, 415 insertions(+), 76 deletions(-) create mode 100644 cpp/cmake_modules/FindGBenchmark.cmake create mode 100644 cpp/src/arrow/table/column-benchmark.cc create mode 100644 cpp/src/arrow/util/benchmark_main.cc diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 4d5bef8bbdf..49dcc395fbc 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -19,7 +19,7 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -cmake -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +cmake -DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR make -j4 make install diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index 3e843dd759e..d96b98f8d37 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -9,9 +9,9 @@ pushd $CPP_BUILD_DIR make lint if [ $TRAVIS_OS_NAME == "linux" ]; then - valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest + valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest -L unittest else - ctest + ctest -L unittest fi popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f5f60380311..268c1d11e1e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -55,12 +55,21 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" ON) + + option(ARROW_BUILD_BENCHMARKS + "Build the Arrow micro benchmarks" + OFF) + endif() if(NOT ARROW_BUILD_TESTS) set(NO_TESTS 1) endif() +if(NOT ARROW_BUILD_BENCHMARKS) + set(NO_BENCHMARKS 1) +endif() + ############################################################ # Compiler flags @@ -251,9 +260,63 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) ############################################################ -# Testing +# Benchmarking ############################################################ +# Add a new micro benchmark, with or without an executable that should be built. +# If benchmarks are enabled then they will be run along side unit tests with ctest. +# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# respectively. +# +# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component +# (e.g. monotime-benchmark) or contain additional components (e.g. +# net/net_util-benchmark). Either way, the last component must be a globally +# unique name. + +# The benchmark will registered as unit test with ctest with a label +# of 'benchmark'. 
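+#
+# Example usage, mirroring the call this patch adds in
+# src/arrow/table/CMakeLists.txt:
+#
+#   ADD_ARROW_BENCHMARK(column-benchmark)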
+# +# Arguments after the test name will be passed to set_tests_properties(). +function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) + if(NO_BENCHMARKS) + return() + endif() + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) + # This benchmark has a corresponding .cc file, set it up as an executable. + set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") + target_link_libraries(${BENCHMARK_NAME} ${ARROW_BENCHMARK_LINK_LIBS}) + add_dependencies(runbenchmark ${BENCHMARK_NAME}) + set(NO_COLOR "--color_print=false") + else() + # No executable, just invoke the benchmark (probably a script) directly. + set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) + set(NO_COLOR "") + endif() + + add_test(${BENCHMARK_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + if(ARGN) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with NO_BENCHMARKS. +function(ADD_ARROW_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME) + if(NO_BENCHMARKS) + return() + endif() + get_filename_component(BENCMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + add_dependencies(${BENCHMARK_NAME} ${ARGN}) +endfunction() + + +############################################################ +# Testing +############################################################ # Add a new test case, with or without an executable that should be built. # # REL_TEST_NAME is the name of the test. It may be a single component @@ -261,6 +324,9 @@ include_directories(src) # net/net_util-test). Either way, the last component must be a globally # unique name. # +# The unit test is added with a label of "unittest" to support filtering with +# ctest. +# # Arguments after the test name will be passed to set_tests_properties(). function(ADD_ARROW_TEST REL_TEST_NAME) if(NO_TESTS) @@ -273,13 +339,15 @@ function(ADD_ARROW_TEST REL_TEST_NAME) set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) + add_dependencies(unittest ${TEST_NAME}) else() # No executable, just invoke the test (probably a script) directly. 
set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) endif() add_test(${TEST_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH}) + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) + set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") if(ARGN) set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) endif() @@ -335,13 +403,28 @@ if ("$ENV{GTEST_HOME}" STREQUAL "") set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) endif() +## Google Benchmark +if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") + set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) +endif() + + if(ARROW_BUILD_TESTS) + add_custom_target(unittest ctest -L unittest) find_package(GTest REQUIRED) include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(gtest STATIC_LIB ${GTEST_STATIC_LIB}) endif() +if(ARROW_BUILD_BENCHMARKS) + add_custom_target(runbenchmark ctest -L benchmark) + find_package(GBenchmark REQUIRED) + include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(benchmark + STATIC_LIB ${GBENCHMARK_STATIC_LIB}) +endif() + ## Google PerfTools ## ## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment @@ -366,6 +449,7 @@ endif() ############################################################ set(ARROW_MIN_TEST_LIBS arrow arrow_test_main ${ARROW_BASE_LIBS}) set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) +set(ARROW_BENCHMARK_LINK_LIBS arrow arrow_benchmark_main ${ARROW_BASE_LIBS}) ############################################################ # "make ctags" target diff --git a/cpp/README.md b/cpp/README.md index 378dc4e28de..542cce43a13 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -29,16 +29,29 @@ Simple debug build: mkdir debug cd debug cmake .. - make - ctest + make unittest Simple release build: mkdir release cd release cmake .. -DCMAKE_BUILD_TYPE=Release - make - ctest + make unittest + +Detailed unit test logs will be placed in the build directory under `build/test-logs`. + +### Building/Running benchmarks + +Follow the directions for simple build except run cmake +with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: + + cmake -DARROW_BUILD_BENCHMARKS=ON .. + +and instead of make unittest run either `make; ctest` to run both unit tests +and benchmarks or `make runbenchmark` to run only the benchmark tests. + +Benchmark logs will be placed in the build directory under `build/benchmark-logs`. + ### Third-party environment variables @@ -46,3 +59,5 @@ To set up your own specific build toolchain, here are the relevant environment variables * Googletest: `GTEST_HOME` (only required to build the unit tests) +* Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) + diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index b2039134d55..0e628e26ecd 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -16,24 +16,23 @@ # Script which wraps running a test and redirects its output to a # test log directory. # -# If KUDU_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be -# gzip-compressed while they are written. +# Arguments: +# $1 - Base path for logs/artifacts. +# $2 - type of test (e.g. test or benchmark) +# $3 - path to executable +# $ARGN - arguments for executable # -# If KUDU_FLAKY_TEST_ATTEMPTS is non-zero, and the test being run matches -# one of the lines in the file KUDU_FLAKY_TEST_LIST, then the test will -# be retried on failure up to the specified number of times. 
This can be -# used in the gerrit workflow to prevent annoying false -1s caused by -# tests that are known to be flaky in master. -# -# If KUDU_REPORT_TEST_RESULTS is non-zero, then tests are reported to the -# central test server. +OUTPUT_ROOT=$1 +shift ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd) -TEST_LOGDIR=$ROOT/build/test-logs +TEST_LOGDIR=$OUTPUT_ROOT/build/$1-logs mkdir -p $TEST_LOGDIR -TEST_DEBUGDIR=$ROOT/build/test-debug +RUN_TYPE=$1 +shift +TEST_DEBUGDIR=$OUTPUT_ROOT/build/$RUN_TYPE-debug mkdir -p $TEST_DEBUGDIR TEST_DIRNAME=$(cd $(dirname $1); pwd) @@ -43,7 +42,7 @@ TEST_EXECUTABLE="$TEST_DIRNAME/$TEST_FILENAME" TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if any). # We run each test in its own subdir to avoid core file related races. -TEST_WORKDIR=$ROOT/build/test-work/$TEST_NAME +TEST_WORKDIR=$OUTPUT_ROOT/build/test-work/$TEST_NAME mkdir -p $TEST_WORKDIR pushd $TEST_WORKDIR >/dev/null || exit 1 rm -f * @@ -61,55 +60,49 @@ rm -f $LOGFILE $LOGFILE.gz pipe_cmd=cat -# Configure TSAN (ignored if this isn't a TSAN build). -# -# Deadlock detection (new in clang 3.5) is disabled because: -# 1. The clang 3.5 deadlock detector crashes in some unit tests. It -# needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others. -# 2. Many unit tests report lock-order-inversion warnings; they should be -# fixed before reenabling the detector. -TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0" -TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt" -TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" -export TSAN_OPTIONS - -# Enable leak detection even under LLVM 3.4, where it was disabled by default. -# This flag only takes effect when running an ASAN build. -ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" -export ASAN_OPTIONS - -# Set up suppressions for LeakSanitizer -LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" -export LSAN_OPTIONS - -# Suppressions require symbolization. We'll default to using the symbolizer in -# thirdparty. -if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then - export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer) -fi - # Allow for collecting core dumps. ARROW_TEST_ULIMIT_CORE=${ARROW_TEST_ULIMIT_CORE:-0} ulimit -c $ARROW_TEST_ULIMIT_CORE -# Run the actual test. -for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do - if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then - # If the test fails, the test output may or may not be left behind, - # depending on whether the test cleaned up or exited immediately. Either - # way we need to clean it up. We do this by comparing the data directory - # contents before and after the test runs, and deleting anything new. - # - # The comm program requires that its two inputs be sorted. - TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + +function setup_sanitizers() { + # Sets environment variables for different sanitizers (it configures how) the run_tests. Function works. + + # Configure TSAN (ignored if this isn't a TSAN build). + # + # Deadlock detection (new in clang 3.5) is disabled because: + # 1. The clang 3.5 deadlock detector crashes in some unit tests. It + # needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others. + # 2. Many unit tests report lock-order-inversion warnings; they should be + # fixed before reenabling the detector. 
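+  # Note: the *SAN_OPTIONS variables are plain space-separated option strings
+  # read by the sanitizer runtimes at process startup; appending to them
+  # (rather than assigning) preserves any options already set by the caller.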
+ TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0" + TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt" + TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" + export TSAN_OPTIONS + + # Enable leak detection even under LLVM 3.4, where it was disabled by default. + # This flag only takes effect when running an ASAN build. + ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" + export ASAN_OPTIONS + + # Set up suppressions for LeakSanitizer + LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" + export LSAN_OPTIONS + + # Suppressions require symbolization. We'll default to using the symbolizer in + # thirdparty. + if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then + export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer) fi +} + +function run_test() { + # Run gtest style tests with sanitizers if they are setup appropriately. # gtest won't overwrite old junit test files, resulting in a build failure # even when retries are successful. rm -f $XMLFILE - echo "Running $TEST_NAME, redirecting output into $LOGFILE" \ - "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)" $TEST_EXECUTABLE "$@" 2>&1 \ | $ROOT/build-support/asan_symbolize.py \ | c++filt \ @@ -131,6 +124,46 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do STATUS=1 rm -f $XMLFILE fi +} + +function post_process_tests() { + # If we have a LeakSanitizer report, and XML reporting is configured, add a new test + # case result to the XML file for the leak report. Otherwise Jenkins won't show + # us which tests had LSAN errors. + if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then + echo Test had memory leaks. Editing XML + perl -p -i -e ' + if (m##) { + print "\n"; + print " \n"; + print " See txt log file for details\n"; + print " \n"; + print "\n"; + }' $XMLFILE + fi +} + +function run_other() { + # Generic run function for test like executables that aren't actually gtest + $TEST_EXECUTABLE "$@" 2>&1 | $pipe_cmd > $LOGFILE + STATUS=$? +} + +if [ $RUN_TYPE = "test" ]; then + setup_sanitizers +fi + +# Run the actual test. +for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do + if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then + # If the test fails, the test output may or may not be left behind, + # depending on whether the test cleaned up or exited immediately. Either + # way we need to clean it up. We do this by comparing the data directory + # contents before and after the test runs, and deleting anything new. + # + # The comm program requires that its two inputs be sorted. + TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + fi if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then # Now delete any new test output. @@ -150,7 +183,13 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do fi done fi - + echo "Running $TEST_NAME, redirecting output into $LOGFILE" \ + "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)" + if [ $RUN_TYPE = "test" ]; then + run_test $* + else + run_other $* + fi if [ "$STATUS" -eq "0" ]; then break elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then @@ -159,19 +198,8 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do fi done -# If we have a LeakSanitizer report, and XML reporting is configured, add a new test -# case result to the XML file for the leak report. Otherwise Jenkins won't show -# us which tests had LSAN errors. 
-if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then - echo Test had memory leaks. Editing XML - perl -p -i -e ' - if (m##) { - print "\n"; - print " \n"; - print " See txt log file for details\n"; - print " \n"; - print "\n"; - }' $XMLFILE +if [ $RUN_TYPE = "test" ]; then + post_process_tests fi # Capture and compress core file and binary. diff --git a/cpp/cmake_modules/FindGBenchmark.cmake b/cpp/cmake_modules/FindGBenchmark.cmake new file mode 100644 index 00000000000..3e46a60f5e6 --- /dev/null +++ b/cpp/cmake_modules/FindGBenchmark.cmake @@ -0,0 +1,88 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find Google benchmark headers and libraries. +# +# Usage of this module as follows: +# +# find_package(GBenchark) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# GBenchmark_HOME - When set, this path is inspected instead of standard library +# locations as the root of the benchark installation. +# The environment variable GBENCHMARK_HOME overrides this veriable. +# +# This module defines +# GBENCHMARK_INCLUDE_DIR, directory containing benchmark header directory +# GBENCHMARK_LIBS, directory containing benchmark libraries +# GBENCHMARK_STATIC_LIB, path to libbenchmark.a +# GBENCHMARK_FOUND, whether gbenchmark has been found + +if( NOT "$ENV{GBENCHMARK_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{GBENCHMARK_HOME}" _native_path ) + list( APPEND _gbenchmark_roots ${_native_path} ) +elseif ( GBenchmark_HOME ) + list( APPEND _gbenchmark_roots ${GBenchmark_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _gbenchmark_roots ) + find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.h + PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( GBENCHMARK_LIBRARIES NAMES benchmark + PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.hh ) + find_library( GBENCHMARK_LIBRARIES NAMES benchmark ) +endif () + + +if (GBENCHMARK_INCLUDE_DIR AND GBENCHMARK_LIBRARIES) + set(GBENCHMARK_FOUND TRUE) + get_filename_component( GBENCHMARK_LIBS ${GBENCHMARK_LIBRARIES} PATH ) + set(GBENCHMARK_LIB_NAME libbenchmark) + set(GBENCHMARK_STATIC_LIB ${GBENCHMARK_LIBS}/${GBENCHMARK_LIB_NAME}.a) +else () + set(GBENCHMARK_FOUND FALSE) +endif () + +if (GBENCHMARK_FOUND) + if (NOT GBenchmark_FIND_QUIETLY) + message(STATUS "Found the GBenchmark library: ${GBENCHMARK_LIBRARIES}") + endif () +else () + if (NOT GBenchmark_FIND_QUIETLY) + set(GBENCHMARK_ERR_MSG "Could not find the GBenchmark library. 
Looked in ") + if ( _gbenchmark_roots ) + set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} in ${_gbenchmark_roots}.") + else () + set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} system search paths.") + endif () + if (GBenchmark_FIND_REQUIRED) + message(FATAL_ERROR "${GBENCHMARK_ERR_MSG}") + else (GBenchmark_FIND_REQUIRED) + message(STATUS "${GBENCHMARK_ERR_MSG}") + endif (GBenchmark_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + GBENCHMARK_INCLUDE_DIR + GBENCHMARK_LIBS + GBENCHMARK_LIBRARIES + GBENCHMARK_STATIC_LIB +) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index 1a33fe386f1..04688e7d594 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -7,5 +7,6 @@ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) source thirdparty/versions.sh export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR +export GBENCHMARK_HOME=$SOURCE_DIR/thirdparty/installed echo "Build env initialized" diff --git a/cpp/src/arrow/table/CMakeLists.txt b/cpp/src/arrow/table/CMakeLists.txt index 26d843d853b..d9f00e74a37 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/cpp/src/arrow/table/CMakeLists.txt @@ -29,3 +29,5 @@ install(FILES ADD_ARROW_TEST(column-test) ADD_ARROW_TEST(schema-test) ADD_ARROW_TEST(table-test) + +ADD_ARROW_BENCHMARK(column-benchmark) diff --git a/cpp/src/arrow/table/column-benchmark.cc b/cpp/src/arrow/table/column-benchmark.cc new file mode 100644 index 00000000000..c01146d7b09 --- /dev/null +++ b/cpp/src/arrow/table/column-benchmark.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +#include "benchmark/benchmark.h" + +#include "arrow/test-util.h" +#include "arrow/table/test-common.h" +#include "arrow/types/integer.h" +#include "arrow/util/memory-pool.h" + +namespace arrow { +namespace { + template + std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { + auto pool = GetDefaultMemoryPool(); + auto data = std::make_shared(pool); + auto nulls = std::make_shared(pool); + data->Resize(length * sizeof(typename ArrayType::value_type)); + nulls->Resize(util::bytes_for_bits(length)); + return std::make_shared(length, data, 10, nulls); + } +} // anonymous namespace + + +static void BM_BuildInt32ColumnByChunk(benchmark::State& state) { //NOLINT non-const reference + ArrayVector arrays; + for (int chunk_n = 0; chunk_n < state.range_x(); ++chunk_n) { + arrays.push_back(MakePrimitive(100, 10)); + } + const auto INT32 = std::make_shared(); + const auto field = std::make_shared("c0", INT32); + std::unique_ptr column; + while (state.KeepRunning()) { + column.reset(new Column(field, arrays)); + } +} + +BENCHMARK(BM_BuildInt32ColumnByChunk)->Range(5, 50000); + +} // namespace arrow diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index d8e2f98f2c8..fed05e3690c 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -51,6 +51,20 @@ if (ARROW_BUILD_TESTS) endif() endif() +if (ARROW_BUILD_BENCHMARKS) + add_library(arrow_benchmark_main benchmark_main.cc) + if (APPLE) + target_link_libraries(arrow_benchmark_main + benchmark + ) + else() + target_link_libraries(arrow_benchmark_main + benchmark + pthread + ) + endif() +endif() + ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(memory-pool-test) diff --git a/cpp/src/arrow/util/benchmark_main.cc b/cpp/src/arrow/util/benchmark_main.cc new file mode 100644 index 00000000000..c9739af03fb --- /dev/null +++ b/cpp/src/arrow/util/benchmark_main.cc @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index beb24880359..294737cc505 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -16,6 +16,7 @@ else for arg in "$*"; do case $arg in "gtest") F_GTEST=1 ;; + "gbenchmark") F_GBENCHMARK=1 ;; *) echo "Unknown module: $arg"; exit 1 ;; esac done @@ -47,13 +48,15 @@ export PATH=$PREFIX/bin:$PATH type cmake >/dev/null 2>&1 || { echo >&2 "cmake not installed. Aborting."; exit 1; } type make >/dev/null 2>&1 || { echo >&2 "make not installed. 
Aborting."; exit 1; } +STANDARD_DARWIN_FLAGS="-std=c++11 -stdlib=libc++" + # build googletest GOOGLETEST_ERROR="failed for googletest!" if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then cd $TP_DIR/$GTEST_BASEDIR if [[ "$OSTYPE" == "darwin"* ]]; then - CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="-std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit 1; } + CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="$STANDARD_DARWIN_FLAGS -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit 1; } else CXXFLAGS=-fPIC cmake . || { echo "cmake $GOOGLETEST_ERROR"; exit 1; } fi @@ -61,5 +64,20 @@ if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then make VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } fi +# build google benchmark +GBENCHMARK_ERROR="failed for google benchmark" +if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then + cd $TP_DIR/$GBENCHMARK_BASEDIR + + CMAKE_CXX_FLAGS="--std=c++11" + if [[ "$OSTYPE" == "darwin"* ]]; then + CMAKE_CXX_FLAGS=$STANDARD_DARWIN_FLAGS + fi + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_CXX_FLAGS="-fPIC $CMAKE_CXX_FLAGS" . || { echo "cmake $GBENCHMARK_ERROR" ; exit 1; } + + make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } +fi + + echo "---------------------" echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index c18dd4d8e80..d22c559b3e3 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -19,3 +19,9 @@ if [ ! -d ${GTEST_BASEDIR} ]; then echo "Fetching gtest" download_extract_and_cleanup $GTEST_URL fi + +echo ${GBENCHMARK_BASEDIR} +if [ ! -d ${GBENCHMARK_BASEDIR} ]; then + echo "Fetching google benchmark" + download_extract_and_cleanup $GBENCHMARK_URL +fi diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh index 12ad56ef001..9cfc7cd94b5 100755 --- a/cpp/thirdparty/versions.sh +++ b/cpp/thirdparty/versions.sh @@ -1,3 +1,7 @@ GTEST_VERSION=1.7.0 GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" GTEST_BASEDIR=googletest-release-$GTEST_VERSION + +GBENCHMARK_VERSION=1.0.0 +GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" +GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION From 093f9bd8c30b1b77b3e6e7a4123cab9a6dd9daa1 Mon Sep 17 00:00:00 2001 From: Dan Robinson Date: Tue, 22 Mar 2016 14:15:38 -0700 Subject: [PATCH 038/210] ARROW-75: Fix handling of empty strings Fixes [ARROW-75](https://issues.apache.org/jira/browse/ARROW-75) (and changes Python tests to verify that behavior). 
Author: Dan Robinson Closes #32 from danrobinson/ARROW-75 and squashes the following commits: cb8e527 [Dan Robinson] ARROW-75: remove whitespace 9604a21 [Dan Robinson] ARROW-75: Changed tests 722df19 [Dan Robinson] ARROW-75: Fixed braces 1ef3b75 [Dan Robinson] ARROW-75: Fix handling of empty strings --- cpp/src/arrow/types/primitive.h | 4 +++- cpp/src/arrow/types/string-test.cc | 2 +- python/pyarrow/tests/test_array.py | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 1073bb6e1c3..22ab59c309a 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -168,7 +168,9 @@ class PrimitiveBuilder : public ArrayBuilder { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } - memcpy(raw_buffer() + length_, values, length * elsize_); + if (length > 0) { + memcpy(raw_buffer() + length_, values, length * elsize_); + } if (null_bytes != nullptr) { AppendNulls(null_bytes, length); diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 8e82fd95dd8..6381093dcbb 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -181,7 +181,7 @@ class TestStringBuilder : public TestBuilder { }; TEST_F(TestStringBuilder, TestScalarAppend) { - std::vector strings = {"a", "bb", "", "", "ccc"}; + std::vector strings = {"", "bb", "a", "", "ccc"}; std::vector is_null = {0, 0, 0, 1, 0}; int N = strings.size(); diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 034c1576551..36aaaa4f93d 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -39,13 +39,13 @@ def test_list_format(self): assert result == expected def test_string_format(self): - arr = pyarrow.from_pylist(['foo', None, 'bar']) + arr = pyarrow.from_pylist(['', None, 'foo']) result = fmt.array_format(arr) expected = """\ [ - 'foo', + '', NA, - 'bar' + 'foo' ]""" assert result == expected From 65db0da80b6a1fb6887b7ac1df24e2423d41dfb9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 22 Mar 2016 18:45:13 -0700 Subject: [PATCH 039/210] ARROW-67: C++ metadata flatbuffer serialization and data movement to memory maps Several things here: * Add Google flatbuffers dependency * Flatbuffers IDL draft in collaboration with @jacques-n and @stevenmphillips * Add Schema wrapper in Cython * arrow::Schema conversion to/from flatbuffer representation * Remove unneeded physical layout types from type.h * Refactor ListType to be a nested type with a single child * Implement shared memory round-trip for numeric row batches * mmap-based shared memory interface and MemorySource abstract API Quite a bit of judicious code cleaning and consolidation as part of this. For example, List types are now internally equivalent to a nested type with 1 named child field (versus a struct, which can have any number of child fields). Associated JIRAs: ARROW-48, ARROW-57, ARROW-58 Author: Wes McKinney Closes #28 from wesm/cpp-ipc-draft and squashes the following commits: 0cef7ea [Wes McKinney] Add NullArray type now that Array is virtual, fix pyarrow build 5e841f7 [Wes McKinney] Create explicit PrimitiveArray subclasses to avoid unwanted template instantiation 6fa6319 [Wes McKinney] ARROW-28: Draft C++ shared memory IPC workflow and related refactoring / scaffolding / cleaning. 
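For reviewers new to the mechanism underneath the mmap-based MemorySource: the
shared-memory round trip rests on POSIX memory-mapped files. The sketch below
is generic POSIX code, not the API introduced in this patch; the file path and
payload are made up:

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <unistd.h>
    #include <cstdio>
    #include <cstring>

    int main() {
      const char* path = "/tmp/arrow-ipc-demo";  // hypothetical scratch file
      const char payload[] = "row batch bytes";
      const size_t size = sizeof(payload);

      // Writer: create the file, size it, map it, and copy the payload in.
      int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
      if (fd < 0 || ftruncate(fd, size) != 0) return 1;
      void* wptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
      if (wptr == MAP_FAILED) return 1;
      std::memcpy(wptr, payload, size);
      munmap(wptr, size);
      close(fd);

      // Reader: map the same file read-only; another process could do the same.
      fd = open(path, O_RDONLY);
      void* rptr = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0);
      if (rptr == MAP_FAILED) return 1;
      std::printf("read back: %s\n", static_cast<const char*>(rptr));
      munmap(rptr, size);
      close(fd);
      return 0;
    }
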
--- ci/travis_before_script_cpp.sh | 9 +- ci/travis_script_cpp.sh | 6 +- cpp/CMakeLists.txt | 96 ++++-- cpp/cmake_modules/FindFlatbuffers.cmake | 95 ++++++ cpp/setup_build_env.sh | 5 +- cpp/src/arrow/CMakeLists.txt | 8 + cpp/src/arrow/api.h | 11 +- cpp/src/arrow/array-test.cc | 14 +- cpp/src/arrow/array.cc | 26 +- cpp/src/arrow/array.h | 27 +- cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/{table => }/column-benchmark.cc | 5 +- cpp/src/arrow/{table => }/column-test.cc | 10 +- cpp/src/arrow/{table => }/column.cc | 4 +- cpp/src/arrow/{table => }/column.h | 13 +- cpp/src/arrow/ipc/.gitignore | 1 + cpp/src/arrow/ipc/CMakeLists.txt | 51 +++ cpp/src/arrow/ipc/adapter.cc | 305 +++++++++++++++++ cpp/src/arrow/ipc/adapter.h | 86 +++++ cpp/src/arrow/ipc/ipc-adapter-test.cc | 112 +++++++ cpp/src/arrow/ipc/ipc-memory-test.cc | 82 +++++ cpp/src/arrow/ipc/ipc-metadata-test.cc | 99 ++++++ cpp/src/arrow/ipc/memory.cc | 162 +++++++++ cpp/src/arrow/ipc/memory.h | 131 ++++++++ cpp/src/arrow/ipc/metadata-internal.cc | 317 ++++++++++++++++++ cpp/src/arrow/ipc/metadata-internal.h | 69 ++++ cpp/src/arrow/ipc/metadata.cc | 238 +++++++++++++ cpp/src/arrow/ipc/metadata.h | 146 ++++++++ .../{types/floating.h => ipc/test-common.h} | 43 ++- cpp/src/arrow/{table => }/schema-test.cc | 48 ++- cpp/src/arrow/{table => }/schema.cc | 11 +- cpp/src/arrow/{table => }/schema.h | 8 +- cpp/src/arrow/{table => }/table-test.cc | 18 +- cpp/src/arrow/{table => }/table.cc | 35 +- cpp/src/arrow/{table => }/table.h | 58 +++- cpp/src/arrow/table/test-common.h | 54 --- cpp/src/arrow/test-util.h | 68 +++- cpp/src/arrow/type.cc | 24 +- cpp/src/arrow/type.h | 177 ++++------ cpp/src/arrow/types/CMakeLists.txt | 2 - cpp/src/arrow/types/boolean.h | 2 +- cpp/src/arrow/types/collection.h | 2 +- cpp/src/arrow/types/construct.cc | 53 +-- cpp/src/arrow/types/construct.h | 11 +- cpp/src/arrow/types/datetime.h | 16 +- cpp/src/arrow/types/floating.cc | 22 -- cpp/src/arrow/types/integer.cc | 22 -- cpp/src/arrow/types/integer.h | 57 ---- cpp/src/arrow/types/json.cc | 1 - cpp/src/arrow/types/json.h | 4 +- cpp/src/arrow/types/list-test.cc | 28 +- cpp/src/arrow/types/list.cc | 29 ++ cpp/src/arrow/types/list.h | 28 +- cpp/src/arrow/types/primitive-test.cc | 41 +-- cpp/src/arrow/types/primitive.cc | 16 +- cpp/src/arrow/types/primitive.h | 102 +++--- cpp/src/arrow/types/string-test.cc | 54 ++- cpp/src/arrow/types/string.h | 55 +-- cpp/src/arrow/types/struct-test.cc | 15 +- cpp/src/arrow/types/test-common.h | 5 +- cpp/src/arrow/types/union.h | 18 +- cpp/src/arrow/util/bit-util-test.cc | 4 +- cpp/src/arrow/util/bit-util.h | 1 - cpp/src/arrow/util/buffer-test.cc | 3 +- cpp/src/arrow/util/buffer.cc | 2 +- cpp/src/arrow/util/memory-pool-test.cc | 7 +- cpp/src/arrow/util/memory-pool.cc | 6 +- cpp/src/arrow/util/memory-pool.h | 2 +- cpp/src/arrow/util/status.cc | 3 + cpp/src/arrow/util/status.h | 6 + cpp/src/arrow/util/test_main.cc | 2 +- cpp/thirdparty/build_thirdparty.sh | 9 + cpp/thirdparty/download_thirdparty.sh | 5 + cpp/thirdparty/versions.sh | 4 + format/Message.fbs | 183 ++++++++++ python/pyarrow/__init__.py | 4 +- python/pyarrow/array.pxd | 2 +- python/pyarrow/array.pyx | 47 ++- python/pyarrow/includes/libarrow.pxd | 107 ++++-- python/pyarrow/includes/pyarrow.pxd | 5 +- python/pyarrow/scalar.pyx | 24 +- python/pyarrow/schema.pxd | 6 +- python/pyarrow/schema.pyx | 155 ++++++--- python/pyarrow/tests/test_schema.py | 28 +- .../pyarrow/tests/test_table.py | 39 ++- python/src/pyarrow/adapters/builtin.cc | 20 +- python/src/pyarrow/helpers.cc | 15 +- 
python/src/pyarrow/helpers.h | 5 +- 88 files changed, 3113 insertions(+), 838 deletions(-) create mode 100644 cpp/cmake_modules/FindFlatbuffers.cmake rename cpp/src/arrow/{table => }/column-benchmark.cc (94%) rename cpp/src/arrow/{table => }/column-test.cc (93%) rename cpp/src/arrow/{table => }/column.cc (96%) rename cpp/src/arrow/{table => }/column.h (93%) create mode 100644 cpp/src/arrow/ipc/.gitignore create mode 100644 cpp/src/arrow/ipc/CMakeLists.txt create mode 100644 cpp/src/arrow/ipc/adapter.cc create mode 100644 cpp/src/arrow/ipc/adapter.h create mode 100644 cpp/src/arrow/ipc/ipc-adapter-test.cc create mode 100644 cpp/src/arrow/ipc/ipc-memory-test.cc create mode 100644 cpp/src/arrow/ipc/ipc-metadata-test.cc create mode 100644 cpp/src/arrow/ipc/memory.cc create mode 100644 cpp/src/arrow/ipc/memory.h create mode 100644 cpp/src/arrow/ipc/metadata-internal.cc create mode 100644 cpp/src/arrow/ipc/metadata-internal.h create mode 100644 cpp/src/arrow/ipc/metadata.cc create mode 100644 cpp/src/arrow/ipc/metadata.h rename cpp/src/arrow/{types/floating.h => ipc/test-common.h} (59%) rename cpp/src/arrow/{table => }/schema-test.cc (72%) rename cpp/src/arrow/{table => }/schema.cc (88%) rename cpp/src/arrow/{table => }/schema.h (91%) rename cpp/src/arrow/{table => }/table-test.cc (92%) rename cpp/src/arrow/{table => }/table.cc (69%) rename cpp/src/arrow/{table => }/table.h (55%) delete mode 100644 cpp/src/arrow/table/test-common.h delete mode 100644 cpp/src/arrow/types/floating.cc delete mode 100644 cpp/src/arrow/types/integer.cc delete mode 100644 cpp/src/arrow/types/integer.h create mode 100644 format/Message.fbs rename cpp/src/arrow/table/CMakeLists.txt => python/pyarrow/tests/test_table.py (58%) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 49dcc395fbc..193c76feba1 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -19,7 +19,14 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -cmake -DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" + +if [ $TRAVIS_OS_NAME == "linux" ]; then + cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +else + cmake $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR +fi + make -j4 make install diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index d96b98f8d37..997bdf35e83 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -8,10 +8,6 @@ pushd $CPP_BUILD_DIR make lint -if [ $TRAVIS_OS_NAME == "linux" ]; then - valgrind --tool=memcheck --leak-check=yes --error-exitcode=1 ctest -L unittest -else - ctest -L unittest -fi +ctest -L unittest popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 268c1d11e1e..6d701079b48 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -51,7 +51,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(ARROW_PARQUET "Build the Parquet adapter and link to libparquet" OFF) - + option(ARROW_TEST_MEMCHECK + "Run the test suite using valgrind --tool=memcheck" + OFF) option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" ON) @@ -60,6 +62,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow micro benchmarks" OFF) + option(ARROW_IPC + "Build the Arrow IPC extensions" + ON) + endif() if(NOT ARROW_BUILD_TESTS) @@ -260,17 +266,17 @@ 
set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) ############################################################ -# Benchmarking +# Benchmarking ############################################################ # Add a new micro benchmark, with or without an executable that should be built. # If benchmarks are enabled then they will be run along side unit tests with ctest. -# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, # respectively. # # REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component # (e.g. monotime-benchmark) or contain additional components (e.g. # net/net_util-benchmark). Either way, the last component must be a globally -# unique name. +# unique name. # The benchmark will registered as unit test with ctest with a label # of 'benchmark'. @@ -281,7 +287,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) return() endif() get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) - + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) # This benchmark has a corresponding .cc file, set it up as an executable. set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") @@ -294,7 +300,7 @@ function(ADD_ARROW_BENCHMARK REL_BENCHMARK_NAME) set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) set(NO_COLOR "") endif() - + add_test(${BENCHMARK_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") @@ -345,9 +351,18 @@ function(ADD_ARROW_TEST REL_TEST_NAME) set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) endif() - add_test(${TEST_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) + if (ARROW_TEST_MEMCHECK) + SET_PROPERTY(TARGET ${TEST_NAME} + APPEND_STRING PROPERTY + COMPILE_FLAGS " -DARROW_VALGRIND") + add_test(${TEST_NAME} + valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH}) + else() + add_test(${TEST_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) + endif() set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") + if(ARGN) set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) endif() @@ -403,7 +418,7 @@ if ("$ENV{GTEST_HOME}" STREQUAL "") set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) endif() -## Google Benchmark +## Google Benchmark if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) endif() @@ -487,24 +502,10 @@ if (UNIX) add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py --verbose=2 --linelength=90 - --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11 - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h`) + --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) endif (UNIX) -#---------------------------------------------------------------------- -# Parquet adapter - -if(ARROW_PARQUET) - find_package(Parquet REQUIRED) - include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(parquet - STATIC_LIB ${PARQUET_STATIC_LIB} - SHARED_LIB ${PARQUET_SHARED_LIB}) - - add_subdirectory(src/arrow/parquet) - list(APPEND LINK_LIBS arrow_parquet parquet) -endif() - 
############################################################ # Subdirectories ############################################################ @@ -515,15 +516,18 @@ set(LIBARROW_LINK_LIBS set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc + src/arrow/column.cc + src/arrow/schema.cc + src/arrow/table.cc src/arrow/type.cc - src/arrow/table/column.cc - src/arrow/table/schema.cc - src/arrow/table/table.cc + # IPC / Shared memory library; to be turned into an optional component + src/arrow/ipc/adapter.cc + src/arrow/ipc/memory.cc + src/arrow/ipc/metadata.cc + src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc - src/arrow/types/floating.cc - src/arrow/types/integer.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc @@ -559,9 +563,39 @@ target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) add_subdirectory(src/arrow) add_subdirectory(src/arrow/util) -add_subdirectory(src/arrow/table) add_subdirectory(src/arrow/types) install(TARGETS arrow LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) + +#---------------------------------------------------------------------- +# Parquet adapter library + +if(ARROW_PARQUET) + find_package(Parquet REQUIRED) + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ADD_THIRDPARTY_LIB(parquet + STATIC_LIB ${PARQUET_STATIC_LIB} + SHARED_LIB ${PARQUET_SHARED_LIB}) + + add_subdirectory(src/arrow/parquet) + list(APPEND LINK_LIBS arrow_parquet parquet) +endif() + +#---------------------------------------------------------------------- +# IPC library + +## Flatbuffers +if(ARROW_IPC) + find_package(Flatbuffers REQUIRED) + message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") + message(STATUS "Flatbuffers static library: ${FLATBUFFERS_STATIC_LIB}") + message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") + include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) + add_library(flatbuffers STATIC IMPORTED) + set_target_properties(flatbuffers PROPERTIES + IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB}) + + add_subdirectory(src/arrow/ipc) +endif() diff --git a/cpp/cmake_modules/FindFlatbuffers.cmake b/cpp/cmake_modules/FindFlatbuffers.cmake new file mode 100644 index 00000000000..ee472d1c899 --- /dev/null +++ b/cpp/cmake_modules/FindFlatbuffers.cmake @@ -0,0 +1,95 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find Flatbuffers headers and libraries. +# +# Usage of this module as follows: +# +# find_package(Flatbuffers) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Flatbuffers_HOME - +# When set, this path is inspected instead of standard library locations as +# the root of the Flatbuffers installation. The environment variable +# FLATBUFFERS_HOME overrides this veriable. 
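+#
+# Example, mirroring how the top-level CMakeLists.txt in this patch consumes
+# the module:
+#
+#   find_package(Flatbuffers REQUIRED)
+#   include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR})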
+# +# This module defines +# FLATBUFFERS_INCLUDE_DIR, directory containing headers +# FLATBUFFERS_LIBS, directory containing flatbuffers libraries +# FLATBUFFERS_STATIC_LIB, path to libflatbuffers.a +# FLATBUFFERS_FOUND, whether flatbuffers has been found + +if( NOT "$ENV{FLATBUFFERS_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{FLATBUFFERS_HOME}" _native_path ) + list( APPEND _flatbuffers_roots ${_native_path} ) +elseif ( Flatbuffers_HOME ) + list( APPEND _flatbuffers_roots ${Flatbuffers_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _flatbuffers_roots ) + find_path( FLATBUFFERS_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h + PATHS ${_flatbuffers_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( FLATBUFFERS_LIBRARIES NAMES flatbuffers + PATHS ${_flatbuffers_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( FLATBUFFERS_INCLUDE_DIR NAMES flatbuffers/flatbuffers.h ) + find_library( FLATBUFFERS_LIBRARIES NAMES flatbuffers ) +endif () + +find_program(FLATBUFFERS_COMPILER flatc + $ENV{FLATBUFFERS_HOME}/bin + /usr/local/bin + /usr/bin + NO_DEFAULT_PATH +) + +if (FLATBUFFERS_INCLUDE_DIR AND FLATBUFFERS_LIBRARIES) + set(FLATBUFFERS_FOUND TRUE) + get_filename_component( FLATBUFFERS_LIBS ${FLATBUFFERS_LIBRARIES} PATH ) + set(FLATBUFFERS_LIB_NAME libflatbuffers) + set(FLATBUFFERS_STATIC_LIB ${FLATBUFFERS_LIBS}/${FLATBUFFERS_LIB_NAME}.a) +else () + set(FLATBUFFERS_FOUND FALSE) +endif () + +if (FLATBUFFERS_FOUND) + if (NOT Flatbuffers_FIND_QUIETLY) + message(STATUS "Found the Flatbuffers library: ${FLATBUFFERS_LIBRARIES}") + endif () +else () + if (NOT Flatbuffers_FIND_QUIETLY) + set(FLATBUFFERS_ERR_MSG "Could not find the Flatbuffers library. Looked in ") + if ( _flatbuffers_roots ) + set(FLATBUFFERS_ERR_MSG "${FLATBUFFERS_ERR_MSG} in ${_flatbuffers_roots}.") + else () + set(FLATBUFFERS_ERR_MSG "${FLATBUFFERS_ERR_MSG} system search paths.") + endif () + if (Flatbuffers_FIND_REQUIRED) + message(FATAL_ERROR "${FLATBUFFERS_ERR_MSG}") + else (Flatbuffers_FIND_REQUIRED) + message(STATUS "${FLATBUFFERS_ERR_MSG}") + endif (Flatbuffers_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + FLATBUFFERS_INCLUDE_DIR + FLATBUFFERS_LIBS + FLATBUFFERS_STATIC_LIB + FLATBUFFERS_COMPILER +) diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index 04688e7d594..6520dbd43f7 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -2,11 +2,12 @@ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } -./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } +./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } +./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } source thirdparty/versions.sh export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR export GBENCHMARK_HOME=$SOURCE_DIR/thirdparty/installed +export FLATBUFFERS_HOME=$SOURCE_DIR/thirdparty/installed echo "Build env initialized" diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 73e6a9b22c9..2d42edcfbd4 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -19,7 +19,10 @@ install(FILES api.h array.h + column.h builder.h + schema.h + table.h type.h DESTINATION include/arrow) @@ -30,3 +33,8 @@ install(FILES set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) ADD_ARROW_TEST(array-test) +ADD_ARROW_TEST(column-test) +ADD_ARROW_TEST(schema-test) 
+ADD_ARROW_TEST(table-test) + +ADD_ARROW_BENCHMARK(column-benchmark) diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index c73d4b386cf..7be7f88c22e 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -22,20 +22,19 @@ #include "arrow/array.h" #include "arrow/builder.h" +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/type.h" -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" - #include "arrow/types/boolean.h" #include "arrow/types/construct.h" -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index df827aaa113..eded5941e89 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -15,30 +15,26 @@ // specific language governing permissions and limitations // under the License. -#include - #include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" namespace arrow { -static TypePtr int32 = TypePtr(new Int32Type()); - class TestArray : public ::testing::Test { public: void SetUp() { - pool_ = GetDefaultMemoryPool(); + pool_ = default_memory_pool(); } protected: @@ -75,10 +71,10 @@ TEST_F(TestArray, TestIsNull) { if (x > 0) ++null_count; } - std::shared_ptr null_buf = bytes_to_null_buffer(nulls.data(), + std::shared_ptr null_buf = test::bytes_to_null_buffer(nulls.data(), nulls.size()); std::unique_ptr arr; - arr.reset(new Array(int32, nulls.size(), null_count, null_buf)); + arr.reset(new Int32Array(nulls.size(), nullptr, null_count, null_buf)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index ee4ef66d11e..5a5bc1069db 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -28,11 +28,6 @@ namespace arrow { Array::Array(const TypePtr& type, int32_t length, int32_t null_count, const std::shared_ptr& nulls) { - Init(type, length, null_count, nulls); -} - -void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr& nulls) { type_ = type; length_ = length; null_count_ = null_count; @@ -42,4 +37,25 @@ void Array::Init(const TypePtr& type, int32_t length, int32_t null_count, } } +bool Array::EqualsExact(const Array& other) const { + if (this == &other) return true; + if (length_ != other.length_ || null_count_ != other.null_count_ || + type_enum() != other.type_enum()) { + return false; + } + if (null_count_ > 0) { + return nulls_->Equals(*other.nulls_, util::bytes_for_bits(length_)); + } else { + return true; + } +} + +bool NullArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (Type::NA != arr->type_enum()) { + return false; + } + return arr->length() == length_; +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 85e853e2ae5..65fc0aaf583 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -40,20 +40,11 @@ class Buffer; // explicitly increment its reference count class Array { 
public: - Array() : - null_count_(0), - length_(0), - nulls_(nullptr), - null_bits_(nullptr) {} - Array(const TypePtr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); virtual ~Array() {} - void Init(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr& nulls); - // Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { return null_count_ > 0 && util::get_bit(null_bits_, i); @@ -63,12 +54,15 @@ class Array { int32_t null_count() const { return null_count_;} const std::shared_ptr& type() const { return type_;} - LogicalType::type logical_type() const { return type_->type;} + Type::type type_enum() const { return type_->type;} const std::shared_ptr& nulls() const { return nulls_; } + bool EqualsExact(const Array& arr) const; + virtual bool Equals(const std::shared_ptr& arr) const = 0; + protected: TypePtr type_; int32_t null_count_; @@ -78,9 +72,22 @@ class Array { const uint8_t* null_bits_; private: + Array() {} DISALLOW_COPY_AND_ASSIGN(Array); }; +// Degenerate null type Array +class NullArray : public Array { + public: + NullArray(const std::shared_ptr& type, int32_t length) : + Array(type, length, length, nullptr) {} + + explicit NullArray(int32_t length) : + NullArray(std::make_shared(), length) {} + + bool Equals(const std::shared_ptr& arr) const override; +}; + typedef std::shared_ptr ArrayPtr; } // namespace arrow diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 8cc689c3e81..d5d1fdf95af 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -99,7 +99,7 @@ class ArrayBuilder { int32_t capacity_; // Child value array builders. These are owned by this class - std::vector > children_; + std::vector> children_; private: DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); diff --git a/cpp/src/arrow/table/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc similarity index 94% rename from cpp/src/arrow/table/column-benchmark.cc rename to cpp/src/arrow/column-benchmark.cc index c01146d7b09..69ee52c3e09 100644 --- a/cpp/src/arrow/table/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -19,15 +19,14 @@ #include "benchmark/benchmark.h" #include "arrow/test-util.h" -#include "arrow/table/test-common.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" #include "arrow/util/memory-pool.h" namespace arrow { namespace { template std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { - auto pool = GetDefaultMemoryPool(); + auto pool = default_memory_pool(); auto data = std::make_shared(pool); auto nulls = std::make_shared(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); diff --git a/cpp/src/arrow/table/column-test.cc b/cpp/src/arrow/column-test.cc similarity index 93% rename from cpp/src/arrow/table/column-test.cc rename to cpp/src/arrow/column-test.cc index 3b102e48c87..0630785630e 100644 --- a/cpp/src/arrow/table/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -15,18 +15,18 @@ // specific language governing permissions and limitations // under the License. 
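A note on the Array changes above: in this revision a set bit in the null bitmap marks a null slot (IsNull returns util::get_bit(null_bits_, i) whenever null_count_ > 0), and EqualsExact compares only the ceil(length/8) bitmap bytes actually in use. A minimal standalone sketch of that convention; bytes_for_bits and get_bit are illustrative stand-ins for the arrow::util helpers, assuming LSB-first bit order:

#include <cassert>
#include <cstdint>

// Standalone stand-ins for the arrow::util helpers, for illustration only.
static inline int64_t bytes_for_bits(int64_t bits) { return (bits + 7) / 8; }
static inline bool get_bit(const uint8_t* bits, int i) {
  return (bits[i / 8] >> (i % 8)) & 1;
}

int main() {
  // 10 slots; slots 2 and 9 are null (a set bit means null in this revision).
  uint8_t nulls[2] = {0, 0};
  nulls[2 / 8] |= 1 << (2 % 8);
  nulls[9 / 8] |= 1 << (9 % 8);

  assert(bytes_for_bits(10) == 2);  // EqualsExact compares exactly this many bytes
  assert(get_bit(nulls, 2) && get_bit(nulls, 9));
  assert(!get_bit(nulls, 0));
  return 0;
}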
-#include #include #include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/test-common.h" +#include "gtest/gtest.h" + +#include "arrow/column.h" +#include "arrow/schema.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" using std::shared_ptr; using std::vector; diff --git a/cpp/src/arrow/table/column.cc b/cpp/src/arrow/column.cc similarity index 96% rename from cpp/src/arrow/table/column.cc rename to cpp/src/arrow/column.cc index 573e6508759..46acf8df2ff 100644 --- a/cpp/src/arrow/table/column.cc +++ b/cpp/src/arrow/column.cc @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/table/column.h" +#include "arrow/column.h" #include #include +#include "arrow/array.h" #include "arrow/type.h" #include "arrow/util/status.h" @@ -28,6 +29,7 @@ namespace arrow { ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; + null_count_ = 0; for (const std::shared_ptr& chunk : chunks) { length_ += chunk->length(); null_count_ += chunk->null_count(); diff --git a/cpp/src/arrow/table/column.h b/cpp/src/arrow/column.h similarity index 93% rename from cpp/src/arrow/table/column.h rename to cpp/src/arrow/column.h index dfc7516e26a..1ad97b20863 100644 --- a/cpp/src/arrow/table/column.h +++ b/cpp/src/arrow/column.h @@ -15,19 +15,22 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TABLE_COLUMN_H -#define ARROW_TABLE_COLUMN_H +#ifndef ARROW_COLUMN_H +#define ARROW_COLUMN_H +#include #include #include #include -#include "arrow/array.h" #include "arrow/type.h" namespace arrow { -typedef std::vector > ArrayVector; +class Array; +class Status; + +typedef std::vector> ArrayVector; // A data structure managing a list of primitive Arrow arrays logically as one // large array @@ -102,4 +105,4 @@ class Column { } // namespace arrow -#endif // ARROW_TABLE_COLUMN_H +#endif // ARROW_COLUMN_H diff --git a/cpp/src/arrow/ipc/.gitignore b/cpp/src/arrow/ipc/.gitignore new file mode 100644 index 00000000000..8150d7efe33 --- /dev/null +++ b/cpp/src/arrow/ipc/.gitignore @@ -0,0 +1 @@ +*_generated.h \ No newline at end of file diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt new file mode 100644 index 00000000000..383684f42f9 --- /dev/null +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
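The ChunkedArray change above initializes null_count_ before accumulating; both totals are running sums over the chunks, so without that line the sum starts from an indeterminate value. A simplified, self-contained sketch of the same accumulation:

#include <cassert>
#include <cstdint>
#include <vector>

// Simplified stand-in for an Arrow array chunk: just the two counters
// that ChunkedArray aggregates. Illustration only.
struct FakeChunk {
  int32_t length;
  int32_t null_count;
};

struct FakeChunkedArray {
  explicit FakeChunkedArray(const std::vector<FakeChunk>& chunks) {
    length_ = 0;
    null_count_ = 0;  // without this, the running sum starts from garbage
    for (const FakeChunk& chunk : chunks) {
      length_ += chunk.length;
      null_count_ += chunk.null_count;
    }
  }
  int64_t length_;
  int64_t null_count_;
};

int main() {
  FakeChunkedArray arr({{100, 3}, {50, 0}, {25, 7}});
  assert(arr.length_ == 175);
  assert(arr.null_count_ == 10);
  return 0;
}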
+ +####################################### +# arrow_ipc +####################################### + +# Headers: top level +install(FILES + adapter.h + metadata.h + memory.h + DESTINATION include/arrow/ipc) + +ADD_ARROW_TEST(ipc-adapter-test) +ADD_ARROW_TEST(ipc-memory-test) +ADD_ARROW_TEST(ipc-metadata-test) + +# make clean will delete the generated file +set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) + +set(OUTPUT_DIR ${CMAKE_SOURCE_DIR}/src/arrow/ipc) +set(FBS_OUTPUT_FILES "${OUTPUT_DIR}/Message_generated.h") + +set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/Message.fbs) +get_filename_component(ABS_FBS_SRC ${FBS_SRC} ABSOLUTE) + +add_custom_command( + OUTPUT ${FBS_OUTPUT_FILES} + COMMAND ${FLATBUFFERS_COMPILER} -c -o ${OUTPUT_DIR} ${ABS_FBS_SRC} + DEPENDS ${ABS_FBS_SRC} + COMMENT "Running flatc compiler on ${FBS_SRC}" + VERBATIM +) + +add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) +add_dependencies(arrow metadata_fbs) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc new file mode 100644 index 00000000000..7cdb965f5f4 --- /dev/null +++ b/cpp/src/arrow/ipc/adapter.cc @@ -0,0 +1,305 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/ipc/adapter.h" + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/ipc/memory.h" +#include "arrow/ipc/Message_generated.h" +#include "arrow/ipc/metadata.h" +#include "arrow/ipc/metadata-internal.h" +#include "arrow/schema.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/types/construct.h" +#include "arrow/types/primitive.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +namespace ipc { + +static bool IsPrimitive(const DataType* type) { + switch (type->type) { + // NA is null type or "no type", considered primitive for now + case Type::NA: + case Type::BOOL: + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::UINT64: + case Type::INT64: + case Type::FLOAT: + case Type::DOUBLE: + return true; + default: + return false; + } +} + +// ---------------------------------------------------------------------- +// Row batch write path + +Status VisitArray(const Array* arr, std::vector* field_nodes, + std::vector>* buffers) { + if (IsPrimitive(arr->type().get())) { + const PrimitiveArray* prim_arr = static_cast(arr); + + field_nodes->push_back( + flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count())); + + if (prim_arr->null_count() > 0) { + buffers->push_back(prim_arr->nulls()); + } else { + // Push a dummy zero-length buffer, not to be copied + buffers->push_back(std::make_shared(nullptr, 0)); + } + buffers->push_back(prim_arr->data()); + } else if (arr->type_enum() == Type::LIST) { + // TODO(wesm) + return Status::NotImplemented("List type"); + } else if (arr->type_enum() == Type::STRUCT) { + // TODO(wesm) + return Status::NotImplemented("Struct type"); + } + + return Status::OK(); +} + +class RowBatchWriter { + public: + explicit RowBatchWriter(const RowBatch* batch) : + batch_(batch) {} + + Status AssemblePayload() { + // Perform depth-first traversal of the row-batch + for (int i = 0; i < batch_->num_columns(); ++i) { + const Array* arr = batch_->column(i).get(); + RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_)); + } + return Status::OK(); + } + + Status Write(MemorySource* dst, int64_t position, int64_t* data_header_offset) { + // Write out all the buffers contiguously and compute the total size of the + // memory payload + int64_t offset = 0; + for (size_t i = 0; i < buffers_.size(); ++i) { + const Buffer* buffer = buffers_[i].get(); + int64_t size = buffer->size(); + + // TODO(wesm): We currently have no notion of shared memory page id's, + // but we've included it in the metadata IDL for when we have it in the + // future. Use page=0 for now + // + // Note that page ids are a bespoke notion for Arrow and not a feature we + // are using from any OS-level shared memory. The thought is that systems + // may (in the future) associate integer page id's with physical memory + // pages (according to whatever is the desired shared memory mechanism) + buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size)); + + if (size > 0) { + RETURN_NOT_OK(dst->Write(position + offset, buffer->data(), size)); + offset += size; + } + } + + // Now that we have computed the locations of all of the buffers in shared + // memory, the data header can be converted to a flatbuffer and written out + // + // Note: The memory written here is prefixed by the size of the flatbuffer + // itself as an int32_t. 
On reading from a MemorySource, you will have to + // determine the data header size then request a buffer such that you can + // construct the flatbuffer data accessor object (see arrow::ipc::Message) + std::shared_ptr data_header; + RETURN_NOT_OK(WriteDataHeader(batch_->num_rows(), offset, + field_nodes_, buffer_meta_, &data_header)); + + // Write the data header at the end + RETURN_NOT_OK(dst->Write(position + offset, data_header->data(), + data_header->size())); + + *data_header_offset = position + offset; + return Status::OK(); + } + + // This must be called after invoking AssemblePayload + int64_t DataHeaderSize() { + // TODO(wesm): In case it is needed, compute the upper bound for the size + // of the buffer containing the flatbuffer data header. + return 0; + } + + // Total footprint of buffers. This must be called after invoking + // AssemblePayload + int64_t TotalBytes() { + int64_t total = 0; + for (const std::shared_ptr& buffer : buffers_) { + total += buffer->size(); + } + return total; + } + + private: + const RowBatch* batch_; + + std::vector field_nodes_; + std::vector buffer_meta_; + std::vector> buffers_; +}; + +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset) { + RowBatchWriter serializer(batch); + RETURN_NOT_OK(serializer.AssemblePayload()); + return serializer.Write(dst, position, header_offset); +} +// ---------------------------------------------------------------------- +// Row batch read path + +static constexpr int64_t INIT_METADATA_SIZE = 4096; + +class RowBatchReader::Impl { + public: + Impl(MemorySource* source, const std::shared_ptr& metadata) : + source_(source), + metadata_(metadata) { + num_buffers_ = metadata->num_buffers(); + num_flattened_fields_ = metadata->num_fields(); + } + + Status AssembleBatch(const std::shared_ptr& schema, + std::shared_ptr* out) { + std::vector> arrays(schema->num_fields()); + + // The field_index and buffer_index are incremented in NextArray based on + // how much of the batch is "consumed" (through nested data reconstruction, + // for example) + field_index_ = 0; + buffer_index_ = 0; + for (int i = 0; i < schema->num_fields(); ++i) { + const Field* field = schema->field(i).get(); + RETURN_NOT_OK(NextArray(field, &arrays[i])); + } + + *out = std::make_shared(schema, metadata_->length(), + arrays); + return Status::OK(); + } + + private: + // Traverse the flattened record batch metadata and reassemble the + // corresponding array containers + Status NextArray(const Field* field, std::shared_ptr* out) { + const std::shared_ptr& type = field->type; + + // pop off a field + if (field_index_ >= num_flattened_fields_) { + return Status::Invalid("Ran out of field metadata, likely malformed"); + } + + // This only contains the length and null count, which we need to figure + // out what to do with the buffers. 
For example, if null_count == 0, then + // we can skip that buffer without reading from shared memory + FieldMetadata field_meta = metadata_->field(field_index_++); + + if (IsPrimitive(type.get())) { + std::shared_ptr nulls; + std::shared_ptr data; + if (field_meta.null_count == 0) { + nulls = nullptr; + ++buffer_index_; + } else { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &nulls)); + } + if (field_meta.length > 0) { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); + } else { + data.reset(new Buffer(nullptr, 0)); + } + return MakePrimitiveArray(type, field_meta.length, data, + field_meta.null_count, nulls, out); + } else { + return Status::NotImplemented("Non-primitive types not complete yet"); + } + } + + Status GetBuffer(int buffer_index, std::shared_ptr* out) { + BufferMetadata metadata = metadata_->buffer(buffer_index); + return source_->ReadAt(metadata.offset, metadata.length, out); + } + + MemorySource* source_; + std::shared_ptr metadata_; + + int field_index_; + int buffer_index_; + int num_buffers_; + int num_flattened_fields_; +}; + +Status RowBatchReader::Open(MemorySource* source, int64_t position, + std::shared_ptr* out) { + std::shared_ptr metadata; + RETURN_NOT_OK(source->ReadAt(position, INIT_METADATA_SIZE, &metadata)); + + int32_t metadata_size = *reinterpret_cast(metadata->data()); + + // We may not need to call source->ReadAt again + if (metadata_size > static_cast(INIT_METADATA_SIZE - sizeof(int32_t))) { + // We don't have enough data, read the indicated metadata size. + RETURN_NOT_OK(source->ReadAt(position + sizeof(int32_t), + metadata_size, &metadata)); + } + + // TODO(wesm): buffer slicing here would be better in case ReadAt returns + // allocated memory + + std::shared_ptr message; + RETURN_NOT_OK(Message::Open(metadata, &message)); + + if (message->type() != Message::RECORD_BATCH) { + return Status::Invalid("Metadata message is not a record batch"); + } + + std::shared_ptr batch_meta = message->GetRecordBatch(); + + std::shared_ptr result(new RowBatchReader()); + result->impl_.reset(new Impl(source, batch_meta)); + *out = result; + + return Status::OK(); +} + +Status RowBatchReader::GetRowBatch(const std::shared_ptr& schema, + std::shared_ptr* out) { + return impl_->AssembleBatch(schema, out); +} + + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h new file mode 100644 index 00000000000..26dea6d04b8 --- /dev/null +++ b/cpp/src/arrow/ipc/adapter.h @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
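RowBatchReader::Open above reads a fixed INIT_METADATA_SIZE window first and issues a second read only when the int32_t size prefix says the flatbuffer is larger than what remains of that window. A standalone sketch of the two-step framing logic, with a plain byte vector standing in for the MemorySource:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy stand-in for MemorySource::ReadAt: copy up to nbytes from position.
static std::vector<uint8_t> ReadAt(const std::vector<uint8_t>& src,
                                   size_t position, size_t nbytes) {
  nbytes = std::min(nbytes, src.size() - position);
  return std::vector<uint8_t>(src.begin() + position,
                              src.begin() + position + nbytes);
}

int main() {
  const size_t kInitMetadataSize = 16;  // stands in for INIT_METADATA_SIZE

  // Frame layout used by the adapter: int32 size prefix, then the payload.
  std::vector<uint8_t> payload(40, 0xAB);  // larger than the initial window
  int32_t size = static_cast<int32_t>(payload.size());
  std::vector<uint8_t> source(sizeof(int32_t) + payload.size());
  std::memcpy(source.data(), &size, sizeof(int32_t));
  std::memcpy(source.data() + sizeof(int32_t), payload.data(), payload.size());

  // Step 1: speculative fixed-size read.
  std::vector<uint8_t> metadata = ReadAt(source, 0, kInitMetadataSize);
  int32_t metadata_size = 0;
  std::memcpy(&metadata_size, metadata.data(), sizeof(int32_t));

  // Step 2: if the prefix says we came up short, one exact re-read suffices.
  if (metadata_size > static_cast<int32_t>(kInitMetadataSize - sizeof(int32_t))) {
    metadata = ReadAt(source, sizeof(int32_t), metadata_size);
  }
  assert(metadata.size() == payload.size());
  return 0;
}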
+ +// Public API for writing and accessing (with zero copy, if possible) Arrow +// data in shared memory + +#ifndef ARROW_IPC_ADAPTER_H +#define ARROW_IPC_ADAPTER_H + +#include +#include + +namespace arrow { + +class Array; +class RowBatch; +class Schema; +class Status; + +namespace ipc { + +class MemorySource; +class RecordBatchMessage; + +// ---------------------------------------------------------------------- +// Write path + +// Write the RowBatch (collection of equal-length Arrow arrays) to the memory +// source at the indicated position +// +// First, each of the memory buffers are written out end-to-end in starting at +// the indicated position. +// +// Then, this function writes the batch metadata as a flatbuffer (see +// format/Message.fbs -- the RecordBatch message type) like so: +// +// +// +// Finally, the memory offset to the start of the metadata / data header is +// returned in an out-variable +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset); + +// int64_t GetRowBatchMetadata(const RowBatch* batch); + +// Compute the precise number of bytes needed in a contiguous memory segment to +// write the row batch. This involves generating the complete serialized +// Flatbuffers metadata. +int64_t GetRowBatchSize(const RowBatch* batch); + +// ---------------------------------------------------------------------- +// "Read" path; does not copy data if the MemorySource does not + +class RowBatchReader { + public: + static Status Open(MemorySource* source, int64_t position, + std::shared_ptr* out); + + // Reassemble the row batch. A Schema is required to be able to construct the + // right array containers + Status GetRowBatch(const std::shared_ptr& schema, + std::shared_ptr* out); + + private: + class Impl; + std::unique_ptr impl_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_MEMORY_H diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc new file mode 100644 index 00000000000..d75998f0a5d --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -0,0 +1,112 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
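The write path documented above lays the buffers out end-to-end from the starting position and records one (page, offset, length) triple per buffer; zero-length placeholders get a metadata entry but advance the offset by nothing. A simplified sketch of that bookkeeping, where BufferMeta is a hypothetical stand-in for flatbuf::Buffer:

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for the flatbuf::Buffer triple the adapter records.
struct BufferMeta {
  int32_t page;    // always 0 for now, per the comment in adapter.cc
  int64_t offset;  // absolute position in the memory source
  int64_t length;
};

int main() {
  const int64_t position = 128;  // where the batch payload starts
  std::vector<int64_t> buffer_sizes = {64, 0, 256};  // 0: dummy null bitmap

  std::vector<BufferMeta> metadata;
  int64_t offset = 0;
  for (int64_t size : buffer_sizes) {
    metadata.push_back({0, position + offset, size});
    if (size > 0) {
      // In the real writer this is where dst->Write(...) copies the bytes.
      offset += size;
    }
  }

  assert(metadata[1].length == 0);              // recorded, but no bytes written
  assert(metadata[2].offset == position + 64);  // the empty buffer advanced nothing
  assert(offset == 320);                        // total payload footprint
  return 0;
}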
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/ipc/adapter.h" +#include "arrow/ipc/memory.h" +#include "arrow/ipc/test-common.h" + +#include "arrow/test-util.h" +#include "arrow/types/primitive.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { + public: + void SetUp() { + pool_ = default_memory_pool(); + } + void TearDown() { + MemoryMapFixture::TearDown(); + } + + void InitMemoryMap(int64_t size) { + std::string path = "test-write-row-batch"; + MemoryMapFixture::CreateFile(path, size); + ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_)); + } + + protected: + MemoryPool* pool_; + std::shared_ptr mmap_; +}; + +const auto INT32 = std::make_shared(); + +TEST_F(TestWriteRowBatch, IntegerRoundTrip) { + const int length = 1000; + + // Make the schema + auto f0 = std::make_shared("f0", INT32); + auto f1 = std::make_shared("f1", INT32); + std::shared_ptr schema(new Schema({f0, f1})); + + // Example data + + auto data = std::make_shared(pool_); + ASSERT_OK(data->Resize(length * sizeof(int32_t))); + test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), + reinterpret_cast(data->mutable_data())); + + auto nulls = std::make_shared(pool_); + int null_bytes = util::bytes_for_bits(length); + ASSERT_OK(nulls->Resize(null_bytes)); + test::random_bytes(null_bytes, 0, nulls->mutable_data()); + + auto a0 = std::make_shared(length, data); + auto a1 = std::make_shared(length, data, + test::bitmap_popcount(nulls->data(), length), nulls); + + RowBatch batch(schema, length, {a0, a1}); + + // TODO(wesm): computing memory requirements for a row batch + // 64k is plenty of space + InitMemoryMap(1 << 16); + + int64_t header_location; + ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); + + std::shared_ptr result; + ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result)); + + std::shared_ptr batch_result; + ASSERT_OK(result->GetRowBatch(schema, &batch_result)); + EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); + + for (int i = 0; i < batch.num_columns(); ++i) { + EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) + << i << batch.column_name(i); + } +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-memory-test.cc b/cpp/src/arrow/ipc/ipc-memory-test.cc new file mode 100644 index 00000000000..332ad2a2b80 --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-memory-test.cc @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
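The round-trip test above derives a1's null count from the random bitmap with test::bitmap_popcount. A self-contained illustration of that computation, counting set bits among the first length bits; the helper name is the test's, and this implementation is only assumed to match it:

#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative reimplementation: count set bits among the first nbits bits.
static int64_t bitmap_popcount(const uint8_t* bits, int64_t nbits) {
  int64_t count = 0;
  for (int64_t i = 0; i < nbits; ++i) {
    if ((bits[i / 8] >> (i % 8)) & 1) ++count;
  }
  return count;
}

int main() {
  std::vector<uint8_t> nulls = {0x0F, 0x01};  // bits 0-3 and bit 8 set
  assert(bitmap_popcount(nulls.data(), 16) == 5);
  assert(bitmap_popcount(nulls.data(), 4) == 4);  // only the first nibble counted
  return 0;
}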
+ +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/ipc/memory.h" +#include "arrow/ipc/test-common.h" +#include "arrow/test-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +class TestMemoryMappedSource : public ::testing::Test, public MemoryMapFixture { + public: + void TearDown() { + MemoryMapFixture::TearDown(); + } +}; + +TEST_F(TestMemoryMappedSource, InvalidUsages) { +} + +TEST_F(TestMemoryMappedSource, WriteRead) { + const int64_t buffer_size = 1024; + std::vector buffer(buffer_size); + + test::random_bytes(1024, 0, buffer.data()); + + const int reps = 5; + + std::string path = "ipc-write-read-test"; + CreateFile(path, reps * buffer_size); + + std::shared_ptr result; + ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &result)); + + int64_t position = 0; + + std::shared_ptr out_buffer; + for (int i = 0; i < reps; ++i) { + ASSERT_OK(result->Write(position, buffer.data(), buffer_size)); + ASSERT_OK(result->ReadAt(position, buffer_size, &out_buffer)); + + ASSERT_EQ(0, memcmp(out_buffer->data(), buffer.data(), buffer_size)); + + position += buffer_size; + } +} + +TEST_F(TestMemoryMappedSource, InvalidFile) { + std::string non_existent_path = "invalid-file-name-asfd"; + + std::shared_ptr result; + ASSERT_RAISES(IOError, MemoryMappedSource::Open(non_existent_path, + MemorySource::READ_ONLY, &result)); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-metadata-test.cc b/cpp/src/arrow/ipc/ipc-metadata-test.cc new file mode 100644 index 00000000000..ceabec0fa7c --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-metadata-test.cc @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
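MemoryMappedSource, implemented in memory.cc further down, rests on fopen plus mmap with MAP_SHARED. A minimal POSIX sketch of the same write-through-a-mapping round trip, independent of the Arrow classes; note that mmap reports failure as MAP_FAILED rather than nullptr, a distinction the null check in Impl::Open below appears to miss:

#include <cassert>
#include <cstdio>
#include <cstring>
#include <sys/mman.h>
#include <unistd.h>

int main() {
  const size_t size = 1024;
  FILE* file = fopen("mmap-sketch-test", "w+b");
  if (file == nullptr) return 1;
  if (ftruncate(fileno(file), size) != 0) return 1;  // size the backing file

  void* data = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                    fileno(file), 0);
  if (data == MAP_FAILED) return 1;  // MAP_FAILED, not nullptr, signals failure

  // Writes through the mapping are visible to subsequent reads of the mapping.
  std::memcpy(data, "arrow", 5);
  assert(std::memcmp(data, "arrow", 5) == 0);

  munmap(data, size);
  fclose(file);
  std::remove("mmap-sketch-test");
  return 0;
}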
+ +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/ipc/metadata.h" +#include "arrow/schema.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/util/status.h" + +namespace arrow { + +class Buffer; + +static inline void assert_schema_equal(const Schema* lhs, const Schema* rhs) { + if (!lhs->Equals(*rhs)) { + std::stringstream ss; + ss << "left schema: " << lhs->ToString() << std::endl + << "right schema: " << rhs->ToString() << std::endl; + FAIL() << ss.str(); + } +} + +class TestSchemaMessage : public ::testing::Test { + public: + void SetUp() {} + + void CheckRoundtrip(const Schema* schema) { + std::shared_ptr buffer; + ASSERT_OK(ipc::WriteSchema(schema, &buffer)); + + std::shared_ptr message; + ASSERT_OK(ipc::Message::Open(buffer, &message)); + + ASSERT_EQ(ipc::Message::SCHEMA, message->type()); + + std::shared_ptr schema_msg = message->GetSchema(); + ASSERT_EQ(schema->num_fields(), schema_msg->num_fields()); + + std::shared_ptr schema2; + ASSERT_OK(schema_msg->GetSchema(&schema2)); + + assert_schema_equal(schema, schema2.get()); + } +}; + +const std::shared_ptr INT32 = std::make_shared(); + +TEST_F(TestSchemaMessage, PrimitiveFields) { + auto f0 = std::make_shared("f0", std::make_shared()); + auto f1 = std::make_shared("f1", std::make_shared()); + auto f2 = std::make_shared("f2", std::make_shared()); + auto f3 = std::make_shared("f3", std::make_shared()); + auto f4 = std::make_shared("f4", std::make_shared()); + auto f5 = std::make_shared("f5", std::make_shared()); + auto f6 = std::make_shared("f6", std::make_shared()); + auto f7 = std::make_shared("f7", std::make_shared()); + auto f8 = std::make_shared("f8", std::make_shared()); + auto f9 = std::make_shared("f9", std::make_shared()); + auto f10 = std::make_shared("f10", std::make_shared()); + + Schema schema({f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10}); + CheckRoundtrip(&schema); +} + +TEST_F(TestSchemaMessage, NestedFields) { + auto type = std::make_shared(std::make_shared()); + auto f0 = std::make_shared("f0", type); + + std::shared_ptr type2(new StructType({ + std::make_shared("k1", INT32), + std::make_shared("k2", INT32), + std::make_shared("k3", INT32)})); + auto f1 = std::make_shared("f1", type2); + + Schema schema({f0, f1}); + CheckRoundtrip(&schema); +} + +} // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc new file mode 100644 index 00000000000..e630ccd109b --- /dev/null +++ b/cpp/src/arrow/ipc/memory.cc @@ -0,0 +1,162 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/ipc/memory.h" + +#include // For memory-mapping +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +MemorySource::MemorySource(AccessMode access_mode) : + access_mode_(access_mode) {} + +MemorySource::~MemorySource() {} + +// Implement MemoryMappedSource + +class MemoryMappedSource::Impl { + public: + Impl() : + file_(nullptr), + is_open_(false), + data_(nullptr) {} + + ~Impl() { + if (is_open_) { + munmap(data_, size_); + fclose(file_); + } + } + + Status Open(const std::string& path, MemorySource::AccessMode mode) { + if (is_open_) { + return Status::IOError("A file is already open"); + } + + path_ = path; + + if (mode == MemorySource::READ_WRITE) { + file_ = fopen(path.c_str(), "r+b"); + } else { + file_ = fopen(path.c_str(), "rb"); + } + if (file_ == nullptr) { + std::stringstream ss; + ss << "Unable to open file, errno: " << errno; + return Status::IOError(ss.str()); + } + + fseek(file_, 0L, SEEK_END); + if (ferror(file_)) { + return Status::IOError("Unable to seek to end of file"); + } + size_ = ftell(file_); + + fseek(file_, 0L, SEEK_SET); + is_open_ = true; + + // TODO(wesm): Add read-only version of this + data_ = reinterpret_cast(mmap(nullptr, size_, + PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(file_), 0)); + if (data_ == nullptr) { + std::stringstream ss; + ss << "Memory mapping file failed, errno: " << errno; + return Status::IOError(ss.str()); + } + + return Status::OK(); + } + + int64_t size() const { + return size_; + } + + uint8_t* data() { + return data_; + } + + private: + std::string path_; + FILE* file_; + int64_t size_; + bool is_open_; + + // The memory map + uint8_t* data_; +}; + +MemoryMappedSource::MemoryMappedSource(AccessMode access_mode) : + MemorySource(access_mode) {} + +Status MemoryMappedSource::Open(const std::string& path, AccessMode access_mode, + std::shared_ptr* out) { + std::shared_ptr result(new MemoryMappedSource(access_mode)); + + result->impl_.reset(new Impl()); + RETURN_NOT_OK(result->impl_->Open(path, access_mode)); + + *out = result; + return Status::OK(); +} + +int64_t MemoryMappedSource::Size() const { + return impl_->size(); +} + +Status MemoryMappedSource::Close() { + // munmap handled in ::Impl dtor + return Status::OK(); +} + +Status MemoryMappedSource::ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) { + if (position < 0 || position >= impl_->size()) { + return Status::Invalid("position is out of bounds"); + } + + nbytes = std::min(nbytes, impl_->size() - position); + *out = std::make_shared(impl_->data() + position, nbytes); + return Status::OK(); +} + +Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, + int64_t nbytes) { + if (position < 0 || position >= impl_->size()) { + return Status::Invalid("position is out of bounds"); + } + + // TODO(wesm): verify we are not writing past the end of the buffer + uint8_t* dst = impl_->data() + position; + memcpy(dst, data, nbytes); + + return Status::OK(); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h new file mode 100644 index 00000000000..0b4d8347c34 --- /dev/null +++ b/cpp/src/arrow/ipc/memory.h @@ -0,0 +1,131 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Public API for different interprocess memory sharing mechanisms + +#ifndef ARROW_IPC_MEMORY_H +#define ARROW_IPC_MEMORY_H + +#include +#include +#include + +#include "arrow/util/macros.h" + +namespace arrow { + +class Buffer; +class MutableBuffer; +class Status; + +namespace ipc { + +// Abstract output stream +class OutputStream { + public: + virtual ~OutputStream() {} + // Close the output stream + virtual Status Close() = 0; + + // The current position in the output stream + virtual int64_t Tell() const = 0; + + // Write bytes to the stream + virtual Status Write(const uint8_t* data, int64_t length) = 0; +}; + +// An output stream that writes to a MutableBuffer, such as one obtained from a +// memory map +class BufferOutputStream : public OutputStream { + public: + explicit BufferOutputStream(const std::shared_ptr& buffer): + buffer_(buffer) {} + + // Implement the OutputStream interface + Status Close() override; + int64_t Tell() const override; + Status Write(const uint8_t* data, int64_t length) override; + + // Returns the number of bytes remaining in the buffer + int64_t bytes_remaining() const; + + private: + std::shared_ptr buffer_; + int64_t capacity_; + int64_t position_; +}; + +class MemorySource { + public: + // Indicates the access permissions of the memory source + enum AccessMode { + READ_ONLY, + READ_WRITE + }; + + virtual ~MemorySource(); + + // Retrieve a buffer of memory from the source of the indicates size and at + // the indicated location + // @returns: arrow::Status indicating success / failure. 
The buffer is set + // into the *out argument + virtual Status ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) = 0; + + virtual Status Close() = 0; + + virtual Status Write(int64_t position, const uint8_t* data, int64_t nbytes) = 0; + + // @return: the size in bytes of the memory source + virtual int64_t Size() const = 0; + + protected: + explicit MemorySource(AccessMode access_mode = AccessMode::READ_WRITE); + + AccessMode access_mode_; + + private: + DISALLOW_COPY_AND_ASSIGN(MemorySource); +}; + +// A memory source that uses memory-mapped files for memory interactions +class MemoryMappedSource : public MemorySource { + public: + static Status Open(const std::string& path, AccessMode access_mode, + std::shared_ptr* out); + + Status Close() override; + + Status ReadAt(int64_t position, int64_t nbytes, + std::shared_ptr* out) override; + + Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; + + // @return: the size in bytes of the memory source + int64_t Size() const override; + + private: + explicit MemoryMappedSource(AccessMode access_mode); + // Hide the internal details of this class for now + class Impl; + std::unique_ptr impl_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_MEMORY_H diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc new file mode 100644 index 00000000000..14b186906c3 --- /dev/null +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -0,0 +1,317 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
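MemorySource above is a small position-addressed read/write interface, and the memory map is only one way to satisfy it. A simplified standalone analogue backed by heap memory, with bools in place of Status and byte vectors in place of Buffer; the ReadAt clamp mirrors MemoryMappedSource:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Simplified, standalone analogue of a read-write MemorySource backed by
// heap memory. Illustration only; the real interface traffics in Status/Buffer.
class HeapSource {
 public:
  explicit HeapSource(int64_t size) : data_(size, 0) {}

  bool ReadAt(int64_t position, int64_t nbytes, std::vector<uint8_t>* out) const {
    if (position < 0 || position >= Size()) return false;
    nbytes = std::min(nbytes, Size() - position);  // clamp like MemoryMappedSource
    out->assign(data_.begin() + position, data_.begin() + position + nbytes);
    return true;
  }

  bool Write(int64_t position, const uint8_t* data, int64_t nbytes) {
    if (position < 0 || position + nbytes > Size()) return false;
    std::memcpy(data_.data() + position, data, nbytes);
    return true;
  }

  int64_t Size() const { return static_cast<int64_t>(data_.size()); }

 private:
  std::vector<uint8_t> data_;
};

int main() {
  HeapSource source(64);
  const uint8_t payload[4] = {1, 2, 3, 4};
  assert(source.Write(8, payload, 4));

  std::vector<uint8_t> out;
  assert(source.ReadAt(8, 4, &out));
  assert(out.size() == 4 && out[3] == 4);
  return 0;
}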
+ +#include "arrow/ipc/metadata-internal.h" + +#include +#include +#include +#include +#include +#include + +#include "arrow/ipc/Message_generated.h" +#include "arrow/schema.h" +#include "arrow/type.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +typedef flatbuffers::FlatBufferBuilder FBB; +typedef flatbuffers::Offset FieldOffset; +typedef flatbuffers::Offset Offset; + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +namespace ipc { + +const std::shared_ptr BOOL = std::make_shared(); +const std::shared_ptr INT8 = std::make_shared(); +const std::shared_ptr INT16 = std::make_shared(); +const std::shared_ptr INT32 = std::make_shared(); +const std::shared_ptr INT64 = std::make_shared(); +const std::shared_ptr UINT8 = std::make_shared(); +const std::shared_ptr UINT16 = std::make_shared(); +const std::shared_ptr UINT32 = std::make_shared(); +const std::shared_ptr UINT64 = std::make_shared(); +const std::shared_ptr FLOAT = std::make_shared(); +const std::shared_ptr DOUBLE = std::make_shared(); + +static Status IntFromFlatbuffer(const flatbuf::Int* int_data, + std::shared_ptr* out) { + if (int_data->bitWidth() % 8 != 0) { + return Status::NotImplemented("Integers not in cstdint are not implemented"); + } else if (int_data->bitWidth() > 64) { + return Status::NotImplemented("Integers with more than 64 bits not implemented"); + } + + switch (int_data->bitWidth()) { + case 8: + *out = int_data->is_signed() ? INT8 : UINT8; + break; + case 16: + *out = int_data->is_signed() ? INT16 : UINT16; + break; + case 32: + *out = int_data->is_signed() ? INT32 : UINT32; + break; + case 64: + *out = int_data->is_signed() ? INT64 : UINT64; + break; + default: + *out = nullptr; + break; + } + return Status::OK(); +} + +static Status FloatFromFlatuffer(const flatbuf::FloatingPoint* float_data, + std::shared_ptr* out) { + if (float_data->precision() == flatbuf::Precision_SINGLE) { + *out = FLOAT; + } else { + *out = DOUBLE; + } + return Status::OK(); +} + +static Status TypeFromFlatbuffer(flatbuf::Type type, + const void* type_data, const std::vector>& children, + std::shared_ptr* out) { + switch (type) { + case flatbuf::Type_NONE: + return Status::Invalid("Type metadata cannot be none"); + case flatbuf::Type_Int: + return IntFromFlatbuffer(static_cast(type_data), out); + case flatbuf::Type_Bit: + return Status::NotImplemented("Type is not implemented"); + case flatbuf::Type_FloatingPoint: + return FloatFromFlatuffer(static_cast(type_data), + out); + case flatbuf::Type_Binary: + case flatbuf::Type_Utf8: + return Status::NotImplemented("Type is not implemented"); + case flatbuf::Type_Bool: + *out = BOOL; + return Status::OK(); + case flatbuf::Type_Decimal: + case flatbuf::Type_Timestamp: + case flatbuf::Type_List: + if (children.size() != 1) { + return Status::Invalid("List must have exactly 1 child field"); + } + *out = std::make_shared(children[0]); + return Status::OK(); + case flatbuf::Type_Tuple: + *out = std::make_shared(children); + return Status::OK(); + case flatbuf::Type_Union: + return Status::NotImplemented("Type is not implemented"); + default: + return Status::Invalid("Unrecognized type"); + } +} + +// Forward declaration +static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, + FieldOffset* offset); + +static Offset IntToFlatbuffer(FBB& fbb, int bitWidth, + bool is_signed) { + return flatbuf::CreateInt(fbb, bitWidth, is_signed).Union(); +} + +static Offset FloatToFlatbuffer(FBB& fbb, + flatbuf::Precision precision) { + return 
flatbuf::CreateFloatingPoint(fbb, precision).Union(); +} + +static Status ListToFlatbuffer(FBB& fbb, const std::shared_ptr& type, + std::vector* out_children, Offset* offset) { + FieldOffset field; + RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(0), &field)); + out_children->push_back(field); + *offset = flatbuf::CreateList(fbb).Union(); + return Status::OK(); +} + +static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type, + std::vector* out_children, Offset* offset) { + FieldOffset field; + for (int i = 0; i < type->num_children(); ++i) { + RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(i), &field)); + out_children->push_back(field); + } + *offset = flatbuf::CreateTuple(fbb).Union(); + return Status::OK(); +} + +#define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \ + *out_type = flatbuf::Type_Int; \ + *offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \ + break; + + +static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, + std::vector* children, + flatbuf::Type* out_type, Offset* offset) { + switch (type->type) { + case Type::BOOL: + *out_type = flatbuf::Type_Bool; + *offset = flatbuf::CreateBool(fbb).Union(); + break; + case Type::UINT8: + INT_TO_FB_CASE(8, false); + case Type::INT8: + INT_TO_FB_CASE(8, true); + case Type::UINT16: + INT_TO_FB_CASE(16, false); + case Type::INT16: + INT_TO_FB_CASE(16, true); + case Type::UINT32: + INT_TO_FB_CASE(32, false); + case Type::INT32: + INT_TO_FB_CASE(32, true); + case Type::UINT64: + INT_TO_FB_CASE(64, false); + case Type::INT64: + INT_TO_FB_CASE(64, true); + case Type::FLOAT: + *out_type = flatbuf::Type_FloatingPoint; + *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_SINGLE); + break; + case Type::DOUBLE: + *out_type = flatbuf::Type_FloatingPoint; + *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_DOUBLE); + break; + case Type::LIST: + *out_type = flatbuf::Type_List; + return ListToFlatbuffer(fbb, type, children, offset); + case Type::STRUCT: + *out_type = flatbuf::Type_Tuple; + return StructToFlatbuffer(fbb, type, children, offset); + default: + std::stringstream ss; + ss << "Unable to convert type: " << type->ToString() + << std::endl; + return Status::NotImplemented(ss.str()); + } + return Status::OK(); +} + +static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, + FieldOffset* offset) { + auto fb_name = fbb.CreateString(field->name); + + flatbuf::Type type_enum; + Offset type_data; + std::vector children; + + RETURN_NOT_OK(TypeToFlatbuffer(fbb, field->type, &children, &type_enum, &type_data)); + auto fb_children = fbb.CreateVector(children); + + *offset = flatbuf::CreateField(fbb, fb_name, field->nullable, type_enum, + type_data, fb_children); + + return Status::OK(); +} + +Status FieldFromFlatbuffer(const flatbuf::Field* field, + std::shared_ptr* out) { + std::shared_ptr type; + + auto children = field->children(); + std::vector> child_fields(children->size()); + for (size_t i = 0; i < children->size(); ++i) { + RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), &child_fields[i])); + } + + RETURN_NOT_OK(TypeFromFlatbuffer(field->type_type(), + field->type(), child_fields, &type)); + + *out = std::make_shared(field->name()->str(), type); + return Status::OK(); +} + +// Implement MessageBuilder + +Status MessageBuilder::SetSchema(const Schema* schema) { + header_type_ = flatbuf::MessageHeader_Schema; + + std::vector field_offsets; + for (int i = 0; i < schema->num_fields(); ++i) { + const std::shared_ptr& field = schema->field(i); + FieldOffset offset; + 
RETURN_NOT_OK(FieldToFlatbuffer(fbb_, field, &offset)); + field_offsets.push_back(offset); + } + + header_ = flatbuf::CreateSchema(fbb_, fbb_.CreateVector(field_offsets)).Union(); + body_length_ = 0; + return Status::OK(); +} + +Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers) { + header_type_ = flatbuf::MessageHeader_RecordBatch; + header_ = flatbuf::CreateRecordBatch(fbb_, length, + fbb_.CreateVectorOfStructs(nodes), + fbb_.CreateVectorOfStructs(buffers)).Union(); + body_length_ = body_length; + + return Status::OK(); +} + + +Status WriteDataHeader(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out) { + MessageBuilder message; + RETURN_NOT_OK(message.SetRecordBatch(length, body_length, nodes, buffers)); + RETURN_NOT_OK(message.Finish()); + return message.GetBuffer(out); +} + +Status MessageBuilder::Finish() { + auto message = flatbuf::CreateMessage(fbb_, header_type_, header_, + body_length_); + fbb_.Finish(message); + return Status::OK(); +} + +Status MessageBuilder::GetBuffer(std::shared_ptr* out) { + // The message buffer is prefixed by the size of the complete flatbuffer as + // int32_t + // + int32_t size = fbb_.GetSize(); + + auto result = std::make_shared(); + RETURN_NOT_OK(result->Resize(size + sizeof(int32_t))); + + uint8_t* dst = result->mutable_data(); + memcpy(dst, reinterpret_cast(&size), sizeof(int32_t)); + memcpy(dst + sizeof(int32_t), fbb_.GetBufferPointer(), size); + + *out = result; + return Status::OK(); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h new file mode 100644 index 00000000000..f7365d2a49f --- /dev/null +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -0,0 +1,69 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
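IntFromFlatbuffer above folds the flatbuffer's (bitWidth, is_signed) pair into one of the eight stdint-backed integer types, rejecting widths that are not whole bytes or that exceed 64 bits. A standalone sketch of the same dispatch, with strings standing in for the shared type singletons:

#include <cassert>
#include <string>

// Standalone analogue of IntFromFlatbuffer's dispatch: strings stand in for
// the INT8/UINT8/... singletons. Returns empty for unsupported widths.
static std::string IntTypeName(int bit_width, bool is_signed) {
  if (bit_width % 8 != 0 || bit_width > 64) return "";  // the NotImplemented cases
  switch (bit_width) {
    case 8:  return is_signed ? "int8"  : "uint8";
    case 16: return is_signed ? "int16" : "uint16";
    case 32: return is_signed ? "int32" : "uint32";
    case 64: return is_signed ? "int64" : "uint64";
    default: return "";  // e.g. 24/40/48/56-bit widths fall through here
  }
}

int main() {
  assert(IntTypeName(32, true) == "int32");
  assert(IntTypeName(16, false) == "uint16");
  assert(IntTypeName(24, true).empty());    // whole bytes, but not a cstdint width
  assert(IntTypeName(128, false).empty());  // wider than 64 bits
  return 0;
}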
+ +#ifndef ARROW_IPC_METADATA_INTERNAL_H +#define ARROW_IPC_METADATA_INTERNAL_H + +#include +#include +#include +#include + +#include "arrow/ipc/Message_generated.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +class Buffer; +struct Field; +class Schema; +class Status; + +namespace ipc { + +Status FieldFromFlatbuffer(const flatbuf::Field* field, + std::shared_ptr* out); + +class MessageBuilder { + public: + Status SetSchema(const Schema* schema); + + Status SetRecordBatch(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers); + + Status Finish(); + + Status GetBuffer(std::shared_ptr* out); + + private: + flatbuf::MessageHeader header_type_; + flatbuffers::Offset header_; + int64_t body_length_; + flatbuffers::FlatBufferBuilder fbb_; +}; + +Status WriteDataHeader(int32_t length, int64_t body_length, + const std::vector& nodes, + const std::vector& buffers, + std::shared_ptr* out); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_METADATA_INTERNAL_H diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc new file mode 100644 index 00000000000..642f21a41e6 --- /dev/null +++ b/cpp/src/arrow/ipc/metadata.cc @@ -0,0 +1,238 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
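MessageBuilder::GetBuffer, declared above and defined in metadata-internal.cc, frames the finished flatbuffer with its size as a leading int32_t; Message::Open and RowBatchReader peel that prefix off on the way back in. A standalone sketch of the framing, with a byte vector in place of PoolBuffer:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Frame a payload the way MessageBuilder::GetBuffer does: int32 size, then body.
static std::vector<uint8_t> FrameMessage(const uint8_t* body, int32_t size) {
  std::vector<uint8_t> out(sizeof(int32_t) + size);
  std::memcpy(out.data(), &size, sizeof(int32_t));
  std::memcpy(out.data() + sizeof(int32_t), body, size);
  return out;
}

int main() {
  const uint8_t fake_flatbuffer[8] = {9, 9, 9, 9, 9, 9, 9, 9};
  std::vector<uint8_t> framed = FrameMessage(fake_flatbuffer, sizeof(fake_flatbuffer));

  // Consumer side: read the prefix, then hand the bytes after it to the
  // flatbuffer accessor (what Message::Open does with GetMessage(fb_head)).
  int32_t size = 0;
  std::memcpy(&size, framed.data(), sizeof(int32_t));
  assert(size == 8);
  assert(framed.size() == sizeof(int32_t) + size);
  return 0;
}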
+ +#include "arrow/ipc/metadata.h" + +#include +#include +#include +#include + +// Generated C++ flatbuffer IDL +#include "arrow/ipc/Message_generated.h" +#include "arrow/ipc/metadata-internal.h" + +#include "arrow/schema.h" +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { + +namespace flatbuf = apache::arrow::flatbuf; + +namespace ipc { + +Status WriteSchema(const Schema* schema, std::shared_ptr* out) { + MessageBuilder message; + RETURN_NOT_OK(message.SetSchema(schema)); + RETURN_NOT_OK(message.Finish()); + return message.GetBuffer(out); +} + +//---------------------------------------------------------------------- +// Message reader + +class Message::Impl { + public: + explicit Impl(const std::shared_ptr& buffer, + const flatbuf::Message* message) : + buffer_(buffer), + message_(message) {} + + Message::Type type() const { + switch (message_->header_type()) { + case flatbuf::MessageHeader_Schema: + return Message::SCHEMA; + case flatbuf::MessageHeader_DictionaryBatch: + return Message::DICTIONARY_BATCH; + case flatbuf::MessageHeader_RecordBatch: + return Message::RECORD_BATCH; + default: + return Message::NONE; + } + } + + const void* header() const { + return message_->header(); + } + + int64_t body_length() const { + return message_->bodyLength(); + } + + private: + // Owns the memory this message accesses + std::shared_ptr buffer_; + + const flatbuf::Message* message_; +}; + +class SchemaMessage::Impl { + public: + explicit Impl(const void* schema) : + schema_(static_cast(schema)) {} + + const flatbuf::Field* field(int i) const { + return schema_->fields()->Get(i); + } + + int num_fields() const { + return schema_->fields()->size(); + } + + private: + const flatbuf::Schema* schema_; +}; + +Message::Message() {} + +Status Message::Open(const std::shared_ptr& buffer, + std::shared_ptr* out) { + std::shared_ptr result(new Message()); + + // The buffer is prefixed by its size as int32_t + const uint8_t* fb_head = buffer->data() + sizeof(int32_t); + const flatbuf::Message* message = flatbuf::GetMessage(fb_head); + + // TODO(wesm): verify message + result->impl_.reset(new Impl(buffer, message)); + *out = result; + + return Status::OK(); +} + +Message::Type Message::type() const { + return impl_->type(); +} + +int64_t Message::body_length() const { + return impl_->body_length(); +} + +std::shared_ptr Message::get_shared_ptr() { + return this->shared_from_this(); +} + +std::shared_ptr Message::GetSchema() { + return std::make_shared(this->shared_from_this(), + impl_->header()); +} + +SchemaMessage::SchemaMessage(const std::shared_ptr& message, + const void* schema) { + message_ = message; + impl_.reset(new Impl(schema)); +} + +int SchemaMessage::num_fields() const { + return impl_->num_fields(); +} + +Status SchemaMessage::GetField(int i, std::shared_ptr* out) const { + const flatbuf::Field* field = impl_->field(i); + return FieldFromFlatbuffer(field, out); +} + +Status SchemaMessage::GetSchema(std::shared_ptr* out) const { + std::vector> fields(num_fields()); + for (int i = 0; i < this->num_fields(); ++i) { + RETURN_NOT_OK(GetField(i, &fields[i])); + } + *out = std::make_shared(fields); + return Status::OK(); +} + +class RecordBatchMessage::Impl { + public: + explicit Impl(const void* batch) : + batch_(static_cast(batch)) { + nodes_ = batch_->nodes(); + buffers_ = batch_->buffers(); + } + + const flatbuf::FieldNode* field(int i) const { + return nodes_->Get(i); + } + + const flatbuf::Buffer* buffer(int i) const { + return buffers_->Get(i); + } + + int32_t 
length() const { + return batch_->length(); + } + + int num_buffers() const { + return batch_->buffers()->size(); + } + + int num_fields() const { + return batch_->nodes()->size(); + } + + private: + const flatbuf::RecordBatch* batch_; + const flatbuffers::Vector* nodes_; + const flatbuffers::Vector* buffers_; +}; + +std::shared_ptr Message::GetRecordBatch() { + return std::make_shared(this->shared_from_this(), + impl_->header()); +} + +RecordBatchMessage::RecordBatchMessage(const std::shared_ptr& message, + const void* batch) { + message_ = message; + impl_.reset(new Impl(batch)); +} + +// TODO(wesm): Copying the flatbuffer data isn't great, but this will do for +// now +FieldMetadata RecordBatchMessage::field(int i) const { + const flatbuf::FieldNode* node = impl_->field(i); + + FieldMetadata result; + result.length = node->length(); + result.null_count = node->null_count(); + return result; +} + +BufferMetadata RecordBatchMessage::buffer(int i) const { + const flatbuf::Buffer* buffer = impl_->buffer(i); + + BufferMetadata result; + result.page = buffer->page(); + result.offset = buffer->offset(); + result.length = buffer->length(); + return result; +} + +int32_t RecordBatchMessage::length() const { + return impl_->length(); +} + +int RecordBatchMessage::num_buffers() const { + return impl_->num_buffers(); +} + +int RecordBatchMessage::num_fields() const { + return impl_->num_fields(); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h new file mode 100644 index 00000000000..c7288529b9f --- /dev/null +++ b/cpp/src/arrow/ipc/metadata.h @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// C++ object model and user API for interprocess schema messaging + +#ifndef ARROW_IPC_METADATA_H +#define ARROW_IPC_METADATA_H + +#include +#include + +namespace arrow { + +class Buffer; +struct Field; +class Schema; +class Status; + +namespace ipc { + +//---------------------------------------------------------------------- +// Message read/write APIs + +// Serialize arrow::Schema as a Flatbuffer +Status WriteSchema(const Schema* schema, std::shared_ptr* out); + +//---------------------------------------------------------------------- + +// Read interface classes. 
We do not fully deserialize the flatbuffers so that +// individual fields metadata can be retrieved from very large schema without +// + +class Message; + +// Container for serialized Schema metadata contained in an IPC message +class SchemaMessage { + public: + // Accepts an opaque flatbuffer pointer + SchemaMessage(const std::shared_ptr& message, const void* schema); + + int num_fields() const; + + // Construct an arrow::Field for the i-th value in the metadata + Status GetField(int i, std::shared_ptr* out) const; + + // Construct a complete Schema from the message. May be expensive for very + // large schemas if you are only interested in a few fields + Status GetSchema(std::shared_ptr* out) const; + + private: + // Parent, owns the flatbuffer data + std::shared_ptr message_; + + class Impl; + std::unique_ptr impl_; +}; + +// Field metadata +struct FieldMetadata { + int32_t length; + int32_t null_count; +}; + +struct BufferMetadata { + int32_t page; + int64_t offset; + int64_t length; +}; + +// Container for serialized record batch metadata contained in an IPC message +class RecordBatchMessage { + public: + // Accepts an opaque flatbuffer pointer + RecordBatchMessage(const std::shared_ptr& message, + const void* batch_meta); + + FieldMetadata field(int i) const; + BufferMetadata buffer(int i) const; + + int32_t length() const; + int num_buffers() const; + int num_fields() const; + + private: + // Parent, owns the flatbuffer data + std::shared_ptr message_; + + class Impl; + std::unique_ptr impl_; +}; + +class DictionaryBatchMessage { + public: + int64_t id() const; + std::unique_ptr data() const; +}; + +class Message : public std::enable_shared_from_this { + public: + enum Type { + NONE, + SCHEMA, + DICTIONARY_BATCH, + RECORD_BATCH + }; + + static Status Open(const std::shared_ptr& buffer, + std::shared_ptr* out); + + std::shared_ptr get_shared_ptr(); + + int64_t body_length() const; + + Type type() const; + + // These methods only to be invoked if you have checked the message type + std::shared_ptr GetSchema(); + std::shared_ptr GetRecordBatch(); + std::shared_ptr GetDictionaryBatch(); + + private: + Message(); + + // Hide serialization details from user API + class Impl; + std::unique_ptr impl_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_METADATA_H diff --git a/cpp/src/arrow/types/floating.h b/cpp/src/arrow/ipc/test-common.h similarity index 59% rename from cpp/src/arrow/types/floating.h rename to cpp/src/arrow/ipc/test-common.h index e7522781d33..0fccce94107 100644 --- a/cpp/src/arrow/types/floating.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -15,22 +15,39 @@ // specific language governing permissions and limitations // under the License. 
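One nit carried into the MemoryMapFixture below: CreateFile calls ftruncate(fileno(file), size) before checking whether fopen succeeded, so a failed open dereferences a null FILE*. A defensive variant might look like the following standalone sketch (the fixture's tmp_files_ bookkeeping is elided):

#include <cstdint>
#include <cstdio>
#include <string>
#include <unistd.h>

// Defensive variant of MemoryMapFixture::CreateFile: never touch the FILE*
// until fopen is known to have succeeded. Returns false on failure.
static bool CreateFile(const std::string& path, int64_t size) {
  FILE* file = fopen(path.c_str(), "w");
  if (file == nullptr) return false;  // the fixture below ftruncates regardless
  bool ok = (ftruncate(fileno(file), size) == 0);
  fclose(file);
  return ok;
}

int main() {
  bool ok = CreateFile("fixture-sketch-test", 1024);
  std::remove("fixture-sketch-test");
  return ok ? 0 : 1;
}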
-#ifndef ARROW_TYPES_FLOATING_H -#define ARROW_TYPES_FLOATING_H +#ifndef ARROW_IPC_TEST_COMMON_H +#define ARROW_IPC_TEST_COMMON_H +#include +#include #include - -#include "arrow/types/primitive.h" -#include "arrow/type.h" +#include namespace arrow { - -typedef PrimitiveArrayImpl FloatArray; -typedef PrimitiveArrayImpl DoubleArray; - -typedef PrimitiveBuilder FloatBuilder; -typedef PrimitiveBuilder DoubleBuilder; - +namespace ipc { + +class MemoryMapFixture { + public: + void TearDown() { + for (auto path : tmp_files_) { + std::remove(path.c_str()); + } + } + + void CreateFile(const std::string path, int64_t size) { + FILE* file = fopen(path.c_str(), "w"); + if (file != nullptr) { + tmp_files_.push_back(path); + } + ftruncate(fileno(file), size); + fclose(file); + } + + private: + std::vector tmp_files_; +}; + +} // namespace ipc } // namespace arrow -#endif // ARROW_TYPES_FLOATING_H +#endif // ARROW_IPC_TEST_COMMON_H diff --git a/cpp/src/arrow/table/schema-test.cc b/cpp/src/arrow/schema-test.cc similarity index 72% rename from cpp/src/arrow/table/schema-test.cc rename to cpp/src/arrow/schema-test.cc index 9dfade26953..a1de1dc5ac8 100644 --- a/cpp/src/arrow/table/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. -#include #include #include #include -#include "arrow/table/schema.h" +#include "gtest/gtest.h" + +#include "arrow/schema.h" #include "arrow/type.h" -#include "arrow/types/string.h" using std::shared_ptr; using std::vector; @@ -32,25 +32,20 @@ namespace arrow { const auto INT32 = std::make_shared(); TEST(TestField, Basics) { - shared_ptr ftype = INT32; - shared_ptr ftype_nn = std::make_shared(false); - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), ftype->ToString()); + ASSERT_EQ(f0.type->ToString(), INT32->ToString()); - ASSERT_TRUE(f0.nullable()); - ASSERT_FALSE(f0_nn.nullable()); + ASSERT_TRUE(f0.nullable); + ASSERT_FALSE(f0_nn.nullable); } TEST(TestField, Equals) { - shared_ptr ftype = INT32; - shared_ptr ftype_nn = std::make_shared(false); - - Field f0("f0", ftype); - Field f0_nn("f0", ftype_nn); - Field f0_other("f0", ftype); + Field f0("f0", INT32); + Field f0_nn("f0", INT32, false); + Field f0_other("f0", INT32); ASSERT_EQ(f0, f0_other); ASSERT_NE(f0, f0_nn); @@ -63,12 +58,12 @@ class TestSchema : public ::testing::Test { TEST_F(TestSchema, Basics) { auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f1 = std::make_shared("f1", std::make_shared(), false); auto f1_optional = std::make_shared("f1", std::make_shared()); auto f2 = std::make_shared("f2", std::make_shared()); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; auto schema = std::make_shared(fields); ASSERT_EQ(3, schema->num_fields()); @@ -78,7 +73,7 @@ TEST_F(TestSchema, Basics) { auto schema2 = std::make_shared(fields); - vector > fields3 = {f0, f1_optional, f2}; + vector> fields3 = {f0, f1_optional, f2}; auto schema3 = std::make_shared(fields3); ASSERT_TRUE(schema->Equals(schema2)); ASSERT_FALSE(schema->Equals(schema3)); @@ -88,21 +83,20 @@ TEST_F(TestSchema, Basics) { } TEST_F(TestSchema, ToString) { - auto f0 = std::make_shared("f0", std::make_shared()); - auto f1 = std::make_shared("f1", std::make_shared(false)); + auto f0 = std::make_shared("f0", INT32); + auto f1 = std::make_shared("f1", std::make_shared(), 
false); auto f2 = std::make_shared("f2", std::make_shared()); auto f3 = std::make_shared("f3", std::make_shared(std::make_shared())); - vector > fields = {f0, f1, f2, f3}; + vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); std::string result = schema->ToString(); - std::string expected = R"(f0 int32 -f1 uint8 not null -f2 string -f3 list -)"; + std::string expected = R"(f0: int32 +f1: uint8 not null +f2: string +f3: list)"; ASSERT_EQ(expected, result); } diff --git a/cpp/src/arrow/table/schema.cc b/cpp/src/arrow/schema.cc similarity index 88% rename from cpp/src/arrow/table/schema.cc rename to cpp/src/arrow/schema.cc index d49d0a713e7..18aad0e806f 100644 --- a/cpp/src/arrow/table/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/table/schema.h" +#include "arrow/schema.h" #include #include @@ -26,7 +26,7 @@ namespace arrow { -Schema::Schema(const std::vector >& fields) : +Schema::Schema(const std::vector>& fields) : fields_(fields) {} bool Schema::Equals(const Schema& other) const { @@ -49,8 +49,13 @@ bool Schema::Equals(const std::shared_ptr& other) const { std::string Schema::ToString() const { std::stringstream buffer; + int i = 0; for (auto field : fields_) { - buffer << field->ToString() << std::endl; + if (i > 0) { + buffer << std::endl; + } + buffer << field->ToString(); + ++i; } return buffer.str(); } diff --git a/cpp/src/arrow/table/schema.h b/cpp/src/arrow/schema.h similarity index 91% rename from cpp/src/arrow/table/schema.h rename to cpp/src/arrow/schema.h index 103f01b26e3..52f3c1ceae4 100644 --- a/cpp/src/arrow/table/schema.h +++ b/cpp/src/arrow/schema.h @@ -22,13 +22,13 @@ #include #include -#include "arrow/type.h" - namespace arrow { +struct Field; + class Schema { public: - explicit Schema(const std::vector >& fields); + explicit Schema(const std::vector>& fields); // Returns true if all of the schema fields are equal bool Equals(const Schema& other) const; @@ -47,7 +47,7 @@ class Schema { } private: - std::vector > fields_; + std::vector> fields_; }; } // namespace arrow diff --git a/cpp/src/arrow/table/table-test.cc b/cpp/src/arrow/table-test.cc similarity index 92% rename from cpp/src/arrow/table/table-test.cc rename to cpp/src/arrow/table-test.cc index 8b354e8503c..4c7b8f80486 100644 --- a/cpp/src/arrow/table/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -15,19 +15,19 @@ // specific language governing permissions and limitations // under the License. -#include -#include #include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/table/test-common.h" +#include "gtest/gtest.h" + +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" +#include "arrow/util/status.h" using std::shared_ptr; using std::vector; @@ -45,7 +45,7 @@ class TestTable : public TestBase { auto f1 = std::make_shared("f1", UINT8); auto f2 = std::make_shared("f2", INT16); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; schema_ = std::make_shared(fields); columns_ = { @@ -58,7 +58,7 @@ class TestTable : public TestBase { protected: std::unique_ptr
table_; shared_ptr schema_; - vector > columns_; + vector> columns_; }; TEST_F(TestTable, EmptySchema) { diff --git a/cpp/src/arrow/table/table.cc b/cpp/src/arrow/table.cc similarity index 69% rename from cpp/src/arrow/table/table.cc rename to cpp/src/arrow/table.cc index 0c788b8fe3f..e405c1d508c 100644 --- a/cpp/src/arrow/table/table.cc +++ b/cpp/src/arrow/table.cc @@ -15,20 +15,30 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/table/table.h" +#include "arrow/table.h" +#include #include #include -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/type.h" +#include "arrow/column.h" +#include "arrow/schema.h" #include "arrow/util/status.h" namespace arrow { +RowBatch::RowBatch(const std::shared_ptr& schema, int num_rows, + const std::vector>& columns) : + schema_(schema), + num_rows_(num_rows), + columns_(columns) {} + +const std::string& RowBatch::column_name(int i) const { + return schema_->field(i)->name; +} + Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns) : + const std::vector>& columns) : name_(name), schema_(schema), columns_(columns) { @@ -40,7 +50,7 @@ Table::Table(const std::string& name, const std::shared_ptr& schema, } Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns, int64_t num_rows) : + const std::vector>& columns, int64_t num_rows) : name_(name), schema_(schema), columns_(columns), @@ -51,16 +61,19 @@ Status Table::ValidateColumns() const { return Status::Invalid("Number of columns did not match schema"); } - if (columns_.size() == 0) { - return Status::OK(); - } - // Make sure columns are all the same length for (size_t i = 0; i < columns_.size(); ++i) { const Column* col = columns_[i].get(); + if (col == nullptr) { + std::stringstream ss; + ss << "Column " << i << " named " << col->name() + << " was null"; + return Status::Invalid(ss.str()); + } if (col->length() != num_rows_) { std::stringstream ss; - ss << "Column " << i << " expected length " + ss << "Column " << i << " named " << col->name() + << " expected length " << num_rows_ << " but got length " << col->length(); diff --git a/cpp/src/arrow/table/table.h b/cpp/src/arrow/table.h similarity index 55% rename from cpp/src/arrow/table/table.h rename to cpp/src/arrow/table.h index b0129387b71..e2f73a2eedd 100644 --- a/cpp/src/arrow/table/table.h +++ b/cpp/src/arrow/table.h @@ -15,28 +15,74 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_TABLE_TABLE_H -#define ARROW_TABLE_TABLE_H +#ifndef ARROW_TABLE_H +#define ARROW_TABLE_H +#include #include #include #include namespace arrow { +class Array; class Column; class Schema; class Status; +// A row batch is a simpler and more rigid table data structure intended for +// use primarily in shared memory IPC. It contains a schema (metadata) and a +// corresponding vector of equal-length Arrow arrays +class RowBatch { + public: + // num_rows is a parameter to allow for row batches of a particular size not + // having any materialized columns. 
Each array should have the same length as + // num_rows + RowBatch(const std::shared_ptr& schema, int num_rows, + const std::vector>& columns); + + // @returns: the table's schema + const std::shared_ptr& schema() const { + return schema_; + } + + // @returns: the i-th column + // Note: Does not boundscheck + const std::shared_ptr& column(int i) const { + return columns_[i]; + } + + const std::string& column_name(int i) const; + + // @returns: the number of columns in the table + int num_columns() const { + return columns_.size(); + } + + // @returns: the number of rows (the corresponding length of each column) + int64_t num_rows() const { + return num_rows_; + } + + private: + std::shared_ptr schema_; + int num_rows_; + std::vector> columns_; +}; + // Immutable container of fixed-length columns conforming to a particular schema class Table { public: // If columns is zero-length, the table's number of rows is zero Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns); + const std::vector>& columns); + // num_rows is a parameter to allow for tables of a particular size not + // having any materialized columns. Each column should therefore have the + // same length as num_rows -- you can validate this using + // Table::ValidateColumns Table(const std::string& name, const std::shared_ptr& schema, - const std::vector >& columns, int64_t num_rows); + const std::vector>& columns, int64_t num_rows); // @returns: the table's name, if any (may be length 0) const std::string& name() const { @@ -72,11 +118,11 @@ class Table { std::string name_; std::shared_ptr schema_; - std::vector > columns_; + std::vector> columns_; int64_t num_rows_; }; } // namespace arrow -#endif // ARROW_TABLE_TABLE_H +#endif // ARROW_TABLE_H diff --git a/cpp/src/arrow/table/test-common.h b/cpp/src/arrow/table/test-common.h deleted file mode 100644 index 50a5f6a2f50..00000000000 --- a/cpp/src/arrow/table/test-common.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
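The RowBatch introduced in table.h is deliberately minimal: a schema, a vector of equal-length arrays, and an explicit num_rows. A hypothetical construction under those declarations, where the length-4 `values` array is a placeholder built elsewhere rather than part of this patch:

```cpp
// Sketch: assemble a single-column RowBatch. `values` is assumed to be an
// arrow::Array of length 4 produced elsewhere (e.g. with a builder).
#include <memory>
#include <vector>

#include "arrow/schema.h"
#include "arrow/table.h"
#include "arrow/type.h"

std::shared_ptr<arrow::RowBatch> MakeBatch(
    const std::shared_ptr<arrow::Array>& values) {
  auto f0 = std::make_shared<arrow::Field>(
      "f0", std::make_shared<arrow::Int32Type>());
  auto schema = std::make_shared<arrow::Schema>(
      std::vector<std::shared_ptr<arrow::Field>>{f0});
  // num_rows is explicit; every column must have exactly this length.
  return std::make_shared<arrow::RowBatch>(
      schema, 4, std::vector<std::shared_ptr<arrow::Array>>{values});
}
```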
- -#include -#include -#include -#include -#include - -#include "arrow/table/column.h" -#include "arrow/table/schema.h" -#include "arrow/table/table.h" -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/memory-pool.h" - -namespace arrow { - -class TestBase : public ::testing::Test { - public: - void SetUp() { - pool_ = GetDefaultMemoryPool(); - } - - template - std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { - auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); - EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); - return std::make_shared(length, data, 10, nulls); - } - - protected: - MemoryPool* pool_; -}; - -} // namespace arrow diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 0898c8e3e3a..a9fb2a7644a 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -18,26 +18,39 @@ #ifndef ARROW_TEST_UTIL_H_ #define ARROW_TEST_UTIL_H_ -#include +#include #include +#include #include #include +#include "gtest/gtest.h" + +#include "arrow/type.h" +#include "arrow/column.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/random.h" #include "arrow/util/status.h" #define ASSERT_RAISES(ENUM, expr) \ do { \ Status s = (expr); \ - ASSERT_TRUE(s.Is##ENUM()); \ + if (!s.Is##ENUM()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) #define ASSERT_OK(expr) \ do { \ Status s = (expr); \ - ASSERT_TRUE(s.ok()); \ + if (!s.ok()) { \ + FAIL() << s.ToString(); \ + } \ } while (0) @@ -50,6 +63,27 @@ namespace arrow { +class TestBase : public ::testing::Test { + public: + void SetUp() { + pool_ = default_memory_pool(); + } + + template + std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { + auto data = std::make_shared(pool_); + auto nulls = std::make_shared(pool_); + EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); + EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); + return std::make_shared(length, data, 10, nulls); + } + + protected: + MemoryPool* pool_; +}; + +namespace test { + template void randint(int64_t N, T lower, T upper, std::vector* out) { Random rng(random_seed()); @@ -84,6 +118,33 @@ void random_nulls(int64_t n, double pct_null, std::vector* nulls) { } } +static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(0, 255); + + for (int i = 0; i < n; ++i) { + out[i] = d(gen) & 0xFF; + } +} + +template +void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(min_value, max_value); + for (int i = 0; i < n; ++i) { + out[i] = d(gen); + } +} + +static inline int bitmap_popcount(const uint8_t* data, int length) { + int count = 0; + for (int i = 0; i < length; ++i) { + // TODO: accelerate this + if (util::get_bit(data, i)) ++count; + } + return count; +} + static inline int null_count(const std::vector& nulls) { int result = 0; for (size_t i = 0; i < nulls.size(); ++i) { @@ -102,6 +163,7 @@ std::shared_ptr bytes_to_null_buffer(uint8_t* bytes, int length) { return out; } +} // namespace test } // namespace arrow #endif // ARROW_TEST_UTIL_H_ diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 
0a2e817ad30..f7f835e96a7 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -24,45 +24,37 @@ namespace arrow { std::string Field::ToString() const { std::stringstream ss; - ss << this->name << " " << this->type->ToString(); + ss << this->name << ": " << this->type->ToString(); + if (!this->nullable) { + ss << " not null"; + } return ss.str(); } DataType::~DataType() {} -StringType::StringType(bool nullable) - : DataType(LogicalType::STRING, nullable) {} - -StringType::StringType(const StringType& other) - : StringType(other.nullable) {} +StringType::StringType() : DataType(Type::STRING) {} std::string StringType::ToString() const { std::string result(name()); - if (!nullable) { - result.append(" not null"); - } return result; } std::string ListType::ToString() const { std::stringstream s; - s << "list<" << value_type->ToString() << ">"; - if (!this->nullable) { - s << " not null"; - } + s << "list<" << value_field()->ToString() << ">"; return s.str(); } std::string StructType::ToString() const { std::stringstream s; s << "struct<"; - for (size_t i = 0; i < fields_.size(); ++i) { + for (int i = 0; i < this->num_children(); ++i) { if (i > 0) s << ", "; - const std::shared_ptr& field = fields_[i]; + const std::shared_ptr& field = this->child(i); s << field->name << ": " << field->type->ToString(); } s << ">"; - if (!nullable) s << " not null"; return s.str(); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 00b01ea86e8..5984b6718dd 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -18,62 +18,34 @@ #ifndef ARROW_TYPE_H #define ARROW_TYPE_H +#include #include #include #include namespace arrow { -// Physical data type that describes the memory layout of values. See details -// for each type -enum class LayoutEnum: char { - // A physical type consisting of some non-negative number of bytes - BYTE = 0, - - // A physical type consisting of some non-negative number of bits - BIT = 1, - - // A parametric variable-length value type. Full specification requires a - // child logical type - LIST = 2, - - // A collection of multiple equal-length child arrays. Parametric type taking - // 1 or more child logical types - STRUCT = 3, - - // An array with heterogeneous value types. Parametric types taking 1 or more - // child logical types - DENSE_UNION = 4, - SPARSE_UNION = 5 -}; - - -struct LayoutType { - LayoutEnum type; - explicit LayoutType(LayoutEnum type) : type(type) {} -}; - // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. 
a // timestamp encoded as an int64) -struct LogicalType { +struct Type { enum type { // A degenerate NULL type represented as 0 bytes/bits NA = 0, - // Little-endian integer types - UINT8 = 1, - INT8 = 2, - UINT16 = 3, - INT16 = 4, - UINT32 = 5, - INT32 = 6, - UINT64 = 7, - INT64 = 8, - // A boolean value represented as 1 bit - BOOL = 9, + BOOL = 1, + + // Little-endian integer types + UINT8 = 2, + INT8 = 3, + UINT16 = 4, + INT16 = 5, + UINT32 = 6, + INT32 = 7, + UINT64 = 8, + INT64 = 9, // 4-byte floating point value FLOAT = 10, @@ -131,30 +103,38 @@ struct LogicalType { }; }; +struct Field; + struct DataType { - LogicalType::type type; - bool nullable; + Type::type type; - explicit DataType(LogicalType::type type, bool nullable = true) : - type(type), - nullable(nullable) {} + std::vector> children_; + + explicit DataType(Type::type type) : + type(type) {} virtual ~DataType(); bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || (this->type == other->type && - this->nullable == other->nullable); + return this == other || (this->type == other->type); } bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } + const std::shared_ptr& child(int i) const { + return children_[i]; + } + + int num_children() const { + return children_.size(); + } + virtual std::string ToString() const = 0; }; -typedef std::shared_ptr LayoutPtr; typedef std::shared_ptr TypePtr; // A field is a piece of metadata that includes (for now) a name and a data @@ -166,9 +146,13 @@ struct Field { // The field's data type TypePtr type; - Field(const std::string& name, const TypePtr& type) : + // Fields can be nullable + bool nullable; + + Field(const std::string& name, const TypePtr& type, bool nullable = true) : name(name), - type(type) {} + type(type), + nullable(nullable) {} bool operator==(const Field& other) const { return this->Equals(other); @@ -180,6 +164,7 @@ struct Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && + this->nullable == other.nullable && this->type->Equals(other.type.get())); } @@ -187,36 +172,12 @@ struct Field { return Equals(*other.get()); } - bool nullable() const { - return this->type->nullable; - } - std::string ToString() const; }; -struct BytesType : public LayoutType { - int size; - - explicit BytesType(int size) - : LayoutType(LayoutEnum::BYTE), - size(size) {} - - BytesType(const BytesType& other) - : BytesType(other.size) {} -}; - -struct ListLayoutType : public LayoutType { - LayoutPtr value_type; - - explicit ListLayoutType(const LayoutPtr& value_type) - : LayoutType(LayoutEnum::BYTE), - value_type(value_type) {} -}; - template struct PrimitiveType : public DataType { - explicit PrimitiveType(bool nullable = true) - : DataType(Derived::type_enum, nullable) {} + PrimitiveType() : DataType(Derived::type_enum) {} std::string ToString() const override; }; @@ -224,22 +185,19 @@ struct PrimitiveType : public DataType { template inline std::string PrimitiveType::ToString() const { std::string result(static_cast(this)->name()); - if (!nullable) { - result.append(" not null"); - } return result; } -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr LogicalType::type type_enum = LogicalType::ENUM; \ - static constexpr int size = SIZE; \ - \ - explicit TYPENAME(bool nullable = true) \ - : PrimitiveType(nullable) {} \ - \ - static const char* name() { \ - return NAME; \ +#define PRIMITIVE_DECL(TYPENAME, 
C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr Type::type type_enum = Type::ENUM; \ + static constexpr int size = SIZE; \ + \ + TYPENAME() \ + : PrimitiveType() {} \ + \ + static const char* name() { \ + return NAME; \ } struct NullType : public PrimitiveType { @@ -292,11 +250,23 @@ struct DoubleType : public PrimitiveType { struct ListType : public DataType { // List can contain any other logical value type - TypePtr value_type; + explicit ListType(const std::shared_ptr& value_type) + : DataType(Type::LIST) { + children_ = {std::make_shared("item", value_type)}; + } + + explicit ListType(const std::shared_ptr& value_field) + : DataType(Type::LIST) { + children_ = {value_field}; + } - explicit ListType(const TypePtr& value_type, bool nullable = true) - : DataType(LogicalType::LIST, nullable), - value_type(value_type) {} + const std::shared_ptr& value_field() const { + return children_[0]; + } + + const std::shared_ptr& value_type() const { + return children_[0]->type; + } static char const *name() { return "list"; @@ -307,9 +277,7 @@ struct ListType : public DataType { // String is a logical type consisting of a physical list of 1-byte values struct StringType : public DataType { - explicit StringType(bool nullable = true); - - StringType(const StringType& other); + StringType(); static char const *name() { return "string"; @@ -319,20 +287,9 @@ struct StringType : public DataType { }; struct StructType : public DataType { - std::vector > fields_; - - explicit StructType(const std::vector >& fields, - bool nullable = true) - : DataType(LogicalType::STRUCT, nullable) { - fields_ = fields; - } - - const std::shared_ptr& field(int i) const { - return fields_[i]; - } - - int num_children() const { - return fields_.size(); + explicit StructType(const std::vector>& fields) + : DataType(Type::STRUCT) { + children_ = fields; } std::string ToString() const override; diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 57cabdefd25..595b3be6e16 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -26,8 +26,6 @@ install(FILES construct.h datetime.h decimal.h - floating.h - integer.h json.h list.h primitive.h diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h index a5023d7b368..1cb91f9ba49 100644 --- a/cpp/src/arrow/types/boolean.h +++ b/cpp/src/arrow/types/boolean.h @@ -22,7 +22,7 @@ namespace arrow { -typedef PrimitiveArrayImpl BooleanArray; +// typedef PrimitiveArrayImpl BooleanArray; class BooleanBuilder : public ArrayBuilder { }; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 42a9c926bb1..46d84f1f183 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -25,7 +25,7 @@ namespace arrow { -template +template struct CollectionType : public DataType { std::vector child_types_; diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 43f01a30513..290decd81ff 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -19,24 +19,26 @@ #include -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" +#include "arrow/type.h" +#include "arrow/types/primitive.h" #include "arrow/types/list.h" #include "arrow/types/string.h" +#include "arrow/util/buffer.h" #include "arrow/util/status.h" namespace arrow { class ArrayBuilder; -// Initially looked at doing this with vtables, but shared pointers makes it -// difficult - #define BUILDER_CASE(ENUM, 
BuilderType) \ - case LogicalType::ENUM: \ + case Type::ENUM: \ out->reset(new BuilderType(pool, type)); \ return Status::OK(); +// Initially looked at doing this with vtables, but shared pointers makes it +// difficult +// +// TODO(wesm): come up with a less monolithic strategy Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out) { switch (type->type) { @@ -56,30 +58,41 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(STRING, StringBuilder); - case LogicalType::LIST: + case Type::LIST: { std::shared_ptr value_builder; const std::shared_ptr& value_type = static_cast( - type.get())->value_type; + type.get())->value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); out->reset(new ListBuilder(pool, type, value_builder)); return Status::OK(); } - // BUILDER_CASE(CHAR, CharBuilder); - - // BUILDER_CASE(VARCHAR, VarcharBuilder); - // BUILDER_CASE(BINARY, BinaryBuilder); - - // BUILDER_CASE(DATE, DateBuilder); - // BUILDER_CASE(TIMESTAMP, TimestampBuilder); - // BUILDER_CASE(TIME, TimeBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} - // BUILDER_CASE(LIST, ListBuilder); - // BUILDER_CASE(STRUCT, StructBuilder); - // BUILDER_CASE(DENSE_UNION, DenseUnionBuilder); - // BUILDER_CASE(SPARSE_UNION, SparseUnionBuilder); +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_count, nulls)); \ + return Status::OK(); +Status MakePrimitiveArray(const std::shared_ptr& type, + int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls, + std::shared_ptr* out) { + switch (type->type) { + MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT16, Int16Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT32, UInt32Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); + MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); + MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); + MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); } diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 59ebe1acddc..089c484c58b 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -18,19 +18,26 @@ #ifndef ARROW_TYPES_CONSTRUCT_H #define ARROW_TYPES_CONSTRUCT_H +#include #include -#include "arrow/type.h" - namespace arrow { +class Array; class ArrayBuilder; +class Buffer; +struct DataType; class MemoryPool; class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +Status MakePrimitiveArray(const std::shared_ptr& type, + int32_t length, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& nulls, + std::shared_ptr* out); + } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 765fc29dd57..e57b66ab46a 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -31,8 +31,8 @@ struct DateType : public DataType { Unit unit; - explicit DateType(Unit unit = Unit::DAY, bool nullable = true) - : DataType(LogicalType::DATE, nullable), + explicit DateType(Unit unit = Unit::DAY) + : DataType(Type::DATE), unit(unit) {} DateType(const DateType& other) @@ -41,10 +41,6 @@ struct DateType : public DataType { 
static char const *name() { return "date"; } - - // virtual std::string ToString() { - // return name(); - // } }; @@ -58,8 +54,8 @@ struct TimestampType : public DataType { Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI, bool nullable = true) - : DataType(LogicalType::TIMESTAMP, nullable), + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(Type::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) @@ -68,10 +64,6 @@ struct TimestampType : public DataType { static char const *name() { return "timestamp"; } - - // virtual std::string ToString() { - // return name(); - // } }; } // namespace arrow diff --git a/cpp/src/arrow/types/floating.cc b/cpp/src/arrow/types/floating.cc deleted file mode 100644 index bde28266e63..00000000000 --- a/cpp/src/arrow/types/floating.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/floating.h" - -namespace arrow { - -} // namespace arrow diff --git a/cpp/src/arrow/types/integer.cc b/cpp/src/arrow/types/integer.cc deleted file mode 100644 index 46965366169..00000000000 --- a/cpp/src/arrow/types/integer.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/integer.h" - -namespace arrow { - -} // namespace arrow diff --git a/cpp/src/arrow/types/integer.h b/cpp/src/arrow/types/integer.h deleted file mode 100644 index 56841912494..00000000000 --- a/cpp/src/arrow/types/integer.h +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_INTEGER_H -#define ARROW_TYPES_INTEGER_H - -#include -#include - -#include "arrow/types/primitive.h" -#include "arrow/type.h" - -namespace arrow { - -// Array containers - -typedef PrimitiveArrayImpl UInt8Array; -typedef PrimitiveArrayImpl Int8Array; - -typedef PrimitiveArrayImpl UInt16Array; -typedef PrimitiveArrayImpl Int16Array; - -typedef PrimitiveArrayImpl UInt32Array; -typedef PrimitiveArrayImpl Int32Array; - -typedef PrimitiveArrayImpl UInt64Array; -typedef PrimitiveArrayImpl Int64Array; - -// Builders - -typedef PrimitiveBuilder UInt8Builder; -typedef PrimitiveBuilder UInt16Builder; -typedef PrimitiveBuilder UInt32Builder; -typedef PrimitiveBuilder UInt64Builder; - -typedef PrimitiveBuilder Int8Builder; -typedef PrimitiveBuilder Int16Builder; -typedef PrimitiveBuilder Int32Builder; -typedef PrimitiveBuilder Int64Builder; - -} // namespace arrow - -#endif // ARROW_TYPES_INTEGER_H diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index 168e370d51a..fb731edd607 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -20,7 +20,6 @@ #include #include "arrow/type.h" -#include "arrow/types/string.h" #include "arrow/types/union.h" namespace arrow { diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index b67fb3807ad..9c850afac0a 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,8 +28,8 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true, bool nullable = true) - : DataType(LogicalType::JSON_SCALAR, nullable), + explicit JSONScalar(bool dense = true) + : DataType(Type::JSON_SCALAR), dense(dense) {} }; diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 02991de2648..eb55ca868ee 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -15,20 +15,21 @@ // specific language governing permissions and limitations // under the License. 
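Between the LogicalType-to-Type rename and the MakePrimitiveArray factory added in construct.cc, turning raw buffers into a typed array becomes a one-call affair. A minimal sketch, assuming `data` holds `length` int32 values with no nulls:

```cpp
// Sketch: wrap a raw buffer of int32 values in an array via the
// MakePrimitiveArray factory from construct.cc.
#include <cstdint>
#include <memory>

#include "arrow/type.h"
#include "arrow/types/construct.h"
#include "arrow/util/buffer.h"
#include "arrow/util/status.h"

arrow::Status WrapInt32(const std::shared_ptr<arrow::Buffer>& data,
                        int32_t length, std::shared_ptr<arrow::Array>* out) {
  auto type = std::make_shared<arrow::Int32Type>();
  // null_count = 0 and a null bitmap of nullptr mark the array as non-null.
  return arrow::MakePrimitiveArray(type, length, data, 0, nullptr, out);
}
```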
-#include #include #include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" +#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/types/construct.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" -#include "arrow/types/string.h" +#include "arrow/types/primitive.h" #include "arrow/types/test-common.h" #include "arrow/util/status.h" @@ -39,27 +40,24 @@ using std::vector; namespace arrow { -class ArrayBuilder; - TEST(TypesTest, TestListType) { std::shared_ptr vt = std::make_shared(); ListType list_type(vt); - ASSERT_EQ(list_type.type, LogicalType::LIST); + ASSERT_EQ(list_type.type, Type::LIST); ASSERT_EQ(list_type.name(), string("list")); - ASSERT_EQ(list_type.ToString(), string("list")); + ASSERT_EQ(list_type.ToString(), string("list")); - ASSERT_EQ(list_type.value_type->type, vt->type); - ASSERT_EQ(list_type.value_type->type, vt->type); + ASSERT_EQ(list_type.value_type()->type, vt->type); + ASSERT_EQ(list_type.value_type()->type, vt->type); - std::shared_ptr st = std::make_shared(false); - std::shared_ptr lt = std::make_shared(st, false); - ASSERT_EQ(lt->ToString(), string("list not null")); + std::shared_ptr st = std::make_shared(); + std::shared_ptr lt = std::make_shared(st); + ASSERT_EQ(lt->ToString(), string("list")); - ListType lt2(lt, false); - ASSERT_EQ(lt2.ToString(), - string("list not null> not null")); + ListType lt2(lt); + ASSERT_EQ(lt2.ToString(), string("list>")); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 69a79a77fab..670ee4da116 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -19,4 +19,33 @@ namespace arrow { +bool ListArray::EqualsExact(const ListArray& other) const { + if (this == &other) return true; + if (null_count_ != other.null_count_) { + return false; + } + + bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, + length_ + 1); + bool equal_nulls = true; + if (null_count_ > 0) { + equal_nulls = nulls_->Equals(*other.nulls_, + util::bytes_for_bits(length_)); + } + + if (!(equal_offsets && equal_nulls)) { + return false; + } + + return values()->Equals(other.values()); +} + +bool ListArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (this->type_enum() != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 210c76a046c..141f762458b 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -21,12 +21,10 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -38,29 +36,19 @@ class MemoryPool; class ListArray : public Array { public: - ListArray() : Array(), offset_buf_(nullptr), offsets_(nullptr) {} - ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, - std::shared_ptr nulls = nullptr) { - Init(type, length, offsets, values, null_count, nulls); - } - - virtual ~ListArray() {} - - void Init(const TypePtr& type, int32_t length, std::shared_ptr offsets, - const ArrayPtr& values, - int32_t null_count = 0, - std::shared_ptr nulls = nullptr) { + std::shared_ptr nulls = nullptr) : + 
Array(type, length, null_count, nulls) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast(offset_buf_->data()); - values_ = values; - Array::Init(type, length, null_count, nulls); } + virtual ~ListArray() {} + // Return a shared pointer in case the requestor desires to share ownership // with this array. const std::shared_ptr& values() const {return values_;} @@ -77,6 +65,9 @@ class ListArray : public Array { int32_t value_offset(int i) { return offsets_[i];} int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i];} + bool EqualsExact(const ListArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + protected: std::shared_ptr offset_buf_; const int32_t* offsets_; @@ -137,8 +128,6 @@ class ListBuilder : public Int32Builder { template std::shared_ptr Transfer() { - auto result = std::make_shared(); - std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero @@ -146,8 +135,9 @@ class ListBuilder : public Int32Builder { raw_buffer()[length_] = items->length(); } - result->Init(type_, length_, values_, items, + auto result = std::make_shared(type_, length_, values_, items, null_count_, nulls_); + values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index f35a258e2cb..7eae8cda8c4 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -15,21 +15,17 @@ // specific language governing permissions and limitations // under the License. -#include - #include #include #include #include -#include "arrow/array.h" +#include "gtest/gtest.h" + #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/boolean.h" #include "arrow/types/construct.h" -#include "arrow/types/floating.h" -#include "arrow/types/integer.h" #include "arrow/types/primitive.h" #include "arrow/types/test-common.h" #include "arrow/util/bit-util.h" @@ -43,23 +39,17 @@ using std::vector; namespace arrow { -TEST(TypesTest, TestBytesType) { - BytesType t1(3); - - ASSERT_EQ(t1.type, LayoutEnum::BYTE); - ASSERT_EQ(t1.size, 3); -} - +class Array; #define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ TEST(TypesTest, TestPrimitive_##ENUM) { \ KLASS tp; \ \ - ASSERT_EQ(tp.type, LogicalType::ENUM); \ + ASSERT_EQ(tp.type, Type::ENUM); \ ASSERT_EQ(tp.name(), string(NAME)); \ \ KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, LogicalType::ENUM); \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -109,22 +99,20 @@ class TestPrimitiveBuilder : public TestBuilder { void RandomData(int N, double pct_null = 0.1) { Attrs::draw(N, &draws_); - random_nulls(N, pct_null, &nulls_); + test::random_nulls(N, pct_null, &nulls_); } void CheckNullable() { - ArrayType expected; int size = builder_->length(); auto ex_data = std::make_shared( reinterpret_cast(draws_.data()), size * sizeof(T)); - auto ex_nulls = bytes_to_null_buffer(nulls_.data(), size); - - int32_t ex_null_count = null_count(nulls_); + auto ex_nulls = test::bytes_to_null_buffer(nulls_.data(), size); + int32_t ex_null_count = test::null_count(nulls_); - expected.Init(size, ex_data, ex_null_count, ex_nulls); + auto expected = std::make_shared(size, ex_data, ex_null_count, ex_nulls); std::shared_ptr result = std::dynamic_pointer_cast( builder_->Finish()); @@ -135,18 +123,17 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_->null_count()); ASSERT_EQ(nullptr, 
builder_->buffer()); - ASSERT_TRUE(result->Equals(expected)); + ASSERT_TRUE(result->EqualsExact(*expected.get())); ASSERT_EQ(ex_null_count, result->null_count()); } void CheckNonNullable() { - ArrayType expected; int size = builder_nn_->length(); auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), size * sizeof(T)); - expected.Init(size, ex_data); + auto expected = std::make_shared(size, ex_data); std::shared_ptr result = std::dynamic_pointer_cast( builder_nn_->Finish()); @@ -156,7 +143,7 @@ class TestPrimitiveBuilder : public TestBuilder { ASSERT_EQ(0, builder_nn_->capacity()); ASSERT_EQ(nullptr, builder_nn_->buffer()); - ASSERT_TRUE(result->Equals(expected)); + ASSERT_TRUE(result->EqualsExact(*expected.get())); ASSERT_EQ(0, result->null_count()); } @@ -183,8 +170,8 @@ class TestPrimitiveBuilder : public TestBuilder { #define PINT_DECL(CapType, c_type, LOWER, UPPER) \ struct P##CapType { \ PTYPE_DECL(CapType, c_type); \ - static void draw(int N, vector* draws) { \ - randint(N, LOWER, UPPER, draws); \ + static void draw(int N, vector* draws) { \ + test::randint(N, LOWER, UPPER, draws); \ } \ } diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index c86260b0fc6..32b8bfa7f1b 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -26,16 +26,16 @@ namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -void PrimitiveArray::Init(const TypePtr& type, int32_t length, +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& nulls) { - Array::Init(type, length, null_count, nulls); + const std::shared_ptr& nulls) : + Array(type, length, null_count, nulls) { data_ = data; raw_data_ = data == nullptr? 
nullptr : data_->data(); } -bool PrimitiveArray::Equals(const PrimitiveArray& other) const { +bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { if (this == &other) return true; if (null_count_ != other.null_count_) { return false; @@ -50,4 +50,12 @@ bool PrimitiveArray::Equals(const PrimitiveArray& other) const { } } +bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (this->type_enum() != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 22ab59c309a..e01027cf55c 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -21,7 +21,6 @@ #include #include #include -#include #include "arrow/array.h" #include "arrow/builder.h" @@ -38,64 +37,57 @@ class MemoryPool; // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray() : Array(), data_(nullptr), raw_data_(nullptr) {} - - virtual ~PrimitiveArray() {} - - void Init(const TypePtr& type, int32_t length, + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& nulls = nullptr); + virtual ~PrimitiveArray() {} const std::shared_ptr& data() const { return data_;} - bool Equals(const PrimitiveArray& other) const; + bool EqualsExact(const PrimitiveArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; protected: std::shared_ptr data_; const uint8_t* raw_data_; }; - -template -class PrimitiveArrayImpl : public PrimitiveArray { - public: - typedef typename TypeClass::c_type value_type; - - PrimitiveArrayImpl() : PrimitiveArray() {} - - virtual ~PrimitiveArrayImpl() {} - - PrimitiveArrayImpl(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - Init(length, data, null_count, nulls); - } - - void Init(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - TypePtr type(new TypeClass()); - PrimitiveArray::Init(type, length, data, null_count, nulls); - } - - bool Equals(const PrimitiveArrayImpl& other) const { - return PrimitiveArray::Equals(*static_cast(&other)); - } - - const value_type* raw_data() const { - return reinterpret_cast(raw_data_); - } - - value_type Value(int i) const { - return raw_data()[i]; - } - - TypeClass* exact_type() const { - return static_cast(type_); - } +#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ +class NAME : public PrimitiveArray { \ + public: \ + using value_type = T; \ + using PrimitiveArray::PrimitiveArray; \ + NAME(int32_t length, const std::shared_ptr& data, \ + int32_t null_count = 0, \ + const std::shared_ptr& nulls = nullptr) : \ + PrimitiveArray(std::make_shared(), length, data, \ + null_count, nulls) {} \ + \ + bool EqualsExact(const NAME& other) const { \ + return PrimitiveArray::EqualsExact( \ + *static_cast(&other)); \ + } \ + \ + const T* raw_data() const { \ + return reinterpret_cast(raw_data_); \ + } \ + \ + T Value(int i) const { \ + return raw_data()[i]; \ + } \ }; +NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t); +NUMERIC_ARRAY_DECL(Int8Array, Int8Type, int8_t); +NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type, uint16_t); +NUMERIC_ARRAY_DECL(Int16Array, Int16Type, int16_t); +NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type, uint32_t); +NUMERIC_ARRAY_DECL(Int32Array, Int32Type, int32_t); 
+NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type, uint64_t); +NUMERIC_ARRAY_DECL(Int64Array, Int64Type, int64_t); +NUMERIC_ARRAY_DECL(FloatArray, FloatType, float); +NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double); template class PrimitiveBuilder : public ArrayBuilder { @@ -202,8 +194,9 @@ class PrimitiveBuilder : public ArrayBuilder { } std::shared_ptr Finish() override { - std::shared_ptr result = std::make_shared(); - result->PrimitiveArray::Init(type_, length_, values_, null_count_, nulls_); + std::shared_ptr result = std::make_shared( + type_, length_, values_, null_count_, nulls_); + values_ = nulls_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; @@ -222,6 +215,21 @@ class PrimitiveBuilder : public ArrayBuilder { int elsize_; }; +// Builders + +typedef PrimitiveBuilder UInt8Builder; +typedef PrimitiveBuilder UInt16Builder; +typedef PrimitiveBuilder UInt32Builder; +typedef PrimitiveBuilder UInt64Builder; + +typedef PrimitiveBuilder Int8Builder; +typedef PrimitiveBuilder Int16Builder; +typedef PrimitiveBuilder Int32Builder; +typedef PrimitiveBuilder Int64Builder; + +typedef PrimitiveBuilder FloatBuilder; +typedef PrimitiveBuilder DoubleBuilder; + } // namespace arrow #endif // ARROW_TYPES_PRIMITIVE_H diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6381093dcbb..7dc3d682cdc 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -15,21 +15,20 @@ // specific language governing permissions and limitations // under the License. -#include #include +#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/array.h" -#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" -#include "arrow/types/construct.h" -#include "arrow/types/integer.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/test-common.h" -#include "arrow/util/status.h" namespace arrow { @@ -38,14 +37,14 @@ class Buffer; TEST(TypesTest, TestCharType) { CharType t1(5); - ASSERT_EQ(t1.type, LogicalType::CHAR); + ASSERT_EQ(t1.type, Type::CHAR); ASSERT_EQ(t1.size, 5); ASSERT_EQ(t1.ToString(), std::string("char(5)")); // Test copy constructor CharType t2 = t1; - ASSERT_EQ(t2.type, LogicalType::CHAR); + ASSERT_EQ(t2.type, Type::CHAR); ASSERT_EQ(t2.size, 5); } @@ -53,22 +52,20 @@ TEST(TypesTest, TestCharType) { TEST(TypesTest, TestVarcharType) { VarcharType t1(5); - ASSERT_EQ(t1.type, LogicalType::VARCHAR); + ASSERT_EQ(t1.type, Type::VARCHAR); ASSERT_EQ(t1.size, 5); - ASSERT_EQ(t1.physical_type.size, 6); ASSERT_EQ(t1.ToString(), std::string("varchar(5)")); // Test copy constructor VarcharType t2 = t1; - ASSERT_EQ(t2.type, LogicalType::VARCHAR); + ASSERT_EQ(t2.type, Type::VARCHAR); ASSERT_EQ(t2.size, 5); - ASSERT_EQ(t2.physical_type.size, 6); } TEST(TypesTest, TestStringType) { StringType str; - ASSERT_EQ(str.type, LogicalType::STRING); + ASSERT_EQ(str.type, Type::STRING); ASSERT_EQ(str.name(), std::string("string")); } @@ -90,15 +87,16 @@ class TestStringContainer : public ::testing::Test { length_ = offsets_.size() - 1; int nchars = chars_.size(); - value_buf_ = to_buffer(chars_); + value_buf_ = test::to_buffer(chars_); values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = to_buffer(offsets_); + offsets_buf_ = test::to_buffer(offsets_); - nulls_buf_ = bytes_to_null_buffer(nulls_.data(), nulls_.size()); - null_count_ = null_count(nulls_); + nulls_buf_ = test::bytes_to_null_buffer(nulls_.data(), nulls_.size()); + null_count_ = 
test::null_count(nulls_); - strings_.Init(length_, offsets_buf_, values_, null_count_, nulls_buf_); + strings_ = std::make_shared(length_, offsets_buf_, values_, + null_count_, nulls_buf_); } protected: @@ -116,28 +114,28 @@ class TestStringContainer : public ::testing::Test { int length_; ArrayPtr values_; - StringArray strings_; + std::shared_ptr strings_; }; TEST_F(TestStringContainer, TestArrayBasics) { - ASSERT_EQ(length_, strings_.length()); - ASSERT_EQ(1, strings_.null_count()); + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); } TEST_F(TestStringContainer, TestType) { - TypePtr type = strings_.type(); + TypePtr type = strings_->type(); - ASSERT_EQ(LogicalType::STRING, type->type); - ASSERT_EQ(LogicalType::STRING, strings_.logical_type()); + ASSERT_EQ(Type::STRING, type->type); + ASSERT_EQ(Type::STRING, strings_->type_enum()); } TEST_F(TestStringContainer, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { - ASSERT_EQ(pos, strings_.value_offset(i)); - ASSERT_EQ(expected_[i].size(), strings_.value_length(i)); + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); pos += expected_[i].size(); } } @@ -151,9 +149,9 @@ TEST_F(TestStringContainer, TestDestructor) { TEST_F(TestStringContainer, TestGetString) { for (size_t i = 0; i < expected_.size(); ++i) { if (nulls_[i]) { - ASSERT_TRUE(strings_.IsNull(i)); + ASSERT_TRUE(strings_->IsNull(i)); } else { - ASSERT_EQ(expected_[i], strings_.GetString(i)); + ASSERT_EQ(expected_[i], strings_->GetString(i)); } } } @@ -199,7 +197,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { Done(); ASSERT_EQ(reps * N, result_->length()); - ASSERT_EQ(reps * null_count(is_null), result_->null_count()); + ASSERT_EQ(reps * test::null_count(is_null), result_->null_count()); ASSERT_EQ(reps * 6, result_->values()->length()); int32_t length; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 8ccc0a9698a..2b3fba5ce09 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -25,25 +25,21 @@ #include "arrow/array.h" #include "arrow/type.h" -#include "arrow/types/integer.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/util/status.h" namespace arrow { -class ArrayBuilder; class Buffer; class MemoryPool; struct CharType : public DataType { int size; - BytesType physical_type; - - explicit CharType(int size, bool nullable = true) - : DataType(LogicalType::CHAR, nullable), - size(size), - physical_type(BytesType(size)) {} + explicit CharType(int size) + : DataType(Type::CHAR), + size(size) {} CharType(const CharType& other) : CharType(other.size) {} @@ -56,54 +52,36 @@ struct CharType : public DataType { struct VarcharType : public DataType { int size; - BytesType physical_type; - - explicit VarcharType(int size, bool nullable = true) - : DataType(LogicalType::VARCHAR, nullable), - size(size), - physical_type(BytesType(size + 1)) {} + explicit VarcharType(int size) + : DataType(Type::VARCHAR), + size(size) {} VarcharType(const VarcharType& other) : VarcharType(other.size) {} virtual std::string ToString() const; }; -static const LayoutPtr byte1(new BytesType(1)); -static const LayoutPtr physical_string = LayoutPtr(new ListLayoutType(byte1)); - // TODO: add a BinaryArray layer in between class StringArray : public ListArray { public: - StringArray() : ListArray(), bytes_(nullptr), raw_bytes_(nullptr) {} - - StringArray(int32_t length, const std::shared_ptr& offsets, - const 
ArrayPtr& values, - int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - Init(length, offsets, values, null_count, nulls); - } - - void Init(const TypePtr& type, int32_t length, + StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - ListArray::Init(type, length, offsets, values, null_count, nulls); - - // TODO: type validation for values array - + const std::shared_ptr& nulls = nullptr) : + ListArray(type, length, offsets, values, null_count, nulls) { // For convenience bytes_ = static_cast(values.get()); raw_bytes_ = bytes_->raw_data(); } - void Init(int32_t length, const std::shared_ptr& offsets, + StringArray(int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr) { - TypePtr type(new StringType()); - Init(type, length, offsets, values, null_count, nulls); - } + const std::shared_ptr& nulls = nullptr) : + StringArray(std::make_shared(), length, offsets, values, + null_count, nulls) {} // Compute the pointer t const uint8_t* GetValue(int i, int32_t* out_length) const { @@ -125,9 +103,6 @@ class StringArray : public ListArray { }; // Array builder - - - class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 9a4777e8b98..d94396f42c5 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -15,16 +15,13 @@ // specific language governing permissions and limitations // under the License. -#include - #include #include #include +#include "gtest/gtest.h" + #include "arrow/type.h" -#include "arrow/types/integer.h" -#include "arrow/types/string.h" -#include "arrow/types/struct.h" using std::shared_ptr; using std::string; @@ -42,13 +39,13 @@ TEST(TestStructType, Basics) { TypePtr f2_type = TypePtr(new UInt8Type()); auto f2 = std::make_shared("f2", f2_type); - vector > fields = {f0, f1, f2}; + vector> fields = {f0, f1, f2}; StructType struct_type(fields); - ASSERT_TRUE(struct_type.field(0)->Equals(f0)); - ASSERT_TRUE(struct_type.field(1)->Equals(f1)); - ASSERT_TRUE(struct_type.field(2)->Equals(f2)); + ASSERT_TRUE(struct_type.child(0)->Equals(f0)); + ASSERT_TRUE(struct_type.child(1)->Equals(f1)); + ASSERT_TRUE(struct_type.child(2)->Equals(f2)); ASSERT_EQ(struct_type.ToString(), "struct"); diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 1744efce7d6..227aca632ef 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -18,11 +18,12 @@ #ifndef ARROW_TYPES_TEST_COMMON_H #define ARROW_TYPES_TEST_COMMON_H -#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/memory-pool.h" @@ -34,7 +35,7 @@ namespace arrow { class TestBuilder : public ::testing::Test { public: void SetUp() { - pool_ = GetDefaultMemoryPool(); + pool_ = default_memory_pool(); type_ = TypePtr(new UInt8Type()); builder_.reset(new UInt8Builder(pool_, type_)); builder_nn_.reset(new UInt8Builder(pool_, type_)); diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 9aff780c6a3..29cda90b972 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -30,8 +30,8 @@ namespace arrow { class Buffer; -struct DenseUnionType : public CollectionType { - 
typedef CollectionType Base; +struct DenseUnionType : public CollectionType { + typedef CollectionType Base; explicit DenseUnionType(const std::vector& child_types) : Base() { @@ -42,8 +42,8 @@ struct DenseUnionType : public CollectionType { }; -struct SparseUnionType : public CollectionType { - typedef CollectionType Base; +struct SparseUnionType : public CollectionType { + typedef CollectionType Base; explicit SparseUnionType(const std::vector& child_types) : Base() { @@ -55,28 +55,20 @@ struct SparseUnionType : public CollectionType { class UnionArray : public Array { - public: - UnionArray() : Array() {} - protected: // The data are types encoded as int16 Buffer* types_; - std::vector > children_; + std::vector> children_; }; class DenseUnionArray : public UnionArray { - public: - DenseUnionArray() : UnionArray() {} - protected: Buffer* offset_buf_; }; class SparseUnionArray : public UnionArray { - public: - SparseUnionArray() : UnionArray() {} }; } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 7506ca5b553..220bff084fd 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. -#include - #include "arrow/util/bit-util.h" +#include "gtest/gtest.h" + namespace arrow { TEST(UtilTests, TestNextPower2) { diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 5e7197f9012..1d2d1d5f9d7 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -19,7 +19,6 @@ #define ARROW_UTIL_BIT_UTIL_H #include -#include #include namespace arrow { diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index 9f1fd91432b..1d58226d84a 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -15,11 +15,12 @@ // specific language governing permissions and limitations // under the License. -#include #include #include #include +#include "gtest/gtest.h" + #include "arrow/test-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 50f4716769d..04cdcd75cd4 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -40,7 +40,7 @@ std::shared_ptr MutableBuffer::GetImmutableView() { PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) { if (pool == nullptr) { - pool = GetDefaultMemoryPool(); + pool = default_memory_pool(); } pool_ = pool; } diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 954b5f951b5..6ef07a07ada 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -15,10 +15,11 @@ // specific language governing permissions and limitations // under the License. 
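// [Editor's aside: illustrative sketch, not part of the committed patch.]
// The buffer.cc hunk above makes PoolBuffer fall back to the process-wide
// pool when the caller passes none, and the rename to default_memory_pool()
// (in the hunks below) matches the accessor's function-local static. A
// minimal, hypothetical usage sketch built only from APIs touched by this
// patch (error handling elided; exact header paths assumed):
//
//   #include "arrow/util/buffer.h"
//   #include "arrow/util/memory-pool.h"
//
//   arrow::PoolBuffer buf(nullptr);   // adopts default_memory_pool()
//   buf.Resize(64);                   // allocation is tracked by that pool
//   int64_t live = arrow::default_memory_pool()->bytes_allocated();  // >= 64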
-#include <gtest/gtest.h>
 #include <cstdint>
 #include <limits>
 
+#include "gtest/gtest.h"
+
 #include "arrow/test-util.h"
 #include "arrow/util/memory-pool.h"
 #include "arrow/util/status.h"
@@ -26,7 +27,7 @@ namespace arrow {
 
 TEST(DefaultMemoryPool, MemoryTracking) {
-  MemoryPool* pool = GetDefaultMemoryPool();
+  MemoryPool* pool = default_memory_pool();
 
   uint8_t* data;
   ASSERT_OK(pool->Allocate(100, &data));
@@ -37,7 +38,7 @@ TEST(DefaultMemoryPool, MemoryTracking) {
 }
 
 TEST(DefaultMemoryPool, OOM) {
-  MemoryPool* pool = GetDefaultMemoryPool();
+  MemoryPool* pool = default_memory_pool();
 
   uint8_t* data;
   int64_t to_alloc = std::numeric_limits<int64_t>::max();
diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc
index 5820346e5a7..0b885e9376a 100644
--- a/cpp/src/arrow/util/memory-pool.cc
+++ b/cpp/src/arrow/util/memory-pool.cc
@@ -70,9 +70,9 @@ void InternalMemoryPool::Free(uint8_t* buffer, int64_t size) {
 
 InternalMemoryPool::~InternalMemoryPool() {}
 
-MemoryPool* GetDefaultMemoryPool() {
-  static InternalMemoryPool default_memory_pool;
-  return &default_memory_pool;
+MemoryPool* default_memory_pool() {
+  static InternalMemoryPool default_memory_pool_;
+  return &default_memory_pool_;
 }
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/memory-pool.h b/cpp/src/arrow/util/memory-pool.h
index a7cb10dae17..0d2478686f5 100644
--- a/cpp/src/arrow/util/memory-pool.h
+++ b/cpp/src/arrow/util/memory-pool.h
@@ -34,7 +34,7 @@ class MemoryPool {
   virtual int64_t bytes_allocated() const = 0;
 };
 
-MemoryPool* GetDefaultMemoryPool();
+MemoryPool* default_memory_pool();
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc
index c6e113ebea5..43cb87e1a8c 100644
--- a/cpp/src/arrow/util/status.cc
+++ b/cpp/src/arrow/util/status.cc
@@ -54,6 +54,9 @@ std::string Status::CodeAsString() const {
     case StatusCode::Invalid:
       type = "Invalid";
       break;
+    case StatusCode::IOError:
+      type = "IOError";
+      break;
     case StatusCode::NotImplemented:
       type = "NotImplemented";
       break;
diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h
index 47fda40db25..b5931232dbd 100644
--- a/cpp/src/arrow/util/status.h
+++ b/cpp/src/arrow/util/status.h
@@ -63,6 +63,7 @@ enum class StatusCode: char {
   OutOfMemory = 1,
   KeyError = 2,
   Invalid = 3,
+  IOError = 4,
   NotImplemented = 10,
 };
 
@@ -97,12 +98,17 @@ class Status {
     return Status(StatusCode::Invalid, msg, -1);
   }
 
+  static Status IOError(const std::string& msg) {
+    return Status(StatusCode::IOError, msg, -1);
+  }
+
   // Returns true iff the status indicates success.
   bool ok() const { return (state_ == NULL); }
 
   bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; }
   bool IsKeyError() const { return code() == StatusCode::KeyError; }
   bool IsInvalid() const { return code() == StatusCode::Invalid; }
+  bool IsIOError() const { return code() == StatusCode::IOError; }
 
   // Return a string representation of this status suitable for printing.
   // Returns the string "OK" for success.
diff --git a/cpp/src/arrow/util/test_main.cc b/cpp/src/arrow/util/test_main.cc
index 00139f36742..adc8466fb0b 100644
--- a/cpp/src/arrow/util/test_main.cc
+++ b/cpp/src/arrow/util/test_main.cc
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
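// [Editor's aside: illustrative sketch, not part of the committed patch.]
// The status.h hunk above adds IOError as a fourth error code following the
// existing pattern: one StatusCode enumerator, one static factory, one Is*()
// predicate, and a CodeAsString() branch. A hedged example of how a caller
// would use it (OpenFile() is our invention; the Status API is the one shown
// in this patch):
//
//   arrow::Status OpenFile(const std::string& path) {
//     if (path.empty()) {
//       return arrow::Status::IOError("path must be non-empty");
//     }
//     return arrow::Status::OK();
//   }
//
//   arrow::Status s = OpenFile("");
//   // s.ok() == false, s.IsIOError() == true, and s.ToString() renders
//   // the "IOError" string added to CodeAsString() above.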
-#include +#include "gtest/gtest.h" int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 294737cc505..3d5f532b163 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -17,6 +17,7 @@ else case $arg in "gtest") F_GTEST=1 ;; "gbenchmark") F_GBENCHMARK=1 ;; + "flatbuffers") F_FLATBUFFERS=1 ;; *) echo "Unknown module: $arg"; exit 1 ;; esac done @@ -78,6 +79,14 @@ if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } fi +FLATBUFFERS_ERROR="failed for flatbuffers" +if [ -n "$F_ALL" -o -n "$F_FLATBUFFERS" ]; then + cd $TP_DIR/$FLATBUFFERS_BASEDIR + + CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX -DFLATBUFFERS_BUILD_TESTS=OFF . || { echo "cmake $FLATBUFFERS_ERROR" ; exit 1; } + make -j$PARALLEL + make install +fi echo "---------------------" echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index d22c559b3e3..d299afc1522 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -25,3 +25,8 @@ if [ ! -d ${GBENCHMARK_BASEDIR} ]; then echo "Fetching google benchmark" download_extract_and_cleanup $GBENCHMARK_URL fi + +if [ ! -d ${FLATBUFFERS_BASEDIR} ]; then + echo "Fetching flatbuffers" + download_extract_and_cleanup $FLATBUFFERS_URL +fi diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh index 9cfc7cd94b5..cb455b4eadd 100755 --- a/cpp/thirdparty/versions.sh +++ b/cpp/thirdparty/versions.sh @@ -5,3 +5,7 @@ GTEST_BASEDIR=googletest-release-$GTEST_VERSION GBENCHMARK_VERSION=1.0.0 GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION + +FLATBUFFERS_VERSION=1.3.0 +FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz" +FLATBUFFERS_BASEDIR=flatbuffers-$FLATBUFFERS_VERSION diff --git a/format/Message.fbs b/format/Message.fbs new file mode 100644 index 00000000000..3ffd2033208 --- /dev/null +++ b/format/Message.fbs @@ -0,0 +1,183 @@ +namespace apache.arrow.flatbuf; + +/// ---------------------------------------------------------------------- +/// Logical types and their metadata (if any) +/// +/// These are stored in the flatbuffer in the Type union below + +/// A Tuple in the flatbuffer metadata is the same as an Arrow Struct +/// (according to the physical memory layout). We used Tuple here as Struct is +/// a reserved word in Flatbuffers +table Tuple { +} + +table List { +} + +enum UnionMode:int { Sparse, Dense } + +table Union { + mode: UnionMode; +} + +table Bit { +} + +table Int { + bitWidth: int; // 1 to 64 + is_signed: bool; +} + +enum Precision:int {SINGLE, DOUBLE} + +table FloatingPoint { + precision: Precision; +} + +table Utf8 { +} + +table Binary { +} + +table Bool { +} + +table Decimal { + precision: int; + scale: int; +} + +table Timestamp { + timezone: string; +} + +table JSONScalar { + dense:bool=true; +} + +/// ---------------------------------------------------------------------- +/// Top-level Type value, enabling extensible type-specific metadata. 
We can +/// add new logical types to Type without breaking backwards compatibility + +union Type { + Int, + Bit, + FloatingPoint, + Binary, + Utf8, + Bool, + Decimal, + Timestamp, + List, + Tuple, + Union, + JSONScalar +} + +/// ---------------------------------------------------------------------- +/// A field represents a named column in a record / row batch or child of a +/// nested type. +/// +/// - children is only for nested Arrow arrays +/// - For primitive types, children will have length 0 +/// - nullable should default to true in general + +table Field { + // Name is not required, in i.e. a List + name: string; + nullable: bool; + type: Type; + children: [Field]; +} + +/// ---------------------------------------------------------------------- +/// A Schema describes the columns in a row batch + +table Schema { + fields: [Field]; +} + +/// ---------------------------------------------------------------------- +/// Data structures for describing a table row batch (a collection of +/// equal-length Arrow arrays) + +/// A Buffer represents a single contiguous memory segment +struct Buffer { + /// The shared memory page id where this buffer is located. Currently this is + /// not used + page: int; + + /// The relative offset into the shared memory page where the bytes for this + /// buffer starts + offset: long; + + /// The absolute length (in bytes) of the memory buffer. The memory is found + /// from offset (inclusive) to offset + length (non-inclusive). + length: long; +} + +/// Metadata about a field at some level of a nested type tree (but not +/// its children). +/// +/// For example, a List with values [[1, 2, 3], null, [4], [5, 6], null] +/// would have {length: 5, null_count: 2} for its List node, and {length: 6, +/// null_count: 0} for its Int16 node, as separate FieldNode structs +struct FieldNode { + /// The number of value slots in the Arrow array at this level of a nested + /// tree + length: int; + + /// The number of observed nulls. Fields with null_count == 0 may choose not + /// to write their physical null bitmap out as a materialized buffer, instead + /// setting the length of the null buffer to 0. + null_count: int; +} + +/// A data header describing the shared memory layout of a "record" or "row" +/// batch. Some systems call this a "row batch" internally and others a "record +/// batch". +table RecordBatch { + /// number of records / rows. The arrays in the batch should all have this + /// length + length: int; + + /// Nodes correspond to the pre-ordered flattened logical schema + nodes: [FieldNode]; + + /// Buffers correspond to the pre-ordered flattened buffer tree + /// + /// The number of buffers appended to this list depends on the schema. For + /// example, most primitive arrays will have 2 buffers, 1 for the null bitmap + /// and 1 for the values. For struct arrays, there will only be a single + /// buffer for the null bitmap + buffers: [Buffer]; +} + +/// ---------------------------------------------------------------------- +/// For sending dictionary encoding information. Any Field can be +/// dictionary-encoded, but in this case none of its children may be +/// dictionary-encoded. +/// +/// TODO(wesm): To be documented in more detail + +table DictionaryBatch { + id: long; + data: RecordBatch; +} + +/// ---------------------------------------------------------------------- +/// The root Message type + +/// This union enables us to easily send different message types without +/// redundant storage, and in the future we can easily add new message types. 
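/// [Editor's aside: illustrative note, not part of the committed file.]
/// On our reading, each IPC payload is then one Message: a header (Schema,
/// RecordBatch, or DictionaryBatch) plus a bodyLength giving the size of the
/// buffer bytes that travel with it. Reusing the FieldNode example above, a
/// RecordBatch message for a single List<Int16> column would carry roughly
///
///   nodes:   [{length: 5, null_count: 2},   // List node
///             {length: 6, null_count: 0}]   // Int16 child node
///   buffers: [list null bitmap, list offsets,
///             int16 null bitmap (may be 0-length, since null_count == 0),
///             int16 values]
///
/// in the pre-ordered, depth-first order described in the comments above.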
+union MessageHeader { + Schema, DictionaryBatch, RecordBatch +} + +table Message { + header: MessageHeader; + bodyLength: long; +} + +root_type Message; diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8d93a156bcc..9a080709beb 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -35,4 +35,6 @@ uint8, uint16, uint32, uint64, float_, double, string, list_, struct, field, - DataType, Field, Schema) + DataType, Field, Schema, schema) + +from pyarrow.array import RowBatch diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index d0d3486c032..de3c7741962 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -16,7 +16,7 @@ # under the License. from pyarrow.includes.common cimport shared_ptr -from pyarrow.includes.libarrow cimport CArray, LogicalType +from pyarrow.includes.libarrow cimport CArray from pyarrow.scalar import NA diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index bceb333c94e..c5d40ddd7a4 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -28,6 +28,9 @@ from pyarrow.error cimport check_status cimport pyarrow.scalar as scalar from pyarrow.scalar import NA +from pyarrow.schema cimport Schema +import pyarrow.schema as schema + def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() return pool.bytes_allocated() @@ -155,12 +158,12 @@ cdef class StringArray(Array): cdef dict _array_classes = { - LogicalType_NA: NullArray, - LogicalType_BOOL: BooleanArray, - LogicalType_INT64: Int64Array, - LogicalType_DOUBLE: DoubleArray, - LogicalType_LIST: ListArray, - LogicalType_STRING: StringArray, + Type_NA: NullArray, + Type_BOOL: BooleanArray, + Type_INT64: Int64Array, + Type_DOUBLE: DoubleArray, + Type_LIST: ListArray, + Type_STRING: StringArray, } cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): @@ -190,3 +193,35 @@ def from_pylist(object list_obj, DataType type=None): raise NotImplementedError return box_arrow_array(sp_array) + +#---------------------------------------------------------------------- +# Table-like data structures + +cdef class RowBatch: + """ + + """ + cdef readonly: + Schema schema + int num_rows + list arrays + + def __cinit__(self, Schema schema, int num_rows, list arrays): + self.schema = schema + self.num_rows = num_rows + self.arrays = arrays + + if len(self.schema) != len(arrays): + raise ValueError('Mismatch number of data arrays and ' + 'schema fields') + + def __len__(self): + return self.num_rows + + property num_columns: + + def __get__(self): + return len(self.arrays) + + def __getitem__(self, i): + return self.arrays[i] diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index baba112833e..e6afcbd79b6 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -21,31 +21,30 @@ from pyarrow.includes.common cimport * cdef extern from "arrow/api.h" namespace "arrow" nogil: - enum LogicalType" arrow::LogicalType::type": - LogicalType_NA" arrow::LogicalType::NA" + enum Type" arrow::Type::type": + Type_NA" arrow::Type::NA" - LogicalType_BOOL" arrow::LogicalType::BOOL" + Type_BOOL" arrow::Type::BOOL" - LogicalType_UINT8" arrow::LogicalType::UINT8" - LogicalType_INT8" arrow::LogicalType::INT8" - LogicalType_UINT16" arrow::LogicalType::UINT16" - LogicalType_INT16" arrow::LogicalType::INT16" - LogicalType_UINT32" arrow::LogicalType::UINT32" - LogicalType_INT32" arrow::LogicalType::INT32" - LogicalType_UINT64" arrow::LogicalType::UINT64" - 
LogicalType_INT64" arrow::LogicalType::INT64" + Type_UINT8" arrow::Type::UINT8" + Type_INT8" arrow::Type::INT8" + Type_UINT16" arrow::Type::UINT16" + Type_INT16" arrow::Type::INT16" + Type_UINT32" arrow::Type::UINT32" + Type_INT32" arrow::Type::INT32" + Type_UINT64" arrow::Type::UINT64" + Type_INT64" arrow::Type::INT64" - LogicalType_FLOAT" arrow::LogicalType::FLOAT" - LogicalType_DOUBLE" arrow::LogicalType::DOUBLE" + Type_FLOAT" arrow::Type::FLOAT" + Type_DOUBLE" arrow::Type::DOUBLE" - LogicalType_STRING" arrow::LogicalType::STRING" + Type_STRING" arrow::Type::STRING" - LogicalType_LIST" arrow::LogicalType::LIST" - LogicalType_STRUCT" arrow::LogicalType::STRUCT" + Type_LIST" arrow::Type::LIST" + Type_STRUCT" arrow::Type::STRUCT" cdef cppclass CDataType" arrow::DataType": - LogicalType type - c_bool nullable + Type type c_bool Equals(const CDataType* other) @@ -55,8 +54,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int64_t bytes_allocated() cdef cppclass CListType" arrow::ListType"(CDataType): - CListType(const shared_ptr[CDataType]& value_type, - c_bool nullable) + CListType(const shared_ptr[CDataType]& value_type) cdef cppclass CStringType" arrow::StringType"(CDataType): pass @@ -65,21 +63,26 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string name shared_ptr[CDataType] type - CField(const c_string& name, const shared_ptr[CDataType]& type) + c_bool nullable + + CField(const c_string& name, const shared_ptr[CDataType]& type, + c_bool nullable) cdef cppclass CStructType" arrow::StructType"(CDataType): - CStructType(const vector[shared_ptr[CField]]& fields, - c_bool nullable) + CStructType(const vector[shared_ptr[CField]]& fields) cdef cppclass CSchema" arrow::Schema": - CSchema(const shared_ptr[CField]& fields) + CSchema(const vector[shared_ptr[CField]]& fields) + const shared_ptr[CField]& field(int i) + int num_fields() + c_string ToString() cdef cppclass CArray" arrow::Array": const shared_ptr[CDataType]& type() int32_t length() int32_t null_count() - LogicalType logical_type() + Type type_enum() c_bool IsNull(int i) @@ -122,3 +125,57 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CStringArray" arrow::StringArray"(CListArray): c_string GetString(int i) + + +cdef extern from "arrow/api.h" namespace "arrow" nogil: + # We can later add more of the common status factory methods as needed + cdef CStatus CStatus_OK "Status::OK"() + + cdef cppclass CStatus "arrow::Status": + CStatus() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsNotImplemented() + c_bool IsInvalid() + + cdef cppclass Buffer: + uint8_t* data() + int64_t size() + + +cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: + cdef cppclass SchemaMessage: + int num_fields() + CStatus GetField(int i, shared_ptr[CField]* out) + CStatus GetSchema(shared_ptr[CSchema]* out) + + cdef cppclass FieldMetadata: + pass + + cdef cppclass BufferMetadata: + pass + + cdef cppclass RecordBatchMessage: + pass + + cdef cppclass DictionaryBatchMessage: + pass + + enum MessageType" arrow::ipc::Message::Type": + MessageType_SCHEMA" arrow::ipc::Message::SCHEMA" + MessageType_RECORD_BATCH" arrow::ipc::Message::RECORD_BATCH" + MessageType_DICTIONARY_BATCH" arrow::ipc::Message::DICTIONARY_BATCH" + + cdef cppclass Message: + CStatus Open(const shared_ptr[Buffer]& buf, + shared_ptr[Message]* out) + int64_t body_length() + MessageType type() + + shared_ptr[SchemaMessage] GetSchema() + shared_ptr[RecordBatchMessage] GetRecordBatch() + 
shared_ptr[DictionaryBatchMessage] GetDictionaryBatch() diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 9a0c004b768..eedfc854468 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,8 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CDataType, LogicalType, - MemoryPool) +from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -39,7 +38,7 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: c_bool IsNotImplemented() c_bool IsArrowError() - shared_ptr[CDataType] GetPrimitiveType(LogicalType type, c_bool nullable) + shared_ptr[CDataType] GetPrimitiveType(Type type) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 261a38967c4..04f013d6ca7 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -172,18 +172,18 @@ cdef class ListValue(ArrayValue): cdef dict _scalar_classes = { - LogicalType_UINT8: Int8Value, - LogicalType_UINT16: Int16Value, - LogicalType_UINT32: Int32Value, - LogicalType_UINT64: Int64Value, - LogicalType_INT8: Int8Value, - LogicalType_INT16: Int16Value, - LogicalType_INT32: Int32Value, - LogicalType_INT64: Int64Value, - LogicalType_FLOAT: FloatValue, - LogicalType_DOUBLE: DoubleValue, - LogicalType_LIST: ListValue, - LogicalType_STRING: StringValue + Type_UINT8: Int8Value, + Type_UINT16: Int16Value, + Type_UINT32: Int32Value, + Type_UINT64: Int64Value, + Type_INT8: Int8Value, + Type_INT16: Int16Value, + Type_INT32: Int32Value, + Type_INT64: Int64Value, + Type_FLOAT: FloatValue, + Type_DOUBLE: DoubleValue, + Type_LIST: ListValue, + Type_STRING: StringValue } cdef object box_arrow_scalar(DataType type, diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 07b9bd04da2..61458b765c7 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
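# [Editor's aside: illustrative sketch, not part of the committed patch.]
# The declarations below introduce the two-phase construction idiom this
# patch uses throughout pyarrow: __cinit__ stays trivial, a `cdef init(...)`
# adopts an already-built C++ object, and a Python-facing classmethod (e.g.
# Field.from_py in schema.pyx) constructs one from Python arguments. A
# hedged usage sketch against the pyarrow API as defined in this patch:
#
#   import pyarrow as A
#
#   f = A.field('foo', A.int32(), nullable=False)  # Field.from_py path
#   sch = A.schema([f])                            # Schema.from_fields path
#   assert sch[0].name == 'foo'                    # __getitem__ boxes via init()
#   assert not sch[0].nullable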
-from pyarrow.includes.common cimport shared_ptr +from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport CDataType, CField, CSchema cdef class DataType: @@ -33,9 +33,13 @@ cdef class Field: cdef readonly: DataType type + cdef init(self, const shared_ptr[CField]& field) + cdef class Schema: cdef: shared_ptr[CSchema] sp_schema CSchema* schema + cdef init(self, const vector[shared_ptr[CField]]& fields) + cdef DataType box_data_type(const shared_ptr[CDataType]& type) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index ea878720d5b..b3bf02aad76 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -54,94 +54,153 @@ cdef class DataType: cdef class Field: - def __cinit__(self, object name, DataType type): - self.type = type - self.sp_field.reset(new CField(tobytes(name), type.sp_type)) - self.field = self.sp_field.get() + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CField]& field): + self.sp_field = field + self.field = field.get() + + @classmethod + def from_py(cls, object name, DataType type, bint nullable=True): + cdef Field result = Field() + result.type = type + result.sp_field.reset(new CField(tobytes(name), type.sp_type, + nullable)) + result.field = result.sp_field.get() + + return result def __repr__(self): return 'Field({0!r}, type={1})'.format(self.name, str(self.type)) + property nullable: + + def __get__(self): + return self.field.nullable + property name: def __get__(self): return frombytes(self.field.name) +cdef class Schema: + + def __cinit__(self): + pass + + def __len__(self): + return self.schema.num_fields() + + def __getitem__(self, i): + if i < 0 or i >= len(self): + raise IndexError("{0} is out of bounds".format(i)) + + cdef Field result = Field() + result.init(self.schema.field(i)) + result.type = box_data_type(result.field.type) + + return result + + cdef init(self, const vector[shared_ptr[CField]]& fields): + self.schema = new CSchema(fields) + self.sp_schema.reset(self.schema) + + @classmethod + def from_fields(cls, fields): + cdef: + Schema result + Field field + vector[shared_ptr[CField]] c_fields + + c_fields.resize(len(fields)) + + for i in range(len(fields)): + field = fields[i] + c_fields[i] = field.sp_field + + result = Schema() + result.init(c_fields) + + return result + + def __repr__(self): + return frombytes(self.schema.ToString()) + cdef dict _type_cache = {} -cdef DataType primitive_type(LogicalType type, bint nullable=True): - if (type, nullable) in _type_cache: - return _type_cache[type, nullable] +cdef DataType primitive_type(Type type): + if type in _type_cache: + return _type_cache[type] cdef DataType out = DataType() - out.init(pyarrow.GetPrimitiveType(type, nullable)) + out.init(pyarrow.GetPrimitiveType(type)) - _type_cache[type, nullable] = out + _type_cache[type] = out return out #------------------------------------------------------------ # Type factory functions -def field(name, type): - return Field(name, type) +def field(name, type, bint nullable=True): + return Field.from_py(name, type, nullable) cdef set PRIMITIVE_TYPES = set([ - LogicalType_NA, LogicalType_BOOL, - LogicalType_UINT8, LogicalType_INT8, - LogicalType_UINT16, LogicalType_INT16, - LogicalType_UINT32, LogicalType_INT32, - LogicalType_UINT64, LogicalType_INT64, - LogicalType_FLOAT, LogicalType_DOUBLE]) + Type_NA, Type_BOOL, + Type_UINT8, Type_INT8, + Type_UINT16, Type_INT16, + Type_UINT32, Type_INT32, + Type_UINT64, Type_INT64, + Type_FLOAT, Type_DOUBLE]) def null(): - return 
primitive_type(LogicalType_NA) + return primitive_type(Type_NA) -def bool_(c_bool nullable=True): - return primitive_type(LogicalType_BOOL, nullable) +def bool_(): + return primitive_type(Type_BOOL) -def uint8(c_bool nullable=True): - return primitive_type(LogicalType_UINT8, nullable) +def uint8(): + return primitive_type(Type_UINT8) -def int8(c_bool nullable=True): - return primitive_type(LogicalType_INT8, nullable) +def int8(): + return primitive_type(Type_INT8) -def uint16(c_bool nullable=True): - return primitive_type(LogicalType_UINT16, nullable) +def uint16(): + return primitive_type(Type_UINT16) -def int16(c_bool nullable=True): - return primitive_type(LogicalType_INT16, nullable) +def int16(): + return primitive_type(Type_INT16) -def uint32(c_bool nullable=True): - return primitive_type(LogicalType_UINT32, nullable) +def uint32(): + return primitive_type(Type_UINT32) -def int32(c_bool nullable=True): - return primitive_type(LogicalType_INT32, nullable) +def int32(): + return primitive_type(Type_INT32) -def uint64(c_bool nullable=True): - return primitive_type(LogicalType_UINT64, nullable) +def uint64(): + return primitive_type(Type_UINT64) -def int64(c_bool nullable=True): - return primitive_type(LogicalType_INT64, nullable) +def int64(): + return primitive_type(Type_INT64) -def float_(c_bool nullable=True): - return primitive_type(LogicalType_FLOAT, nullable) +def float_(): + return primitive_type(Type_FLOAT) -def double(c_bool nullable=True): - return primitive_type(LogicalType_DOUBLE, nullable) +def double(): + return primitive_type(Type_DOUBLE) -def string(c_bool nullable=True): +def string(): """ UTF8 string """ - return primitive_type(LogicalType_STRING, nullable) + return primitive_type(Type_STRING) -def list_(DataType value_type, c_bool nullable=True): +def list_(DataType value_type): cdef DataType out = DataType() - out.init(shared_ptr[CDataType]( - new CListType(value_type.sp_type, nullable))) + out.init(shared_ptr[CDataType](new CListType(value_type.sp_type))) return out -def struct(fields, c_bool nullable=True): +def struct(fields): """ """ @@ -154,9 +213,11 @@ def struct(fields, c_bool nullable=True): c_fields.push_back(field.sp_field) out.init(shared_ptr[CDataType]( - new CStructType(c_fields, nullable))) + new CStructType(c_fields))) return out +def schema(fields): + return Schema.from_fields(fields) cdef DataType box_data_type(const shared_ptr[CDataType]& type): cdef DataType out = DataType() diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 0235526198f..2894ea8f844 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -18,6 +18,8 @@ from pyarrow.compat import unittest import pyarrow as arrow +A = arrow + class TestTypes(unittest.TestCase): @@ -28,15 +30,12 @@ def test_integers(self): for name in dtypes: factory = getattr(arrow, name) t = factory() - t_required = factory(False) - assert str(t) == name - assert str(t_required) == '{0} not null'.format(name) def test_list(self): value_type = arrow.int32() list_type = arrow.list_(value_type) - assert str(list_type) == 'list' + assert str(list_type) == 'list' def test_string(self): t = arrow.string() @@ -47,5 +46,26 @@ def test_field(self): f = arrow.field('foo', t) assert f.name == 'foo' + assert f.nullable assert f.type is t assert repr(f) == "Field('foo', type=string)" + + f = arrow.field('foo', t, False) + assert not f.nullable + + def test_schema(self): + fields = [ + A.field('foo', A.int32()), + A.field('bar', A.string()), + 
A.field('baz', A.list_(A.int8())) + ] + sch = A.schema(fields) + + assert len(sch) == 3 + assert sch[0].name == 'foo' + assert sch[0].type == fields[0].type + + assert repr(sch) == """\ +foo: int32 +bar: string +baz: list""" diff --git a/cpp/src/arrow/table/CMakeLists.txt b/python/pyarrow/tests/test_table.py similarity index 58% rename from cpp/src/arrow/table/CMakeLists.txt rename to python/pyarrow/tests/test_table.py index d9f00e74a37..2e24445bd0c 100644 --- a/cpp/src/arrow/table/CMakeLists.txt +++ b/python/pyarrow/tests/test_table.py @@ -15,19 +15,26 @@ # specific language governing permissions and limitations # under the License. -####################################### -# arrow_table -####################################### - -# Headers: top level -install(FILES - column.h - schema.h - table.h - DESTINATION include/arrow/table) - -ADD_ARROW_TEST(column-test) -ADD_ARROW_TEST(schema-test) -ADD_ARROW_TEST(table-test) - -ADD_ARROW_BENCHMARK(column-benchmark) +from pyarrow.compat import unittest +import pyarrow as arrow + +A = arrow + + +class TestRowBatch(unittest.TestCase): + + def test_basics(self): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + num_rows = 5 + + descr = A.schema([A.field('c0', data[0].type), + A.field('c1', data[1].type)]) + + batch = A.RowBatch(descr, num_rows, data) + + assert len(batch) == num_rows + assert batch.num_rows == num_rows + assert batch.num_columns == len(data) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index bb7905236c5..acb13acecaf 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -27,7 +27,7 @@ using arrow::ArrayBuilder; using arrow::DataType; -using arrow::LogicalType; +using arrow::Type; namespace pyarrow { @@ -356,17 +356,17 @@ class ListConverter : public TypedConverter { // Dynamic constructor for sequence converters std::shared_ptr GetConverter(const std::shared_ptr& type) { switch (type->type) { - case LogicalType::BOOL: + case Type::BOOL: return std::make_shared(); - case LogicalType::INT64: + case Type::INT64: return std::make_shared(); - case LogicalType::DOUBLE: + case Type::DOUBLE: return std::make_shared(); - case LogicalType::STRING: + case Type::STRING: return std::make_shared(); - case LogicalType::LIST: + case Type::LIST: return std::make_shared(); - case LogicalType::STRUCT: + case Type::STRUCT: default: return nullptr; break; @@ -378,7 +378,7 @@ Status ListConverter::Init(const std::shared_ptr& builder) { typed_builder_ = static_cast(builder.get()); value_converter_ = GetConverter(static_cast( - builder->type().get())->value_type); + builder->type().get())->value_type()); if (value_converter_ == nullptr) { return Status::NotImplemented("value type not implemented"); } @@ -393,8 +393,8 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); // Handle NA / NullType case - if (type->type == LogicalType::NA) { - out->reset(new arrow::Array(type, size, size)); + if (type->type == Type::NA) { + out->reset(new arrow::NullArray(type, size)); return Status::OK(); } diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index 0921fc49945..08003aabf9f 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -37,19 +37,14 @@ const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); const std::shared_ptr STRING = std::make_shared(); -#define GET_PRIMITIVE_TYPE(NAME, 
Type) \ - case LogicalType::NAME: \ - if (nullable) { \ - return NAME; \ - } else { \ - return std::make_shared(nullable); \ - } \ +#define GET_PRIMITIVE_TYPE(NAME, Class) \ + case Type::NAME: \ + return NAME; \ break; -std::shared_ptr GetPrimitiveType(LogicalType::type type, - bool nullable) { +std::shared_ptr GetPrimitiveType(Type::type type) { switch (type) { - case LogicalType::NA: + case Type::NA: return NA; GET_PRIMITIVE_TYPE(UINT8, UInt8Type); GET_PRIMITIVE_TYPE(INT8, Int8Type); diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index e41568d5881..ec42bb31d3b 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -24,7 +24,7 @@ namespace pyarrow { using arrow::DataType; -using arrow::LogicalType; +using arrow::Type; extern const std::shared_ptr NA; extern const std::shared_ptr BOOL; @@ -40,8 +40,7 @@ extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; extern const std::shared_ptr STRING; -std::shared_ptr GetPrimitiveType(LogicalType::type type, - bool nullable); +std::shared_ptr GetPrimitiveType(Type::type type); } // namespace pyarrow From a4002c6e217bf1e74895dc11ab76f0c8befbfe2a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 23 Mar 2016 10:59:31 -0700 Subject: [PATCH 040/210] ARROW-70: Add adapt 'lite' DCHECK macros from Kudu as also used in Parquet Also added a null pointer DCHECK to show that it works. cc @emkornfield Author: Wes McKinney Closes #33 from wesm/ARROW-70 and squashes the following commits: 258d77b [Wes McKinney] Add adapt 'lite' DCHECK macros from Kudu as also used in Parquet --- cpp/src/arrow/ipc/adapter.cc | 2 + cpp/src/arrow/util/logging.h | 109 +++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 cpp/src/arrow/util/logging.h diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 7cdb965f5f4..8a7d818ceee 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -32,6 +32,7 @@ #include "arrow/types/construct.h" #include "arrow/types/primitive.h" #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" #include "arrow/util/status.h" namespace arrow { @@ -41,6 +42,7 @@ namespace flatbuf = apache::arrow::flatbuf; namespace ipc { static bool IsPrimitive(const DataType* type) { + DCHECK(type != nullptr); switch (type->type) { // NA is null type or "no type", considered primitive for now case Type::NA: diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h new file mode 100644 index 00000000000..3ce4ccc1e9c --- /dev/null +++ b/cpp/src/arrow/util/logging.h @@ -0,0 +1,109 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
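// [Editor's aside: illustrative sketch, not part of the committed patch.]
// The stubbed macros below behave like glog's: in a debug build,
// DCHECK(cond) aborts the process through CerrLog when cond is false; under
// NDEBUG the whole statement collapses to a dead `while (false) NullLog()`,
// so the condition expression is discarded entirely and must not carry side
// effects. A hypothetical call site (Consume() is our invention; the macro
// is used exactly as adapter.cc uses it above):
//
//   #include "arrow/util/logging.h"
//
//   void Consume(const arrow::DataType* type) {
//     DCHECK(type != nullptr);    // fatal in debug builds, no-op under NDEBUG
//     DCHECK_GE(type->type, 0);   // comparison variants work the same way
//   }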
+
+#ifndef ARROW_UTIL_LOGGING_H
+#define ARROW_UTIL_LOGGING_H
+
+#include <iostream>
+
+namespace arrow {
+
+// Stubbed versions of macros defined in glog/logging.h, intended for
+// environments where glog headers aren't available.
+//
+// Add more as needed.
+
+// Log levels. LOG ignores them, so their values are arbitrary.
+
+#define ARROW_INFO 0
+#define ARROW_WARNING 1
+#define ARROW_ERROR 2
+#define ARROW_FATAL 3
+
+#define ARROW_LOG_INTERNAL(level) arrow::internal::CerrLog(level)
+#define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level)
+
+#define ARROW_CHECK(condition) \
+  (condition) ? 0 : ARROW_LOG(FATAL) << "Check failed: " #condition " "
+
+#ifdef NDEBUG
+#define ARROW_DFATAL ARROW_WARNING
+
+#define DCHECK(condition) while (false) arrow::internal::NullLog()
+#define DCHECK_EQ(val1, val2) while (false) arrow::internal::NullLog()
+#define DCHECK_NE(val1, val2) while (false) arrow::internal::NullLog()
+#define DCHECK_LE(val1, val2) while (false) arrow::internal::NullLog()
+#define DCHECK_LT(val1, val2) while (false) arrow::internal::NullLog()
+#define DCHECK_GE(val1, val2) while (false) arrow::internal::NullLog()
+#define DCHECK_GT(val1, val2) while (false) arrow::internal::NullLog()
+
+#else
+#define ARROW_DFATAL ARROW_FATAL
+
+#define DCHECK(condition) ARROW_CHECK(condition)
+#define DCHECK_EQ(val1, val2) ARROW_CHECK((val1) == (val2))
+#define DCHECK_NE(val1, val2) ARROW_CHECK((val1) != (val2))
+#define DCHECK_LE(val1, val2) ARROW_CHECK((val1) <= (val2))
+#define DCHECK_LT(val1, val2) ARROW_CHECK((val1) < (val2))
+#define DCHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2))
+#define DCHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2))
+
+#endif  // NDEBUG
+
+namespace internal {
+
+class NullLog {
+ public:
+  template <class T>
+  NullLog& operator<<(const T& t) {
+    return *this;
+  }
+};
+
+class CerrLog {
+ public:
+  CerrLog(int severity)  // NOLINT(runtime/explicit)
+      : severity_(severity),
+        has_logged_(false) {
+  }
+
+  ~CerrLog() {
+    if (has_logged_) {
+      std::cerr << std::endl;
+    }
+    if (severity_ == ARROW_FATAL) {
+      exit(1);
+    }
+  }
+
+  template <class T>
+  CerrLog& operator<<(const T& t) {
+    has_logged_ = true;
+    std::cerr << t;
+    return *this;
+  }
+
+ private:
+  const int severity_;
+  bool has_logged_;
+};
+
+}  // namespace internal
+
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_LOGGING_H

From fbbee3d2db5beb1ae6c623fc6392095cffdf74fe Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 24 Mar 2016 09:31:56 -0700
Subject: [PATCH 041/210] ARROW-77: [C++] Conform bitmap interpretation to
 ARROW-62; 1 for nulls, 0 for non-nulls

Author: Wes McKinney

Closes #35 from wesm/ARROW-77 and squashes the following commits:

848d1fe [Wes McKinney] Clean up variable names to indicate valid_bytes vs null_bytes and change nulls to null_bitmap to be more clear
8960c7d [Wes McKinney] Flip bit interpretation so that 1 is null and 0 is not-null.
Do not compare null slots in primitive arrays --- cpp/src/arrow/array-test.cc | 30 ++++----- cpp/src/arrow/array.cc | 10 +-- cpp/src/arrow/array.h | 16 ++--- cpp/src/arrow/builder.cc | 16 ++--- cpp/src/arrow/builder.h | 14 ++-- cpp/src/arrow/column-benchmark.cc | 6 +- cpp/src/arrow/ipc/adapter.cc | 10 +-- cpp/src/arrow/ipc/ipc-adapter-test.cc | 8 +-- cpp/src/arrow/test-util.h | 25 ++++--- cpp/src/arrow/types/construct.cc | 4 +- cpp/src/arrow/types/construct.h | 2 +- cpp/src/arrow/types/list.cc | 6 +- cpp/src/arrow/types/list.h | 23 +++---- cpp/src/arrow/types/primitive-test.cc | 55 ++++++++++------ cpp/src/arrow/types/primitive.cc | 29 +++++++-- cpp/src/arrow/types/primitive.h | 93 +++++++++++++++------------ cpp/src/arrow/types/string-test.cc | 18 +++--- cpp/src/arrow/types/string.h | 8 +-- cpp/src/arrow/util/bit-util.h | 8 ++- 19 files changed, 213 insertions(+), 168 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index eded5941e89..7c6eaf55c0d 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -44,9 +44,9 @@ class TestArray : public ::testing::Test { TEST_F(TestArray, TestNullCount) { auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); + auto null_bitmap = std::make_shared(pool_); - std::unique_ptr arr(new Int32Array(100, data, 10, nulls)); + std::unique_ptr arr(new Int32Array(100, data, 10, null_bitmap)); ASSERT_EQ(10, arr->null_count()); std::unique_ptr arr_no_nulls(new Int32Array(100, data)); @@ -61,28 +61,28 @@ TEST_F(TestArray, TestLength) { } TEST_F(TestArray, TestIsNull) { - std::vector nulls = {1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 1, 1, 0, 1, 0, 0, - 1, 0, 0, 1}; + std::vector null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 1, 1, 0, 1, 0, 0, + 1, 0, 0, 1}; int32_t null_count = 0; - for (uint8_t x : nulls) { - if (x > 0) ++null_count; + for (uint8_t x : null_bitmap) { + if (x == 0) ++null_count; } - std::shared_ptr null_buf = test::bytes_to_null_buffer(nulls.data(), - nulls.size()); + std::shared_ptr null_buf = test::bytes_to_null_buffer(null_bitmap.data(), + null_bitmap.size()); std::unique_ptr arr; - arr.reset(new Int32Array(nulls.size(), nullptr, null_count, null_buf)); + arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf)); ASSERT_EQ(null_count, arr->null_count()); ASSERT_EQ(5, null_buf->size()); - ASSERT_TRUE(arr->nulls()->Equals(*null_buf.get())); + ASSERT_TRUE(arr->null_bitmap()->Equals(*null_buf.get())); - for (size_t i = 0; i < nulls.size(); ++i) { - ASSERT_EQ(static_cast(nulls[i]), arr->IsNull(i)); + for (size_t i = 0; i < null_bitmap.size(); ++i) { + EXPECT_EQ(static_cast(null_bitmap[i]), !arr->IsNull(i)) << i; } } diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 5a5bc1069db..3736732740b 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -27,13 +27,13 @@ namespace arrow { // Base array class Array::Array(const TypePtr& type, int32_t length, int32_t null_count, - const std::shared_ptr& nulls) { + const std::shared_ptr& null_bitmap) { type_ = type; length_ = length; null_count_ = null_count; - nulls_ = nulls; - if (nulls_) { - null_bits_ = nulls_->data(); + null_bitmap_ = null_bitmap; + if (null_bitmap_) { + null_bitmap_data_ = null_bitmap_->data(); } } @@ -44,7 +44,7 @@ bool Array::EqualsExact(const Array& other) const { return false; } if (null_count_ > 0) { - return nulls_->Equals(*other.nulls_, util::bytes_for_bits(length_)); 
+ return null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } else { return true; } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 65fc0aaf583..133adf32cbd 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -32,8 +32,8 @@ class Buffer; // Immutable data array with some logical type and some length. Any memory is // owned by the respective Buffer instance (or its parents). // -// The base class is only required to have a nulls buffer if the null count is -// greater than 0 +// The base class is only required to have a null bitmap buffer if the null +// count is greater than 0 // // Any buffers used to initialize the array have their references "stolen". If // you wish to use the buffer beyond the lifetime of the array, you need to @@ -41,13 +41,13 @@ class Buffer; class Array { public: Array(const TypePtr& type, int32_t length, int32_t null_count = 0, - const std::shared_ptr& nulls = nullptr); + const std::shared_ptr& null_bitmap = nullptr); virtual ~Array() {} // Determine if a slot is null. For inner loops. Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && util::get_bit(null_bits_, i); + return null_count_ > 0 && util::bit_not_set(null_bitmap_data_, i); } int32_t length() const { return length_;} @@ -56,8 +56,8 @@ class Array { const std::shared_ptr& type() const { return type_;} Type::type type_enum() const { return type_->type;} - const std::shared_ptr& nulls() const { - return nulls_; + const std::shared_ptr& null_bitmap() const { + return null_bitmap_; } bool EqualsExact(const Array& arr) const; @@ -68,8 +68,8 @@ class Array { int32_t null_count_; int32_t length_; - std::shared_ptr nulls_; - const uint8_t* null_bits_; + std::shared_ptr null_bitmap_; + const uint8_t* null_bitmap_data_; private: Array() {} diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index ba70add1551..6a62dc3b0e0 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -28,20 +28,20 @@ namespace arrow { Status ArrayBuilder::Init(int32_t capacity) { capacity_ = capacity; int32_t to_alloc = util::ceil_byte(capacity) / 8; - nulls_ = std::make_shared(pool_); - RETURN_NOT_OK(nulls_->Resize(to_alloc)); - null_bits_ = nulls_->mutable_data(); - memset(null_bits_, 0, to_alloc); + null_bitmap_ = std::make_shared(pool_); + RETURN_NOT_OK(null_bitmap_->Resize(to_alloc)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, to_alloc); return Status::OK(); } Status ArrayBuilder::Resize(int32_t new_bits) { int32_t new_bytes = util::ceil_byte(new_bits) / 8; - int32_t old_bytes = nulls_->size(); - RETURN_NOT_OK(nulls_->Resize(new_bytes)); - null_bits_ = nulls_->mutable_data(); + int32_t old_bytes = null_bitmap_->size(); + RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); + null_bitmap_data_ = null_bitmap_->mutable_data(); if (old_bytes < new_bytes) { - memset(null_bits_ + old_bytes, 0, new_bytes - old_bytes); + memset(null_bitmap_data_ + old_bytes, 0, new_bytes - old_bytes); } return Status::OK(); } diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index d5d1fdf95af..308e54c80d7 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -40,9 +40,9 @@ class ArrayBuilder { explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), type_(type), - nulls_(nullptr), + null_bitmap_(nullptr), null_count_(0), - null_bits_(nullptr), + null_bitmap_data_(nullptr), length_(0), capacity_(0) {} @@ -66,7 +66,7 @@ class ArrayBuilder { // initialized independently 
Status Init(int32_t capacity); - // Resizes the nulls array + // Resizes the null_bitmap array Status Resize(int32_t new_bits); // For cases where raw data was memcpy'd into the internal buffers, allows us @@ -74,7 +74,7 @@ class ArrayBuilder { // this function responsibly. Status Advance(int32_t elements); - const std::shared_ptr& nulls() const { return nulls_;} + const std::shared_ptr& null_bitmap() const { return null_bitmap_;} // Creates new array object to hold the contents of the builder and transfers // ownership of the data @@ -89,10 +89,10 @@ class ArrayBuilder { std::shared_ptr type_; - // When nulls are first appended to the builder, the null bitmap is allocated - std::shared_ptr nulls_; + // When null_bitmap are first appended to the builder, the null bitmap is allocated + std::shared_ptr null_bitmap_; int32_t null_count_; - uint8_t* null_bits_; + uint8_t* null_bitmap_data_; // Array length, so far. Also, the index of the next element to be added int32_t length_; diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index 69ee52c3e09..335d581782a 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -28,10 +28,10 @@ namespace { std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { auto pool = default_memory_pool(); auto data = std::make_shared(pool); - auto nulls = std::make_shared(pool); + auto null_bitmap = std::make_shared(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); - nulls->Resize(util::bytes_for_bits(length)); - return std::make_shared(length, data, 10, nulls); + null_bitmap->Resize(util::bytes_for_bits(length)); + return std::make_shared(length, data, 10, null_bitmap); } } // anonymous namespace diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 8a7d818ceee..c79e8469530 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -75,7 +75,7 @@ Status VisitArray(const Array* arr, std::vector* field_nodes flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count())); if (prim_arr->null_count() > 0) { - buffers->push_back(prim_arr->nulls()); + buffers->push_back(prim_arr->null_bitmap()); } else { // Push a dummy zero-length buffer, not to be copied buffers->push_back(std::make_shared(nullptr, 0)); @@ -230,13 +230,13 @@ class RowBatchReader::Impl { FieldMetadata field_meta = metadata_->field(field_index_++); if (IsPrimitive(type.get())) { - std::shared_ptr nulls; + std::shared_ptr null_bitmap; std::shared_ptr data; if (field_meta.null_count == 0) { - nulls = nullptr; + null_bitmap = nullptr; ++buffer_index_; } else { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &nulls)); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); } if (field_meta.length > 0) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); @@ -244,7 +244,7 @@ class RowBatchReader::Impl { data.reset(new Buffer(nullptr, 0)); } return MakePrimitiveArray(type, field_meta.length, data, - field_meta.null_count, nulls, out); + field_meta.null_count, null_bitmap, out); } else { return Status::NotImplemented("Non-primitive types not complete yet"); } diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index d75998f0a5d..79b4d710d28 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -77,14 +77,14 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), reinterpret_cast(data->mutable_data())); - auto nulls = 
std::make_shared(pool_); + auto null_bitmap = std::make_shared(pool_); int null_bytes = util::bytes_for_bits(length); - ASSERT_OK(nulls->Resize(null_bytes)); - test::random_bytes(null_bytes, 0, nulls->mutable_data()); + ASSERT_OK(null_bitmap->Resize(null_bytes)); + test::random_bytes(null_bytes, 0, null_bitmap->mutable_data()); auto a0 = std::make_shared(length, data); auto a1 = std::make_shared(length, data, - test::bitmap_popcount(nulls->data(), length), nulls); + test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); RowBatch batch(schema, length, {a0, a1}); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index a9fb2a7644a..ea3ce5f7f53 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -72,10 +72,10 @@ class TestBase : public ::testing::Test { template std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { auto data = std::make_shared(pool_); - auto nulls = std::make_shared(pool_); + auto null_bitmap = std::make_shared(pool_); EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(nulls->Resize(util::bytes_for_bits(length))); - return std::make_shared(length, data, 10, nulls); + EXPECT_OK(null_bitmap->Resize(util::bytes_for_bits(length))); + return std::make_shared(length, data, 10, null_bitmap); } protected: @@ -104,17 +104,22 @@ std::shared_ptr to_buffer(const std::vector& values) { values.size() * sizeof(T)); } -void random_nulls(int64_t n, double pct_null, std::vector* nulls) { +void random_null_bitmap(int64_t n, double pct_null, std::vector* null_bitmap) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - nulls->push_back(static_cast(rng.NextDoubleFraction() > pct_null)); + if (rng.NextDoubleFraction() > pct_null) { + null_bitmap->push_back(1); + } else { + // null + null_bitmap->push_back(0); + } } } -void random_nulls(int64_t n, double pct_null, std::vector* nulls) { +void random_null_bitmap(int64_t n, double pct_null, std::vector* null_bitmap) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - nulls->push_back(rng.NextDoubleFraction() > pct_null); + null_bitmap->push_back(rng.NextDoubleFraction() > pct_null); } } @@ -145,10 +150,10 @@ static inline int bitmap_popcount(const uint8_t* data, int length) { return count; } -static inline int null_count(const std::vector& nulls) { +static inline int null_count(const std::vector& valid_bytes) { int result = 0; - for (size_t i = 0; i < nulls.size(); ++i) { - if (nulls[i] > 0) { + for (size_t i = 0; i < valid_bytes.size(); ++i) { + if (valid_bytes[i] == 0) { ++result; } } diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 290decd81ff..df2317c340b 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -75,12 +75,12 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ case Type::ENUM: \ - out->reset(new ArrayType(type, length, data, null_count, nulls)); \ + out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ return Status::OK(); Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& nulls, + int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 089c484c58b..228faeccc4e 100644 --- 
a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -35,7 +35,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& nulls, + int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); } // namespace arrow diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 670ee4da116..d64c06d90c1 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -27,13 +27,13 @@ bool ListArray::EqualsExact(const ListArray& other) const { bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, length_ + 1); - bool equal_nulls = true; + bool equal_null_bitmap = true; if (null_count_ > 0) { - equal_nulls = nulls_->Equals(*other.nulls_, + equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } - if (!(equal_offsets && equal_nulls)) { + if (!(equal_offsets && equal_null_bitmap)) { return false; } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 141f762458b..72e20e943c3 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -39,8 +39,8 @@ class ListArray : public Array { ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, - std::shared_ptr nulls = nullptr) : - Array(type, length, null_count, nulls) { + std::shared_ptr null_bitmap = nullptr) : + Array(type, length, null_count, null_bitmap) { offset_buf_ = offsets; offsets_ = offsets == nullptr? nullptr : reinterpret_cast(offset_buf_->data()); @@ -109,17 +109,17 @@ class ListBuilder : public Int32Builder { // Vector append // - // If passed, null_bytes is of equal length to values, and any nonzero byte + // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot - Status Append(value_type* values, int32_t length, uint8_t* null_bytes = nullptr) { + Status Append(value_type* values, int32_t length, uint8_t* valid_bytes = nullptr) { if (length_ + length > capacity_) { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } memcpy(raw_buffer() + length_, values, length * elsize_); - if (null_bytes != nullptr) { - AppendNulls(null_bytes, length); + if (valid_bytes != nullptr) { + AppendNulls(valid_bytes, length); } length_ += length; @@ -136,9 +136,9 @@ class ListBuilder : public Int32Builder { } auto result = std::make_shared(type_, length_, values_, items, - null_count_, nulls_); + null_count_, null_bitmap_); - values_ = nulls_ = nullptr; + values_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; return result; @@ -159,16 +159,13 @@ class ListBuilder : public Int32Builder { } if (is_null) { ++null_count_; - util::set_bit(null_bits_, length_); + } else { + util::set_bit(null_bitmap_data_, length_); } raw_buffer()[length_++] = value_builder_->length(); return Status::OK(); } - // Status Append(int32_t* offsets, int length, uint8_t* null_bytes) { - // return Int32Builder::Append(offsets, length, null_bytes); - // } - Status AppendNull() { return Append(true); } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 7eae8cda8c4..10ba113c591 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -71,10 +71,10 @@ PRIMITIVE_TEST(BooleanType, BOOL, "bool"); TEST_F(TestBuilder, 
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index 7eae8cda8c4..10ba113c591 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -71,10 +71,10 @@ PRIMITIVE_TEST(BooleanType, BOOL, "bool");

 TEST_F(TestBuilder, TestResize) {
   builder_->Init(10);
-  ASSERT_EQ(2, builder_->nulls()->size());
+  ASSERT_EQ(2, builder_->null_bitmap()->size());

   builder_->Resize(30);
-  ASSERT_EQ(4, builder_->nulls()->size());
+  ASSERT_EQ(4, builder_->null_bitmap()->size());
 }

 template <typename Attrs>
@@ -99,7 +99,7 @@ class TestPrimitiveBuilder : public TestBuilder {

   void RandomData(int N, double pct_null = 0.1) {
     Attrs::draw(N, &draws_);
-    test::random_nulls(N, pct_null, &nulls_);
+    test::random_null_bitmap(N, pct_null, &valid_bytes_);
   }

   void CheckNullable() {
@@ -109,10 +109,11 @@ class TestPrimitiveBuilder : public TestBuilder {
         reinterpret_cast<uint8_t*>(draws_.data()), size * sizeof(T));

-    auto ex_nulls = test::bytes_to_null_buffer(nulls_.data(), size);
-    int32_t ex_null_count = test::null_count(nulls_);
+    auto ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_.data(), size);
+    int32_t ex_null_count = test::null_count(valid_bytes_);

-    auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count, ex_nulls);
+    auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count,
+        ex_null_bitmap);

     std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(
         builder_->Finish());
@@ -123,8 +124,8 @@
     ASSERT_EQ(0, builder_->null_count());
     ASSERT_EQ(nullptr, builder_->buffer());

-    ASSERT_TRUE(result->EqualsExact(*expected.get()));
     ASSERT_EQ(ex_null_count, result->null_count());
+    ASSERT_TRUE(result->EqualsExact(*expected.get()));
   }

   void CheckNonNullable() {
@@ -154,7 +155,7 @@
   shared_ptr<BuilderType> builder_nn_;

   vector<T> draws_;
-  vector<uint8_t> nulls_;
+  vector<uint8_t> valid_bytes_;
 };

 #define PTYPE_DECL(CapType, c_type) \
@@ -210,7 +211,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) {
 }

 TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
-  int size = 10000;
+  int size = 1000;
   for (int i = 0; i < size; ++i) {
     ASSERT_OK(this->builder_->AppendNull());
   }
@@ -218,17 +219,17 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) {
   auto result = this->builder_->Finish();

   for (int i = 0; i < size; ++i) {
-    ASSERT_TRUE(result->IsNull(i));
+    ASSERT_TRUE(result->IsNull(i)) << i;
   }
 }

 TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) {
   DECL_T();

-  int size = 10000;
+  int size = 1000;

   vector<T>& draws = this->draws_;
-  vector<uint8_t>& nulls = this->nulls_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;

   int64_t memory_before = this->pool_->bytes_allocated();

   this->RandomData(size);

@@ -236,7 +237,11 @@
   int i;
   for (i = 0; i < size; ++i) {
-    ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0));
+    if (valid_bytes[i] > 0) {
+      ASSERT_OK(this->builder_->Append(draws[i]));
+    } else {
+      ASSERT_OK(this->builder_->AppendNull());
+    }
   }

   do {
@@ -249,17 +254,21 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
   DECL_T();

-  int size = 10000;
+  const int size = 10000;

   vector<T>& draws = this->draws_;
-  vector<uint8_t>& nulls = this->nulls_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;

   this->RandomData(size);

   int i;
   // Append the first 1000
   for (i = 0; i < 1000; ++i) {
-    ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0));
+    if (valid_bytes[i] > 0) {
+      ASSERT_OK(this->builder_->Append(draws[i]));
+    } else {
+      ASSERT_OK(this->builder_->AppendNull());
+    }
     ASSERT_OK(this->builder_nn_->Append(draws[i]));
   }
@@ -271,7 +280,11 @@
   // Append the next 9000
   for (i = 1000; i < size; ++i) {
-    ASSERT_OK(this->builder_->Append(draws[i], nulls[i] > 0));
+    if (valid_bytes[i] > 0) {
+      ASSERT_OK(this->builder_->Append(draws[i]));
+    } else {
+      ASSERT_OK(this->builder_->AppendNull());
+    }
     ASSERT_OK(this->builder_nn_->Append(draws[i]));
   }
@@ -293,12 +306,12 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) {
   this->RandomData(size);

   vector<T>& draws = this->draws_;
-  vector<uint8_t>& nulls = this->nulls_;
+  vector<uint8_t>& valid_bytes = this->valid_bytes_;

   // first slug
   int K = 1000;

-  ASSERT_OK(this->builder_->Append(draws.data(), K, nulls.data()));
+  ASSERT_OK(this->builder_->Append(draws.data(), K, valid_bytes.data()));
   ASSERT_OK(this->builder_nn_->Append(draws.data(), K));

   ASSERT_EQ(1000, this->builder_->length());
@@ -308,7 +321,7 @@
   ASSERT_EQ(1024, this->builder_nn_->capacity());

   // Append the next 9000
-  ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, nulls.data() + K));
+  ASSERT_OK(this->builder_->Append(draws.data() + K, size - K, valid_bytes.data() + K));
   ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - K));

   ASSERT_EQ(size, this->builder_->length());
@@ -338,7 +351,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) {
     ASSERT_EQ(cap, this->builder_->capacity());
     ASSERT_EQ(cap * sizeof(T), this->builder_->buffer()->size());
-    ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->nulls()->size());
+    ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->null_bitmap()->size());
 }

 TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 32b8bfa7f1b..ecd5d68ff45 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -26,13 +26,14 @@ namespace arrow {
 // ----------------------------------------------------------------------
 // Primitive array base

-PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length,
+PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, int value_size,
     const std::shared_ptr<Buffer>& data, int32_t null_count,
-    const std::shared_ptr<Buffer>& nulls) :
-    Array(type, length, null_count, nulls) {
+    const std::shared_ptr<Buffer>& null_bitmap) :
+    Array(type, length, null_count, null_bitmap) {
   data_ = data;
   raw_data_ = data == nullptr? nullptr : data_->data();
+  value_size_ = value_size;
 }
 bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const {
@@ -41,12 +42,26 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const {
     return false;
   }

-  bool equal_data = data_->Equals(*other.data_, length_);
   if (null_count_ > 0) {
-    return equal_data &&
-      nulls_->Equals(*other.nulls_, util::ceil_byte(length_) / 8);
+    bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_,
+        util::ceil_byte(length_) / 8);
+    if (!equal_bitmap) {
+      return false;
+    }
+
+    const uint8_t* this_data = raw_data_;
+    const uint8_t* other_data = other.raw_data_;
+
+    for (int i = 0; i < length_; ++i) {
+      if (!IsNull(i) && memcmp(this_data, other_data, value_size_)) {
+        return false;
+      }
+      this_data += value_size_;
+      other_data += value_size_;
+    }
+    return true;
   } else {
-    return equal_data;
+    return data_->Equals(*other.data_, length_);
   }
 }
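Aside, illustrative only: the rewritten EqualsExact is needed because the bytes underneath null slots are unspecified, so two logically equal arrays need not have byte-identical value buffers once nulls exist. A sketch of the idea with a hypothetical helper (not the patched method):

```
#include <cstdint>
#include <cstring>

// Compare fixed-width value buffers slot by slot, skipping slots the caller
// reports as null; a whole-buffer memcmp would be too strict here.
static bool values_equal(const uint8_t* a, const uint8_t* b, int length,
                         int value_size, bool (*is_null)(int)) {
  for (int i = 0; i < length; ++i) {
    if (!is_null(i) &&
        std::memcmp(a + i * value_size, b + i * value_size, value_size) != 0) {
      return false;
    }
  }
  return true;
}
```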
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index e01027cf55c..4eaff433229 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -37,10 +37,10 @@ class MemoryPool;

 // Base class for fixed-size logical types
 class PrimitiveArray : public Array {
  public:
-  PrimitiveArray(const TypePtr& type, int32_t length,
+  PrimitiveArray(const TypePtr& type, int32_t length, int value_size,
       const std::shared_ptr<Buffer>& data,
       int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& nulls = nullptr);
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
   virtual ~PrimitiveArray() {}

   const std::shared_ptr<Buffer>& data() const { return data_;}
@@ -51,31 +51,38 @@ class PrimitiveArray : public Array {
  protected:
   std::shared_ptr<Buffer> data_;
   const uint8_t* raw_data_;
+  int value_size_;
 };

-#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T)                    \
-class NAME : public PrimitiveArray {                              \
- public:                                                          \
-  using value_type = T;                                           \
-  using PrimitiveArray::PrimitiveArray;                           \
-  NAME(int32_t length, const std::shared_ptr<Buffer>& data,       \
-      int32_t null_count = 0,                                     \
-      const std::shared_ptr<Buffer>& nulls = nullptr) :           \
-      PrimitiveArray(std::make_shared<TypeClass>(), length, data, \
-          null_count, nulls) {}                                   \
-                                                                  \
-  bool EqualsExact(const NAME& other) const {                     \
-    return PrimitiveArray::EqualsExact(                           \
-        *static_cast<const PrimitiveArray*>(&other));             \
-  }                                                               \
-                                                                  \
-  const T* raw_data() const {                                     \
-    return reinterpret_cast<const T*>(raw_data_);                 \
-  }                                                               \
-                                                                  \
-  T Value(int i) const {                                          \
-    return raw_data()[i];                                         \
-  }                                                               \
+#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T)                    \
+class NAME : public PrimitiveArray {                              \
+ public:                                                          \
+  using value_type = T;                                           \
+  NAME(const TypePtr& type, int32_t length,                       \
+      const std::shared_ptr<Buffer>& data,                        \
+      int32_t null_count = 0,                                     \
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr) :     \
+      PrimitiveArray(std::make_shared<TypeClass>(), length,       \
+          sizeof(T), data, null_count, null_bitmap) {}            \
+                                                                  \
+  NAME(int32_t length, const std::shared_ptr<Buffer>& data,       \
+      int32_t null_count = 0,                                     \
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr) :     \
+      PrimitiveArray(std::make_shared<TypeClass>(), length,       \
+          sizeof(T), data, null_count, null_bitmap) {}            \
+                                                                  \
+  bool EqualsExact(const NAME& other) const {                     \
+    return PrimitiveArray::EqualsExact(                           \
+        *static_cast<const PrimitiveArray*>(&other));             \
+  }                                                               \
+                                                                  \
+  const T* raw_data() const {                                     \
+    return reinterpret_cast<const T*>(raw_data_);                 \
+  }                                                               \
+                                                                  \
+  T Value(int i) const {                                          \
+    return raw_data()[i];                                         \
+  }                                                               \
 };

 NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t);
@@ -137,25 +144,22 @@ class PrimitiveBuilder : public ArrayBuilder {
   }

   // Scalar append
-  Status Append(value_type val, bool is_null = false) {
+  Status Append(value_type val) {
     if (length_ == capacity_) {
       // If the capacity was not already a multiple of 2, do so here
       RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
     }
-    if (is_null) {
-      ++null_count_;
-      util::set_bit(null_bits_, length_);
-    }
+    util::set_bit(null_bitmap_data_, length_);
     raw_buffer()[length_++] = val;
     return Status::OK();
   }

   // Vector append
   //
-  // If passed, null_bytes is of equal length to values, and any nonzero byte
+  // If passed, valid_bytes is of equal length to values, and any zero byte
   // will be considered as a null for that slot
   Status Append(const value_type* values, int32_t length,
-      const uint8_t* null_bytes = nullptr) {
+      const uint8_t* valid_bytes = nullptr) {
     if (length_ + length > capacity_) {
       int32_t new_capacity = util::next_power2(length_ + length);
       RETURN_NOT_OK(Resize(new_capacity));
     }
@@ -164,21 +168,26 @@
       memcpy(raw_buffer() + length_, values, length * elsize_);
     }

-    if (null_bytes != nullptr) {
-      AppendNulls(null_bytes, length);
+    if (valid_bytes != nullptr) {
+      AppendNulls(valid_bytes, length);
+    } else {
+      for (int i = 0; i < length; ++i) {
+        util::set_bit(null_bitmap_data_, length_ + i);
+      }
     }

     length_ += length;
     return Status::OK();
   }

-  // Write nulls as uint8_t* into pre-allocated memory
-  void AppendNulls(const uint8_t* null_bytes, int32_t length) {
-    // If null_bytes is all not null, then none of the values are null
+  // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory
+  void AppendNulls(const uint8_t* valid_bytes, int32_t length) {
+    // If valid_bytes is all not null, then none of the values are null
     for (int i = 0; i < length; ++i) {
-      if (static_cast<bool>(null_bytes[i])) {
+      if (valid_bytes[i] == 0) {
         ++null_count_;
-        util::set_bit(null_bits_, length_ + i);
+      } else {
+        util::set_bit(null_bitmap_data_, length_ + i);
       }
     }
   }
@@ -189,15 +198,15 @@
       RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
     }
     ++null_count_;
-    util::set_bit(null_bits_, length_++);
+    ++length_;
     return Status::OK();
   }

   std::shared_ptr<Array> Finish() override {
     std::shared_ptr<PrimitiveArray> result = std::make_shared<ArrayType>(
-        type_, length_, values_, null_count_, nulls_);
+        type_, length_, values_, null_count_, null_bitmap_);

-    values_ = nulls_ = nullptr;
+    values_ = null_bitmap_ = nullptr;
     capacity_ = length_ = null_count_ = 0;
     return result;
   }
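Aside, illustrative only: end to end, the PrimitiveBuilder convention above is that Append() sets the slot's validity bit while AppendNull() bumps null_count and leaves the bit unset. A self-contained toy model of that behavior (ToyInt32Builder is hypothetical, not an Arrow class):

```
#include <cassert>
#include <cstdint>
#include <vector>

struct ToyInt32Builder {
  std::vector<int32_t> values;
  std::vector<uint8_t> bitmap;  // one bit per slot, zero-initialized
  int32_t length = 0;
  int32_t null_count = 0;

  void EnsureBitmap() {
    if (length / 8 >= static_cast<int32_t>(bitmap.size())) { bitmap.push_back(0); }
  }
  void Append(int32_t v) {
    EnsureBitmap();
    bitmap[length / 8] |= (1 << (length % 8));  // valid: set the bit
    values.push_back(v);
    ++length;
  }
  void AppendNull() {
    EnsureBitmap();
    ++null_count;         // null: leave the bit unset
    values.push_back(0);  // slot contents are unspecified; zero here
    ++length;
  }
};

int main() {
  ToyInt32Builder b;
  b.Append(7);
  b.AppendNull();
  b.Append(9);
  assert(b.null_count == 1);
  assert((b.bitmap[0] & 0x5) == 0x5);  // slots 0 and 2 valid, slot 1 null
  return 0;
}
```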
diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc
index 7dc3d682cdc..b329b4f0ef7 100644
--- a/cpp/src/arrow/types/string-test.cc
+++ b/cpp/src/arrow/types/string-test.cc
@@ -77,7 +77,7 @@ class TestStringContainer : public ::testing::Test {
   void SetUp() {
     chars_ = {'a', 'b', 'b', 'c', 'c', 'c'};
     offsets_ = {0, 1, 1, 1, 3, 6};
-    nulls_ = {0, 0, 1, 0, 0};
+    valid_bytes_ = {1, 1, 0, 1, 1};
     expected_ = {"a", "", "", "bb", "ccc"};

     MakeArray();
@@ -92,23 +92,23 @@ class TestStringContainer : public ::testing::Test {

     offsets_buf_ = test::to_buffer(offsets_);

-    nulls_buf_ = test::bytes_to_null_buffer(nulls_.data(), nulls_.size());
-    null_count_ = test::null_count(nulls_);
+    null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_.data(), valid_bytes_.size());
+    null_count_ = test::null_count(valid_bytes_);

     strings_ = std::make_shared<StringArray>(length_, offsets_buf_, values_,
-        null_count_, nulls_buf_);
+        null_count_, null_bitmap_);
   }

  protected:
   std::vector<int32_t> offsets_;
   std::vector<char> chars_;
-  std::vector<uint8_t> nulls_;
+  std::vector<uint8_t> valid_bytes_;

   std::vector<std::string> expected_;

   std::shared_ptr<Buffer> value_buf_;
   std::shared_ptr<Buffer> offsets_buf_;
-  std::shared_ptr<Buffer> nulls_buf_;
+  std::shared_ptr<Buffer> null_bitmap_;

   int null_count_;
   int length_;
@@ -143,12 +143,12 @@ TEST_F(TestStringContainer, TestListFunctions) {

 TEST_F(TestStringContainer, TestDestructor) {
   auto arr = std::make_shared<StringArray>(length_, offsets_buf_, values_,
-      null_count_, nulls_buf_);
+      null_count_, null_bitmap_);
 }

 TEST_F(TestStringContainer, TestGetString) {
   for (size_t i = 0; i < expected_.size(); ++i) {
-    if (nulls_[i]) {
+    if (valid_bytes_[i] == 0) {
       ASSERT_TRUE(strings_->IsNull(i));
     } else {
       ASSERT_EQ(expected_[i], strings_->GetString(i));
@@ -197,7 +197,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) {
   Done();

   ASSERT_EQ(reps * N, result_->length());
-  ASSERT_EQ(reps * test::null_count(is_null), result_->null_count());
+  ASSERT_EQ(reps, result_->null_count());
   ASSERT_EQ(reps * 6, result_->values()->length());

   int32_t length;
diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h
index 2b3fba5ce09..fda722ba6de 100644
--- a/cpp/src/arrow/types/string.h
+++ b/cpp/src/arrow/types/string.h
@@ -68,8 +68,8 @@ class StringArray : public ListArray {
       const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values,
       int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& nulls = nullptr) :
-      ListArray(type, length, offsets, values, null_count, nulls) {
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr) :
+      ListArray(type, length, offsets, values, null_count, null_bitmap) {
     // For convenience
     bytes_ = static_cast<UInt8Array*>(values.get());
     raw_bytes_ = bytes_->raw_data();
@@ -79,9 +79,9 @@ class StringArray : public ListArray {
       const std::shared_ptr<Buffer>& offsets, const ArrayPtr& values,
       int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& nulls = nullptr) :
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr) :
       StringArray(std::make_shared<StringType>(), length, offsets, values,
-          null_count, nulls) {}
+          null_count, null_bitmap) {}

   // Compute the pointer t
   const uint8_t* GetValue(int i, int32_t* out_length) const {
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 1d2d1d5f9d7..08222d50894 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -40,8 +40,14 @@ static inline int64_t ceil_2bytes(int64_t size) {
   return (size + 15) & ~15;
 }

+static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128};
+
 static inline bool get_bit(const uint8_t* bits, int i) {
-  return bits[i / 8] & (1 << (i % 8));
+  return bits[i / 8] & BITMASK[i % 8];
+}
+
+static inline bool bit_not_set(const uint8_t* bits, int i) {
+  return (bits[i / 8] & BITMASK[i % 8]) == 0;
 }

 static inline void set_bit(uint8_t* bits, int i) {
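Aside, illustrative only: the BITMASK table above just precomputes 1 << (i % 8) for LSB bit numbering. A quick check of the two accessors, using the same code as the patch wrapped in a runnable main:

```
#include <cassert>
#include <cstdint>

static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128};

static inline bool get_bit(const uint8_t* bits, int i) {
  return bits[i / 8] & BITMASK[i % 8];
}

static inline bool bit_not_set(const uint8_t* bits, int i) {
  return (bits[i / 8] & BITMASK[i % 8]) == 0;
}

int main() {
  uint8_t bits[] = {0x2B};  // 0b00101011: bits 0, 1, 3, 5 set
  assert(get_bit(bits, 0) && get_bit(bits, 1) && get_bit(bits, 3) && get_bit(bits, 5));
  assert(bit_not_set(bits, 2) && bit_not_set(bits, 4) && bit_not_set(bits, 7));
  return 0;
}
```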
From c06b7654bccfe8c461869a6e5922668896c27c45 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 24 Mar 2016 19:19:22 -0700
Subject: [PATCH 042/210] ARROW-62: Clarify null bitmap interpretation,
 indicate bit-endianness, add null count, remove non-nullable physical
 distinction

As the initial scribe for the Arrow format, I made a mistake in what the null
bits mean (1 for not-null, 0 for null). I also addressed ARROW-56
(bit-numbering) here. Database systems are split on this subject. PostgreSQL
for example does it this way:
http://www.postgresql.org/docs/9.5/static/storage-page-layout.html

> In this list of bits, a 1 bit indicates not-null, a 0 bit is a null. When
> the bitmap is not present, all columns are assumed not-null.

Since the Drill implementation predates the Arrow project, I think it's safe
to go with this.

This patch also includes ARROW-76, which adds a "null count" to the memory
layout indicating the actual number of nulls in an array. This also strikes
the "non-nullable" distinction from the memory layout, as there is no semantic
difference between arrays with null count 0 and a non-nullable array. Instead,
users may choose to set `nullable=false` in the schema metadata and verify
that Arrow memory conforms to the schema.

Author: Wes McKinney

Closes #34 from wesm/ARROW-62 and squashes the following commits:

8c92926 [Wes McKinney] Add to README about what the format documents are
1f6fe03 [Wes McKinney] Account for null count and non-nullable removal from ARROW-76
648fd47 [Wes McKinney] Indicate that bitmaps should be a multiple of 8 bytes
4333d82 [Wes McKinney] Use 'null bitmap' similar to PostgreSQL documentation
dac77d4 [Wes McKinney] Revise format document language re: null bitmaps per feedback
f7a3898 [Wes McKinney] Revise format to indicate LSB bit numbering and 0/1 null/not-null distinction
---
 format/Layout.md                          |  77 +++++++++++++++-------
 format/Message.fbs                        |  10 +--
 format/README.md                          |  17 +++++
 format/diagrams/layout-list-of-struct.png | Bin 60600 -> 54122 bytes
 4 files changed, 74 insertions(+), 30 deletions(-)

diff --git a/format/Layout.md b/format/Layout.md
index c393163bf89..2d46ece606e 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -42,7 +42,7 @@ Base requirements
 * Capable of representing fully-materialized and decoded / decompressed Parquet
   data
 * All leaf nodes (primitive value arrays) use contiguous memory regions
-* Each relative type can be nullable or non-nullable
+* Any relative type can have null slots
 * Arrays are immutable once created. Implementations can provide APIs to mutate
   an array, but applying mutations will require a new array data structure to
   be built.
@@ -56,7 +56,7 @@ Base requirements
 * To describe relative types (physical value types and a preliminary set of
   nested types) sufficient for an unambiguous implementation
 * Memory layout and random access patterns for each relative type
-* Null representation for nullable types
+* Null value representation

 ## Non-goals (for this document)
@@ -79,28 +79,55 @@ Base requirements

 Any array has a known and fixed length, stored as a 32-bit signed integer, so a
 maximum of 2^31 - 1 elements. We choose a signed int32 for a couple reasons:

-* Enhance compatibility with Java and client languages which may have varying quality of support for unsigned integers.
+* Enhance compatibility with Java and client languages which may have varying
+  quality of support for unsigned integers.
 * To encourage developers to compose smaller arrays (each of which contains
   contiguous memory in its leaf nodes) to create larger array structures
   possibly exceeding 2^31 - 1 elements, as opposed to allocating very large
   contiguous memory blocks.

-## Nullable and non-nullable arrays
+## Null count

-Any relative type can be nullable or non-nullable.
+The number of null value slots is a property of the physical array and
+considered part of the data structure. The null count is stored as a 32-bit
+signed integer, as it may be as large as the array length.

-Nullable arrays have a contiguous memory buffer, known as the null bitmask,
-whose length is large enough to have 1 bit for each array slot. Whether any
-array slot is null is encoded in the respective bits of this bitmask, i.e.:
+## Null bitmaps
+
+Any relative type can have null value slots, whether primitive or nested type.
+
+An array with nulls must have a contiguous memory buffer, known as the null (or
+validity) bitmap, whose length is a multiple of 8 bytes (to avoid
+word-alignment concerns) and large enough to have at least 1 bit for each array
+slot.
+
+Whether any array slot is valid (non-null) is encoded in the respective bits of
+this bitmap. A 1 (set bit) for index `j` indicates that the value is not null,
+while a 0 (bit not set) indicates that it is null. Bitmaps are to be
+initialized to be all unset at allocation time.

 ```
-is_null[j] -> bitmask[j / 8] & (1 << (j % 8))
+is_valid[j] -> bitmap[j / 8] & (1 << (j % 8))
 ```

-Physically, non-nullable (NN) arrays do not have a null bitmask.
+We use [least-significant bit (LSB) numbering][1] (also known as
+bit-endianness). This means that within a group of 8 bits, we read
+right-to-left:

-For nested types, if the top-level nested type is nullable, it has its own
-bitmask regardless of whether the child types are nullable.
+```
+values = [0, 1, null, 2, null, 3]
+
+bitmap
+j mod 8    7  6  5  4  3  2  1  0
+           0  0  1  0  1  0  1  1
+```
+
+Arrays having a 0 null count may choose to not allocate the null
+bitmap. Implementations may choose to always allocate one anyway as a matter of
+convenience, but this should be noted when memory is being shared.
+
+Nested type arrays have their own null bitmap and null count regardless of
+the null count and null bits of their child arrays.
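Aside, illustrative only: to make the worked example above concrete, for values = [0, 1, null, 2, null, 3] the valid slots are 0, 1, 3 and 5, so under LSB numbering the first bitmap byte is 0b00101011 (0x2B). A small C++ check of the is_valid formula:

```
#include <cassert>
#include <cstdint>

static bool is_valid(const uint8_t* bitmap, int j) {
  return bitmap[j / 8] & (1 << (j % 8));
}

int main() {
  uint8_t bitmap[] = {0x2B};  // values = [0, 1, null, 2, null, 3]
  assert(is_valid(bitmap, 0) && is_valid(bitmap, 1));
  assert(!is_valid(bitmap, 2));
  assert(is_valid(bitmap, 3));
  assert(!is_valid(bitmap, 4));
  assert(is_valid(bitmap, 5));
  return 0;
}
```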
 ## Primitive value arrays
@@ -112,9 +139,8 @@ Internally, the array contains a contiguous memory buffer whose total size is
 equal to the slot width multiplied by the array length. For bit-packed types,
 the size is rounded up to the nearest byte.

-The associated null bitmask (for nullable types) is contiguously allocated (as
-described above) but does not need to be adjacent in memory to the values
-buffer.
+The associated null bitmap is contiguously allocated (as described above) but
+does not need to be adjacent in memory to the values buffer.

 (diagram not to scale)
@@ -180,22 +206,22 @@ For example, the struct (field names shown here as strings for illustration
 purposes)

 ```
-Struct [nullable] <
-  name: String (= List<char>) [nullable],
-  age: Int32 [not-nullable]
+Struct <
+  name: String (= List<char>),
+  age: Int32
 >
 ```

-has two child arrays, one List<char> array (layout as above) and one
-non-nullable 4-byte physical value array having Int32 (not-null) logical
-type. Here is a diagram showing the full physical layout of this struct:
+has two child arrays, one List<char> array (layout as above) and one 4-byte
+physical value array having Int32 logical type. Here is a diagram showing the
+full physical layout of this struct:

 While a struct does not have physical storage for each of its semantic slots
 (i.e. each scalar C-like struct), an entire struct slot can be set to null via
-the bitmask. Whether each of the child field arrays can have null values
-depends on whether or not the respective relative type is nullable.
+the null bitmap. Any of the child field arrays can have null values according
+to their respective independent null bitmaps.

 ## Dense union type
@@ -233,8 +259,7 @@ Here is a diagram of an example dense union:

 A sparse union has the same structure as a dense union, with the omission of
 the offsets array. In this case, the child arrays are each equal in length to
-the length of the union. This is analogous to a large struct in which all
-fields are nullable.
+the length of the union.

 While a sparse union may use significantly more space compared with a dense
 union, it has some advantages that may be desirable in certain use cases:
@@ -251,3 +276,5 @@ the correct value.
 ## References

 Drill docs https://drill.apache.org/docs/value-vectors/
+
+[1]: https://en.wikipedia.org/wiki/Bit_numbering
\ No newline at end of file
diff --git a/format/Message.fbs b/format/Message.fbs
index 3ffd2033208..fc849eedf79 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -129,8 +129,8 @@ struct FieldNode {
   length: int;

   /// The number of observed nulls. Fields with null_count == 0 may choose not
-  /// to write their physical null bitmap out as a materialized buffer, instead
-  /// setting the length of the null buffer to 0.
+  /// to write their physical validity bitmap out as a materialized buffer,
+  /// instead setting the length of the bitmap buffer to 0.
   null_count: int;
 }

@@ -148,9 +148,9 @@ table RecordBatch {
   /// Buffers correspond to the pre-ordered flattened buffer tree
   ///
   /// The number of buffers appended to this list depends on the schema. For
-  /// example, most primitive arrays will have 2 buffers, 1 for the null bitmap
-  /// and 1 for the values. For struct arrays, there will only be a single
-  /// buffer for the null bitmap
+  /// example, most primitive arrays will have 2 buffers, 1 for the validity
+  /// bitmap and 1 for the values. For struct arrays, there will only be a
+  /// single buffer for the validity (nulls) bitmap
   buffers: [Buffer];
 }
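Aside, illustrative only: to picture the RecordBatch comment above, a nullable primitive column contributes two buffers to the flattened tree, the validity bitmap (padded to a multiple of 8 bytes, per Layout.md) followed by the values. A rough sketch under those assumptions; BufferRef and the layout function are hypothetical, not part of Message.fbs:

```
#include <cstdint>
#include <vector>

struct BufferRef {
  int64_t offset;
  int64_t length;
};

// Buffers for one nullable int32 column of num_rows values:
// [validity bitmap, values], bitmap padded to a multiple of 8 bytes.
static std::vector<BufferRef> int32_column_buffers(int64_t num_rows) {
  int64_t bitmap_bytes = (((num_rows + 7) / 8) + 7) & ~int64_t(7);
  return {
      {0, bitmap_bytes},                                    // validity bitmap
      {bitmap_bytes, num_rows * int64_t(sizeof(int32_t))},  // int32 values
  };
}
```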
diff --git a/format/README.md b/format/README.md
index 1120e6282a5..c84e00772c3 100644
--- a/format/README.md
+++ b/format/README.md
@@ -3,3 +3,20 @@
 > **Work-in-progress specification documents**. These are discussion documents
 > created by the Arrow developers during late 2015 and in no way represent a
 > finalized specification.
+
+Currently, the Arrow specification consists of these pieces:
+
+- Physical memory layout specification (see Layout.md)
+- Metadata serialized representation (see Message.fbs)
+
+The metadata currently uses Google's [flatbuffers library][1] for serializing a
+couple related pieces of information:
+
+- Schemas for tables or record (row) batches. This contains the logical types,
+  field names, and other metadata. Schemas do not contain any information about
+  actual data.
+- *Data headers* for record (row) batches. These must correspond to a known
+  schema, and enable a system to send and receive Arrow row batches in a form
+  that can be precisely disassembled or reconstructed.
+
+[1]: http://github.com/google/flatbuffers
\ No newline at end of file
diff --git a/format/diagrams/layout-list-of-struct.png b/format/diagrams/layout-list-of-struct.png
index 00d6c6fa441769a3c86044a52186d71c0bc23d54..fb6f2a27e07a766729d12ea33454db011ce6ae00 100644
GIT binary patch
[base85-encoded binary diff for the revised layout-list-of-struct.png diagram (60600 -> 54122 bytes) omitted]
z!>whVLVWM{1sm6gGj{81wiy|+u8v@WOr{_{!*}v!+`vrMoUQg>$Xr}Z=iVgkzH-c% zwc%3eDQ^Dyy1X`6D~e^|t;N3}SkR6d^|o>5S>)}#uN9xC+md- zNwqzd!S0ip+|3NiO)F2#KB6?M{F;Eoz80Q06)PiUqqwvz)FSGzL0uMC0DZLICCN-G z8cFgaZRt>kVbDpCqD_}2DTF0#0#Vqx$Kvunn^rmOTC3Q5DPdd{Zl5ntt*|Z)5+~ZHix8f*~_358~>J{A_OtENM+1YDNSm{yd=68qG z6g$tKB<#*BQ+>_QAcr636Dj?yAN8?JH5Jj@!GqIZ@m-M>3ll7=nEP{_1)3j8i<_e& z%IpYZP4=iA-n#&aUAwb%0!)a&|58vb`Ga3gBa2*LB~;22(tB{ z?j-nG;u@7E?w+4YC*_08 ztomh1`CmlmuPZ^4YOxhWt3mwi48igN7aqs^RUJ=u_z-MrO3BsY+hI`@BJ2yj(vF2TBdFHU0xV3B+ zwv!=Z5gMF#mAXx+4${~jY?-%EU7+Z@b(!HGCj;cM#tzmiy&Gi`?j^x>IW$l7bwjxbk-W*M&(_4+x z@m(u$d-ekQe+((|&wO{*xG{Cl0Z5 z&iBW^Dr~)Q)j5v(+OtsELN;2g1>k0+8fls5CY>bEzj{0x*cX+*P+G89d%3x1#qv^jr(4u147$D>1QazH(O2hnalo zt}8Jl{$WA|-U#+vMHqLcqY|vtAQyx)mO2NRz$jVy6J-?R2JO`EGOf zf(DP(9W?`ra9*Xv^x9Qp%)Ya}QeiPUF~OxF6WZL~u0e5RZv^t7_etg*?&ftubpC>L zY9l$a$^)h|Dffg`WJ(6+=Lcuj4ERb!7HN2~hHYvRE2pB^%8yiaTt4QDgrdXWhYY&i zp|zAVIS;{r8usM<=J*v1pA_5^=9D9NBx?ga!{K??Ih-9 zi!NB)%8lLpoy!lc4Z3jZUX!J~t|fMye@K_SD!%5HOzi83i)F{zH2s`=yp)y_gD;?F zLlU_Df2hk71K8asTxeb&F-n;)nAO@g9v@n#JHjJG4rMxuA_RwX2eRMuWnd$65H-uR z;ea(w!ps7c)u89M%Fy{9&pum%9T=(a%u|L_QY?~ncb}9f$30fX!x}Hiq6pk(GvT07 zTxg~J*2l)f8;8d+(}l~MIfS}fcqCf=LY1*&(};%9#GGO6ZYte45(|-1O@RW8m`auU z;FTk%{9sqJ%%^-~r-Y9zpJa{YKW53rF3oiuGWQ6vJdP>eGPQa&FzO3UMxr|iB=pU8 zvR1#uWT(H>K!_XWUY!dmbzEm3e}(5h{P1lYx(_Pvd$e2H9eH3PQYLZ z3U>_*&lxBnrKSC$ejJtI-?*q?b+etxWw@{%kFEKAvMHh&RAVtG5rTRHd& z>aV_JyF6!2x~%|?d(|`IdILCin*sAfDUtpu%uYvlU8dKg#@TeP{j?PmaszT>=#;@Y z2lOh^L=C;P%>-n+L6hi%qwDT_*4#ajTvL^3S?nWps#PU-%rIbDZ*MLKI|b6u#mPo^ z=Gtm0>nuKyOlq7_+%D`utTRfwII$|lU`lt&3^SNH;=|+CmFX2kuO@-ga z-0ROLh0rq3)RrjI_{XDKG3Od|hZK(>A?_^zA{En28vCr~=3z|)B7(=bDaDCWzTsW@ znlrpq$-NEjOz(U|lb&K@4~|mF4WuOA3J>wo4lT!^?^n)A*45fnJdg08)Amn|Xl8~wz?W}}vMcKY<8W8Oc#TML>w zQzu>vR|momW}*zb?&t9DHUqTsW$ASsT+516Xf-N|1Z-%-S>o8XAo1i9Ap#67$j)?J z&*2(`NUPU9EIBe)@aQzfGg2NKH+YLoEoILF8yrF^Ptd8WxlgC_4C~OVP^O1KoJ7a> z`a7jn4LPxa=^4rU=hy?L;uHpKmh-7hUL2WnUiB1O&rhvww&PlO$5JYMfPdXOa-h-n zKI|RrIU9W1x(MN-a5ke*vfoPRjYt;>dC)1b0w8VwNK66$e(N#w)mkF~4*~WtCv)g` zm!=X{zXTif1u9+(BemEBljo+cZsM}NnS=(}0)&u~p<)l^Zo0d=1P2#>xd^uRZ1q2p zta#49nx)))XY1~BWVLeNe?X9OLEcM&0nk&5ez`GcN=B2tzHlFPY7I9>+Jb71WcYrv zW9H$EOKI-tvp{brU zM+e`4`35nA$Y68k#(pE$xEEt|GA)z%*ilVn5LfkToJeMm%($icd(;bal@zdZrsZ+s ziK08QbA}|N*$hNvZiyE7i6n2BBTNPi_zX1BG8XiywB-R?v8CP1xX(Cbi`m2ysOw`GK|h?v=U%44$zndcberhs{i{n-1!uLWKWHfKfI zUy{vI3$t}7@GenZGb0zpjZ@a+J=s!NwVH zkcJW{5j+rRI5b(+iY;SBDjLvrpTaIz87spx+BE$&A775)))0%vEE;C{vw9vxX&CgE zn;IGr79c&EPTE&OS7G$ylCs)RcMQ>@+Vofyx(qh6*0c;RhWF(#%-y($hy2kHse*Vo zyK;-!rcR2y{d(7q<-*|7kfOpn)B9*pd033_MdnJ+x2N->J2gQG!3Jq45$hGEx>EM{ zM0iatP~*dOdoe|_27zq~Ty(9Zd?T~Dj9Gv3Xo$C;)k<7=XSrTa(sqAZ8yV7FL5gOC z&@FgQZq=L-#ISjoeO7NFB6q4YFhiMr`bv=f&|KO|g`)jv6eGHblPWE*Y#>1_i^0DY z6j2UgMzJ2;u2!rRk)<4v1l}{PcbCO*M?4JA=gCvV(K2z4zX7vFt01F~XCz+ZnL-rb z#Jh(PLYpiTfT8uIDl=Ii&A8A|JvrPnd0yx=6cU|^JVN-f|2Z#5zQ4c)HGC;0zx`|B z0zB3N$lh9|kp0Z*w4#Rn5jV_owDE(Dt<|s9T)fg1!raY^pH*H4(>}z>!eV?9^5ev5 z=tynt+mUcI@ryS4IqT_iJcObl*e{%6vt<@B&Tyxz= zf+MP&&r2syR0-~7v~Gx)*#uzJ#8>D};*7=hI;gU~q2SWE$xtzLxUzOvP#MLWlIP zs@MGV?h=^SIYqr#-n0cg_sry%U1fOR&8T_Jl`liN}2SOqh+Y9V%k zu{rCfn7Z%6r|E?aY~Qdv6y&J#%!BS{1NuK@Q)x##)`fAqAW8CNjiMRvXM?#_2i{G! 
zTtS@!+z7F-m1xKkeKpQQB3@j1O;PFj=S3H$!^u6Up{Ihk%>8T5N|m;wktNlWyRFA# zX>`iS+EiEmXvJxkiNjXh$c#C%cCClv98;1Z6I{cv(jy`6rc{LzuV0Y*3wM{|_;zVq zDZ&fzVZ+qTr+hv$th4?SI?VQ%A;`-qST~~7{9!l!wdrR`wp37=!K0&#S?i_UP=I-g z`yk!gmXAo#Pvfb!!sz;ju#s{XQjblaE58^l<$cJKOdE~%1gMD7x#>+3{&;-MuxI+A z2=8M^W6N>nacSWkp+JZ_f&*jkZ5Ai_AeA1y#xv^vcE{DeoJwyYk47eu8aaHxB0DSez4ypQLT7GxO(U-0&= zvRS3&7U(py9PqkS{N*Q-3^NVU`0PmSs?4g)3#Os>y%*Cw2Kqwd)RR8$Os7@mNUtk| z9&LzFP+N;?6W+T!f`E`kx7Z4h%-t4wDuy@XQZ9pVdGOKs4%c#recmfB%qc> zt86U0Bj@nQl)FNHOAXdcbL+*`TD`USXrI=xyhOa<^(_2~_jbdgqx520at_H&wt`tiiwx757Dx0Hk1}#!rG_>hS5ibFWoBD zPc=L?l)d}4{P;CuNHWC)GD7t>-D+)(PH2k|6K;*jfA!5+bOrkvFN1J9Q4FrY8Y+ec z*F|8_wbEx_ck@EO{`lze9aXL+Sx_3W+JZVh9n!;Yb2~k?w=46XF6fHM|K>LHU>P3~ zl|IP+t@V264KbTg2dSf_b$Ryd?;kw?zuQmdZ;D2uMCXHTaX-V<#i-?1^+__14i;&w z3cNj)k+23Vwa?bkimC$@ZP#F=o zxS`D5M*F{h{FCaDCO&m8~faxe{NPD{Cq*)OEAz2pEbD>ZqX5tKf@x^Gk;W3mJzOH_EXf!_te{?6A{*dxNR&JSjG-Wye?WUHj$tkBv*N3!n-wXP*1)bCwf8xe(wt zsM3uSv@4z6LAf#8=VJR2|Je#uO#*~64DU~-=zwshc9@aJabuwU_Ie+4!7!2ml*IQy zjrIF>Y3^2>>G>tJLJ!KflLJ1NyAl;kl4iG4M|fsgoy(v=?6%V3{~!cc08|T28y;*+ z5P59u)Q!iC0G&sA`W8st0D7bbl@T7fQNV{$BB%Yi_D-l^1wQ-;FPoVa&wc$K=vTUf zp1keV0C#&}$g(LOZl!^WG?C|*eC=~Ysx-fZ1=|(|G!3+fUQMW`2Axff4&@Qoil3o} zBUIC{1eg?uuWOM^B?mwppju0j9<&fW1GGFuCh98T&O5Vj^lFgSuYkGM1)k5g-y}ys zl?@POtSReG^tYdUZ?BPhc`5%`MQoXZyy>%$NbE6+$c)Uh>wRC}lSzr``fJSxjoRMbPGmUU`lVr03{3%92vKy0TBxj zAz(XJdmCyJcEwl;hqm0@P*PLuQM_JpMU>nh#H{$8tca{y}V2q4B7% z(H&}|G*Ehj7M;HV_7h)S&h@$Y=&HLuu~MvP0PTkTl*zp!aemnEzzdA!a07W&E&fqKB;!zyn1>RYV(Ms=bFICSejlD$D^P3Inv4OP z{o(-d#n{%Y=_UU|N|8L%XeJvDtuXxX1zW1v;kQyZfX0y^w6+DP=UUHe`+iO}E zP719rFJnOjoNR-M~(J($XAKh+Y@#z5IDA)!kRf@F`KkS>N>NNDAOB-9+v>x`+ zRo0_FV*+KbP-34nIh&@{kSJ9W!s&%$U9Lk2pyuIh83d$Iw# zBnbis(O~SqtgHSJa5K|ydc)Aj50Hy9U;bFaVKuF32jxHbW0Lc;3hZ$r#{tq_8P|v5 zZ-{1GPd?t>tj zi6SsVIW-Dmwq7l(3dC`kp`T_Xc@DA8#GcG#$&`b73YS;WqzJJpacpe|-vZ7Is0n{0xG3{cj^Vr%?*G6OHuCW7BY|@+n1H9aBuw6+;vR4R)it!@gi_Gv2nHOi3%2 z6nWGQE9*2YS9Ur>G;2D5t|^qSE-lwFhWqO)uo`nWD}vt3rvED7{u^=p8z?~>yeNiZ z+JaadUHAOqF$CdEhD4or>7z{PrsRnO#LW7FgRsUaC)*5aT&dvc8(%p#;zmv9cus)) zO(dmGw|;{o|Kaw4_#aTiKLo0qhBGP_55vMNMI;PTwkj^TJr;mU)HY$U#;9gh*B&J= z9wfelW)Z>I^S2+yeNI{7sMmXJvyk;Z+yuG>{T>y+umQkPqUXSI-n5lr^(ACU7F0SK zw!U+MMFo|t)yp5`ROs0@^e^I1?vxS9W3YvpWbG^|8PttZqz%)2CCUxH`=_oK{Z8OL zO3cycBwU5S%lo|3Cbk~)7zVP2wu{%=>5Ru{S&Ca=)|oWYKE|qPJBSS`@5T+H(BZmk z_ziIlM^j_>)b1mfj5(T&r2Y|Q=b?~k8z;bq7ERJw+2F2}D*>s(E@&+_rPbqU5TV(( z(u|aTfDl2IWK|PiH^xmTMopWudzcj#FgyUMtl!X|gJ+}9Iv(XD4w*=LlpXWwVE>}sy1 z#?(DQ7_D118;mdxnYac`7{dBu%g;K0?JEGfZOQ&I^Z_1vuPabSVZzY7FxHa8?(ADN z2qjE8lKnkC3&Pq+3$iNGF+N<7&H(`xU>cTi5SMI#nl#7v<{a^2C?t`7zZKXeUe5<* z`E0lsqek2Z9e07xFdRvn)%Ds1UhpDP@5*qHGS#>x2`T{XLS}-7C6-2Aw7Cmff)%3ty=9VY@T zuq;A}X&!_5Z}AGNY5_Db#6VPSZX=zFh3@P2eIBPV9mJ2 zxG;&(Z4u3jgk#*=XVhYC`V){yb%#9EG6=>ZOQKs)GyEzlwe}+O^n4_I0Tg-m-p=Yq zK|OML^S28+!|(4wsA6nd2>|23w}4-07)&jktq}!+9z^+$;I=a364u&K!&2KkN$sj`nVI1x4y}0_ye`<$K&-?lpf-dqmW_*-)DWl3*yF zq|Y5_Zew*aBl}}e*fIn~;7U~s#r0BhtR8t$`IJgN=g^>e76HU~c^uE)&Rdoz)9e3@ z&hLS2kptEA5hptcXy`#HLW=-r1a$*Vs9oJ%GVHy3kF5n!c=D=wq)(H6DklD0vZEw< z(l)HGPSrEzc@N?8%^|f3Xfpig8=@JMw!7cnw9B;?fLKPGGJmuB$d~I&UZAISm@EAs zjXt>t`_Q56h5c!W^inoU48b!;;F=7~;teTEp=!KMEUfwLQ~E!ByE7RGd(^t$X+K{9 zu%_z@F=1-IIs&LvDrsKB_lmxz2Yru5iBd4~_lzU28w4btt9$krzq>}@)-uHMNCT$x zght=-HvIV>$^VCYK*~g4`~S-IUf&+t4>L4g?evcW)63pcc*>g1Km~wagn1C3N`x8R zhI3;flI+{_vhTBc#G;Xm@60n}r&s^z+e5??yKa_eMV1KOAb|mi4?U&8*-D^F8`y-T zCF6Z{*guI7z6L>hw#n_e;(wp9{mPNn7C;XmB9jQl_8g`;eF{Gb&6Tw7?Qs2`PEPQH zq;A)V;i5~D$orrZs(?-(0Z9C%i2Z4B`5M9Race>ZT+0X7mg*Lnx(#UOE(&arsOa)z zg);pyQ3p3BD&xp8eug3KZsppUpUU7JJis8Qu)2*HKaHsh11GRN1LIFaF8U5DNF+g 
zz7K+UJ9wlW5XmptwP(NL1$ynzK%89%Jg@B2K?=wz@S>^(y$Q$P^lm+8G|WbGB2J+C zP@bYj0KC)9a+=li6rcTAaw8_JK{*DXEZg?oHny(&su;G0@hJN&0Z2`qba;V$7;5q9 zRsd}Qo&Gt{s)7YuL!iX+JO|KC!2RJ^;6wBSA4(Qk1eh0uQ+b}PhJtJ70Zv~B$T}A! zLgvQ6NoLzLKnl3efsmiC#y;2g#zP$ToesDiID?qF=60Yu9|f4&7lZ)jC~(nx-`ks> zGYx%z_$D}LPKIl=vHaQ_G#6;MH=)|ubpRFm!2SeYkXo`#gS^f)MZ)PRJ6F*A26l*rQ@Q_!z?WMeFJt1K&h3>(8j zIvFV3F-AGSv}4_mlJo)kXeSq^_sc*L+#3;J_fb_#Y>7=46aShF+3AH~Xnl!&1hx22gPOs`>+@>#x11MZdne<2AI;(8Q~HnNT-ZkQfQ}5(VwVQka{O@+zaBYA>0BkAp z%dpm-`S30eASh=sY6HfUw!*PsT?YixRj-=9?$qaey}ba*Ajt1Y3aURCSZ7(3q|dEv z0Qu%Mz|72=jP(=3)~>=n&5QtIWK};j955-t6oCslB?9QMY6eWYFR+2z4QhdWp%oB| zdD31$XBXt_&Y^^^evi%iL0AKk8QzxI=WG?EdQn2jkKLeX9-7Z-+jo9g0n>U2071ah zjVqw0FJ^y`{6q$h`89NBg8<0n5mH)!4U1ugG=eS}&289&h7=)hNmJc>@p=M&J|_#j zfW<}e>77@ATzR^20p9KgOd7LPqqZRZzX1a5%@=yuXLsh8y8S*bX})aoepO+=ezmmk z^P?9>uV}E=fXeYnn1Q0q`QXWHAhw)%@XP%$@)*fNGXFR&i{fO%4z9NawNsF+VAjoC z=P4=08-Qb;r^ic97r|83S=*)~t;|OBL`PipzDi@qT`P!s9aRUyxAa@dJUQ~)G^3b7 zji)SgePw_JWPD+u65l+={cXY;{&KY{(G><_i(g~*kP$Q-ju9+_Jjw+6a;%^b%5@ksRM!e8D z0V54l?qpg0mJ(wGwV5}D3t0dl)xrP1_5hX)_X}XnH@7d?5Ednah?*xNC6siozp82m zTUERPZ$_KayZ-|uq)}JUtv|i6>>F3m4I4J>Y7Pf0o$-OgRxHfuiot6;z&vtk+kSlV zk`zrDHk6V}hlr|YA}$5p#9q})eh)xq6ntZfoY>bfOs2%&OCqJ1;L>byn*jKS3xGFA z+7t%fZsq^M^{wEKl`P(!yC*GPB9xow(FV3LRoGh&A-sb;>2w*gpr+4KC)aR}h{tIB zc#)cuqKzDJ&KWY2FuXR0_xPSvlZiZO0vOs2Yq{s^-#xn!sZnh?H$?;T-ii{D$%f5? z%`xd=og39>tYdPw>(pwLglpWxGm!xQQ9p}hXi$gSmNokeYKwuyhb-~lhO?Dh^t}@` zHw=MG$T)3%rv@B#ODxIMy@&WJ`Nr&}2_1AA?$F#koj7GRp>#m>VJ$K;N@Ea?VJ;Sb zMGgZt)$@rv_Xz}zNoESspM*WS$bBUMHb(VC&{c)iqZFFgkRiE3_Z-Abh~jpkavKlC z8A>_*9vVsfMFmsmg9_rq9cQ}4PbViD=DyVmd#T^7?EvaUAz%HQ{3^Ee&T?_8ZibDt zDp0xufC_af#+-^DsL-sWE!Ux;H(Ld4jww?=4v#3|WeN2G7sx-`OafjP`Oq;Eqca`* zZMX~PDQ5992p%ET?|Kt{N#B09XGK+sXDHBccPzkK!5zzsr%GZ#SYUY!q_EkC?6jYZdkAn6pLO4w}D zdV>(w6Cfizj}G^wMZhe$xkx!tmIIGY?L6*yi0(}_&l9hAD;d&Bn$9~KX4%Ql6FQDl zUOumhOfdbv1-xY>x7cD8V8oi{O{FtZ{zIlvuZ4bxAq!C~mU8NbfS#WISr`D%CB{B? zSs}3X+E!_t03Y~_)!z1FD$6KVsee13PJ>Jo7tPCrK-f4MBn{3t0f(9FNXF9rfEBgi z6zn)cRHhEsz9Z9;&oM|HgznQ!(N%MrD9=BLb*k_?ct=M7Mwio$Q34ojTsYl~-br3S zkXZZ}Sc8afR#(IqM^S3=01|A@J3+a}oIxY6^rQxW}wvrG=_M-9vg8N^UuJcYJLEyQxZT zRKi#u!vhVu26_;@;dj$2q)>H!azj+421f;L^DNhS3lo^!oyt zzPbY-t8T7?Qwlhg6mfcMXmU^xM#uw2oLmbZ{LxiYABu(TaC@KGi&8BF8LD2d6Wced zA3Lzs5O}cIuE#d@iaX@*pfw_37I4^}0Q`gtUV;Pxn&b{10mjnG_1OUU6G^CAfEyp` zT(O&w2LIbC<&N_c3iP}wFp}Zfh!dI7^|G0<*Zsau%|pcz+B1OH;$VFIIMWp7_N-rE z&brQc<0A;0hagH+mGlEBmaEVEif1NUT=`XJHJz&Rdo2?%Qq;~=+~t*?WMNY>{)^{eGPV0C>OND?^l)YHh&N zgDw2bETIT-k!5x2eSfa4r@+!}9KN%Z)im}4Vj@R8(Acmg2GD^&=B?gPidqiYDFv7d zrd={$m~irU3QIB*J*b>}P)^Vdw!Aum@Ci+y*tq&|N8h(lWtbNL40^8&Pkfl9n@b}? 
z>IQHaA{uhN4Cb^>0hiR}u_Z?JRLP82`%_JX8Kn zw9S|b-v5Sgp?rf&iFVI3p&#{vhwMH5=9LmzSdGXs|d#TSrd?q~2A{qFm~ z;}}Q*SBPlJ6aR;YqL}aiDO0xFXm-ti1N0Bk=AQx>zy`Rw>_3?!^e?<%jg7Tzjj8|H zAVF#HUabI#+;8R#{pd^tT*GG_C$m43Zpl1K@Lq;aYwF(@FnBX}EJ)3ctD0H=^rRFM z=-|C6`VAO=bt_u}41>q)BHrJFhX~#)&Abr)SGUHd922y&9?B(ArtSV& z^NJJv47U8ceqwj)`>UP*XZZgy;r}!I|Ev#S{U>%(Owh&nL0CVUeFhgSc?R~%|2aV- zLq;Evx9xyv%vcN`{ox6F4N|F4PpZi`dIGO!A(DJ|tE=EHnElUuzON#3v;VSkPxd=6 z1mCoI>!7reYRV&ReT02U8z_!Cfg*T{%${>!*26sW=K$%d)?A3ybTSr>x`Njq7n?17y943I_a zuh|#=_9&6S6J$EpI6;akO+Nr6o6a7&~EC8qjAHMgW0KYvs z1|GCj>B1TD8TSRY`(cKPc_gX%4IU&5oJmy$G}Ml3NIO+`+%fOk)%agV`%ZYklbnCCkDw_@16?P6c`^x-=o4^U!6GOTL22$`#+Hf~-8IC&=J%gr z&5MOrTW*BOKAIkgLQp*;fDQ~z9U(dZaq_Q5LAisdaRHD%L5d*0 zxn@#&`W(z2VE)V{gQb9YY_rFT^1pDXBG0RO=Z4$aK2Epx(hu0rx z*FXcW8r!dTjiDI zo^67is5RinH`QH8y$M*3>io}npfp(^sW)1Ps^*;Qg3W!9X4THTP}9LW2ggPj48qcj zn5bkG7!MkxR+#+7M+3|6-n(;(|7^E@TFFT?AbLq*K*)oPE7xnUd4QBcUat8)(9oMp z2N}Kw1N27VWI_ZmF?--GZot%4Etp<{0}5sUQwC;+WdUXF7^pQ2&!2eJ7hNUk-^ZCU z0#pG=xl^Ckg{x0V$t8{C{8bQ^%!37OfahA`Vb|~Q0~_~)xuNwCz_Bg0``!XhS{x(! z2WDqZIGW>!SBjX>(H97ZxliQa)>2qzv^rEF?FYLp(8BGIUb2lfHKo5ujjFab!# zy3zv-4M24ZVY&rdeeP3wWOwtepJf?@;7w%F2xNvy#dlkdTetnqvnPXfPj|fC2NChT zQt&sgA80Kr$v{dpW&aMyNv{u7p8McHg=J7?PN)V1`Ub%JfWvRh1ume6_%(rj*tImY zETxdCWnBaRB4Ownl+*#e-`))}goXv%;posaaX=6;r0*;x@Fb&9=eQg8u=YZki6o0Z z>JmU@l?wL&{{v;g7B&}HP_}L7rWZgzan}_vmPH(c>45rD=fLy#mg8E5tBA)6Ir`Cl z%V%8_lyVI=NvD6HdSQ;iz-v8MZTKCO%xIq?Dr)gQex^bU%u}+^VmO&a?=M&r4`9JS zQSmc%_U%W+^B^b2<}p=EfTqIUUAIulN-r$UXW%o!PXOJP{i3q01A2ggb@Dbddee=# zVH6AP5~2eK))@O*K9FO`c4B8R73GR z1t#HbpxSU?4u}q4Nz!B*|$>1mpyrd`Jkfz{><8)p31J}4ZO)s7~AH} zlvgf*ws+HLQo~%ipMnQ1O1(F%M^XBBG$v5sw7VgRYO5d#3X^Eew0##o@isXWee&h= zl>F?0H;V(-b)zZeH;K7d6ppk4LVXZN#b8Gp&p9mwRIGG9cfa0 zcFoRUL52_AkNfvCU0OE{din!BkfI4;m%%BdhaPdihBh;gJoGZfENQk z<3MxAX2C=PG(w-Nd;H(>gX^=+66`oL<-_@=e?ewm8Hf_Hy$fRN^h;6b$e@%5)g3gS zh+@d9R0&3~Tmm1RN+)sy_|_~Q9*A*bH1M^rMt5eio1YGPYknwiP9QoZg9V296;uX? z{QzsMW$h3|Vv3_dMoP`}O_wdT1SR=lggG%if@6Rf=i4`Jvs|A_cVI?Z=XT5XN5}Af zm1Gni0uKVVfWDwg=eKvg9@e{2a^f6J5`f*o3?2yM2t?2 zaNTlar9Ooxd03uO8k@Ba2Th-k=v3w&?=XFxzUMrs_!=Ip8ZSgrP1WXGp4s48@_;i* zenv~Qi7rStjx&H+T<05!#6VmHDt$h^SZee!A(dR9K5cVE6XL5`Be-z|h*(i!)X_CN z-%G8*WOpcu3Qk)+$rCD`Q;BC*Rv?iBzjfOQciTh6g<}T z+@=(71N7>VTYfP1+RR#XA7!fc5n#3Dfn=#yN(a=FTfdX!vPZvSGVI8cV**4N7_C4Y zJk}9rkwurq?(&LSq+VN4*x`t!L|{crn`RK@XHmO3jP+e9ea=jz8-se0?8fMg49+zP zgjRpQUjp}$*+@;`cswmUG)|@d4q7@(>@JTpgxdnASY&AKF!ylAaMF($ww-U0?>%|V z^Lg3_U?V8NQ1J#8{If{p4iJv;b_u{iNjfr5%mNKk); z0J!8L$Z5kyTT|9rHzyzLZgnr3-2Q{_V2~(tv5Fpi2c(xdfr!6*V4gX2Jb^85!HX1&%R;s(5zt>@9i~a- zsdVR^gA9IE3h$yYhfD^cg<(GhLiW2EKwXHyi))pM4wJ8(47=%#w;X81pANrvFLU%0^wwP9T9cE883{KG|pSnA=No38W&tU zx~#`lSCJA<2E`|GU_Lt@VLjhUSrkCb4wP^nyyt+pC8p#`LGvDHFCA*0&(NBN+;#n6 zQt4hJIGB|npqeD={(rIemSI_DZM?UX($Xc;H{IRc-Q5iW(j_h3b<-u?NQjgI(%ncZ zpp+sif(QccbvZN7%shMV5BtM=yvKXIdp>Z?Tyt}C)mrOZ=lT2p&lF(3;Y7EJ+`=IqQ)J>8aYTNH+=lXP+BWbpp42+iHC);wYc6*RR`q&I_JIkzoK?T~l6 zc-uve8&|MnHpR6koX$An8SU)xmD~Yo%ZySJzimY}B~JMZPz148KN8H?*QDN*2bnif z^>Gq*L%$bt94a{;@rK*>t*iih{sP=N=aOoDkNY35+!kNDF8VFZ)x~y0P z1^!lpu^QzFz_(-`fOye zI@!-L(T~{L4ecm5>%x#3J|mFQxzca|_|~lUiyQmq7^+rcZVft`{;R6qszfhiBo&QkG`FXm4^AHD0^a(WeqWGsA@K&m?me~}(UJ>}nW9hk{1m;BvcKVn6 zRoe|w!aI=l2#(}&fpG~dd()~jQ!-=Y>(whohJMmzEnbEegnT-;W&*{T5fJjIlO$-$ zc6eAzCB^ud8_HG%iMZ2GwrhNOKSX50I(nW-pV2f3S#*Qa)X&0QHE zIr>Ews%TOTux@R-nN%Rk=kDHIjabi5MXt6yl&3!|9pT78sKDVfQ? 
zKG_vw`=TcFy!)*koPwD6e2+w8@8Ad&l^-;WN&rC3^L3%cJHeqLWxcK|X)CmS2O*S} z9h5t_e|a|9Yq$PR-t%TjUIp>BtC>6yK*Z`CN1q&o781dC-_>cti|ZWPQFv;Qy$U@GKFzastB~&&41%^=yWyg$PrT!t}(^@+^Iu8 zOeXh%0%SxES*`hwhwEYdg=)(z+u!(C@4dj%86pU+K3lds%nYgsp2og9B#wOYRg@6@+bSXG83BZP_?Wb@sfs_ zZ~B)ITIRSdeiB|mY-vY530mtX1>q=-cML3#+7V}5niUsNRBSU$W#JwV%Vs@c+eGYi zJx4Z&_Q$L-hxfwO1Pj$7T7S4jaR2yt+pA|>hj4bN%kj-&`J@Ytn>tZ;(L*SMUO`%8gbzhYWGYl4L1gQd4bSd{N1NKAI6N}b$0QJB+(8Gi(&G_yH41OGgdi5DAp&?anWt*nfm}5?|YS^`+CL%F-+X~8hIn$Z9u`U zIQ7l#*afeEq%vNkZFS!U8S;gZO_mxtVQQZ=$|@aFZEbw_zKYY>Nz1}n$VCJ~yT@x% z6+>$MIkQ-Znq5h5<0iJ)k+BPU1SO5R0dRXYST5o!7K^p;2VAESi zcd}M+D>d#D#*0F>g88v*3KQ>f)@(;$t8p-_nHI2;m9W`d8TKi_6aA@jztU3}&2{-} zS4OT=!|4zCfvS~5AUiGw=G3eBmMeI+TL(LXS`jkC@%ZYiv^~VA0{VCp;Ryj2w-C%) zCZ2pMH%(3@nNKGe@CNm~=XA(E35^I7&$~Nx`3#1BJ3O4>$G&a6TyKu@r3b4;{g%tV z%F&8&p`^^q_eQZ!)OJcTl*TN9gCs=_a>8iSZS*`QR6m0f)0iLCPVi1QGrh4_cc)iM zZJm?5GK@B^Xcd?m?uoB_%`2*Fbl^&JT&ao&#E~^qM|n#JxVxYV@jY%)$wo1P=K-&Z zgAPri&kr62Gi2@hMYfklih8Ik_Y4gsa=9m9WH3P|i2zNTg@Gx)?&=oP&t~ZAwxS7a zHdcsT7-xFio*M1f-Ud|4trzeGYC(^FHUn0o&=&p<({LB@iIlIt$#s2i(<53hIS63D zOVXl>Z;V9u>w&tf{ZBrK8ZV(n-r29CR$EnE$8#LjyJ?adNI|BXMMM6=j47Z}G}?>; z6L`&gp2|p0Bly)x^U zs}|k3Js3-)jZMkRpS;lF+)BYm5OOvkse?r+F8V!4`hnDEuw4(f(+8GR>uCq+8Xlau zT-7;m?XYi~>o5{w%RqBhjY+V*iV}CBK81PhD88L5;YAT>DV6s zXZAu81FA{fT4kr3B|_#TiDtrE%9cLKBqvSze57jViT}#*hd~QbF5&bJO)}RVri7Mp zeaPWUi>xEmfrmB;mCl(FgE&BVWF-BXf6UdYHt9Nr>dFYtWG87%Id#K0-F=>J-CiW6ta(LmXg*b0F@{w{h$UVNrfT5cS# zi5*se zp1?Clm=hPPBR7}{GQAz;TjiYMeW4c<7XIcVqoffTLrzEL6E#Qt469rO3%D&&K|8T~ z-@V6(_FTyIYiwG5y5W_1$qxbRd^#Pg za|VI}|`Hpd;{VL*b}9=PF{=H8?9gocBD zevoA`UI$k_*Zbg#ed4uF7?wYvPDdM}6?*Buv05)G~#$U4Z^@ zW~Dq+W)J&$x?Z3t{@KvUreXhtwv06F$vDj{iwGKi?Q@kGy<-8F&K^=BkoF94=#N>} z^y^6XWoo9v4@Tf0Vw~pck0A5R+^uYuS++?_IG|6x$EC*d_%ZO|}JQZ~{ajwahkQfr?3#qy?Fs|*!B$#Bmx{`*{v3$OH1@k$&)v+VmN=S#WF79AFraHDQP%f4F? 
zE(58r(UCK)=RBi`=Idfo3t=iiy~#oKG}XYZ)^;;9mz`XGGu^rMZv`qfNN2DxT_jE}WPnm4mbt4+hma$g6r9 z6kMy9(n=qdEXP0IE>v;UdV?KV=Gm;uCj|c-O5o3|srF!qJ9Zo7qKUSX=J&9v5f=b2 z8-b|cIdUvD#QYbvW2^+KtIBKNVc|v8eewz0bK?s^LZAD2Tj&HIc5L6K*w6`?LOW8| z%6^t-*y#CbOHoO7YBaI0wiD5xFuI4X8WTZXNCd!0AZfi9uO~Q|HS|4^z%t(`NPj^; zwFbRejMf@UUy7U?d`;YYG$pWFPXE&WQ5YvU|-=lx%|%Kr#1@e%wtx3qD;f4v5l$+ z25r0oXz6FOGIw|Nt?sFsj(gpoy8%i>*qeXgPzdkmc@6s_J8~kr!w%na&dUOXg3CDEe#Z zTb$JEg$UqoAMS}vc6J?q=YjD+x_)DmOUOz8fHT1bQ^W+g?^$f5hwmwJH$RILz6%hR zJF}(5w`JH(lN{apLZ#-3m=^Yt^`2aLSK8=BkTv-P;lW9&WXU0JL4`N+7uj%2R=Hf; zU44(T>9z0MvUMZR2J*9EnGhH9bc*Os?&dOz!5eD?&sY0d%MsMW=469s1V8VY6yc^!Ijx3W(fLBdi>-X1oAmQ1fK?$$*ySQE6-yLC>@M9?yfv;e31WwEz_kWo%uXvMd@t9U6vh&$ZII zM8}roV~`vmpo-0+NVrWM4}E_Gc!|c7&~uV%$S84<=hjE% zlDqhhqvse23Zi=WiEY)p%JVAp3ciW4`!|8e3}%+niJb($Xxg4$+4@<4M3rDIfwK=N z)HHe$b;_r9;1*X3-u6tjjSV*?#D0KZz>fE^3?Br#WZ zk4NCkFcQ{8ltFg4M9E02Qop&fSu(`0>q~hrk1DEAdEYaC)Xu*t6QB-%sT6jr=m`-g z1HFpPZjCl~Jv&JdF!IhDkE#=z2$U-&+KouIt{-@v#zwkIhvTI-FUOSTntQFagF`&X z?2rb+YfDA{V(Zr3Ly2{i0n1JLQi0spo`eqsZQtPsdE<`p7JXuQO<-P#JB#|zaKp~a z!7GM0$|$iSZuKnElzwNHO4%sZFmYJ671)g9sMj8~r(-^uV?EA}%%CRnP|P;9P!p5= z`J~Omm+hD}9?Ko1gk>ceXx3v)M{D}tlG1$K-DO5o#PYAIBY_XgW_aTDj(E|hkEm#t+wf~ z+Wgss0ce^}-}N>I%H!W<@!{8=xmC{+F0NXV65Q**%-}*#WDvpya*0J7&e{b|0jCb3 z)9EHmG}RC>)yTJPs1kj7wCwr?B(hy)K*)k{Acf z{2cez+WKvzzV`|Db5I*$CdneH2$ntymHejhHdVVfRm53vYoQr)!A5Mi`D#;_UNgL{ zkB{*^qkd?^92}rxWq36HfsFm zHDay&Co2EC%{xc$5f>_@43dSX?)+V@>wl}ST&2LeK$P7t1-?rN(O=?(8b3wndlyxB zo$K@Vx08G;^f9uc)^r`3z~X%lSq(N-JG65$!B5&S@Eb1r4IlX69auO7*Fm*p5R82% zq^oAdU^ViM6)nSPzg5F#s>u^=;FUk9L4(QBZSm9tH3>&Ct{~5(we*iN0Q75KfC=L7 zmBiLSEK~`*ru#8?rDfN9G4=5B76#4Ba#y*y4KI)yC%LY;YpP%MXW*AADTH_%i&Pn|Q&b7l=a;4az`7o< zPeCKHfRmBMJ@}dJm{z;9(c#Yu)D-RPm$Whb1`cDG7ugVho#*Wjf41^>CGKj5o3D&YP;YAU4ti|_zf3;mzjWl&*q z4Gtjx|HLnpgnqu?_4^WL6Hx1$hsi*EeISSATOADV1G1EmB|U?)rBxx0-!O!uz@JFW z@Z!Ib7=&Jn;KQBar6Uy{PxcqtVZR=nzj?4f3IoC*a36~w%=M&!+3>-(%NvvyJLszd z5E}w!AC89Jy?Bx>)ZWBx*_E~#7`;hSAou5^I>U9xatOYv|F;es{H7h)FHVGUnf!6H z|9WA4KneGY1*KZullIX#=fvo|3CDffjNSzHh0aU*PDt)MJ#skuDK+1dpZ8|Eu z!E|fTzsh~&`?tKQmI2(0A~*~-`ve(k;9x&JjRgd_my<0_(WwIDD=iGmYsowsc03k9Ljv$v`{Mx^8p|Lz(5l5xY`GR0PO-27>mHeiiZF$ zs09TqppgwdtFype@&$$f(|mC3CLoQ2Gf)8Bic{)}U(s2I#G{$DocQ04Ns)RY<6;4^;{^yArL^rmSm!&%iCCmR{ zvr;89P){+CI)osT!;@ebCJ$!@NpI#)Zul%Y3d6wApEtYeLUvpHM@svxg5 zr>cqpW_-X#U~5G&s){~=N=st~^|41ifJ_D+43!&9 z8r#>LLA(HI8gP)vfD9=I!2%RhK{Jsqpd%ZK$^%3acjd%U4KPs1_yB2Iw7ltt$sW{G z%S2mB4V44NMA4V4Ut75fK+N3w`l`uu#GzW?FDfLF$pwPg)`6CYmy>@Ik|u)-5CMY@ zxQ#rzIT;WD70|&x@tYs-IGe4X+xlrTbMHB(>Hq{vG+GTC0POOSyzQpk&4q73Jk~=i z0`Q)(RgaPTWj&#Pu?Ao!&XW$i(*!Dsm4U{9+oRJ=mPG2csOB?uDC-9h%y*!Z)F`x$ zwDn5EiMIhQUS#fyWva_rgRrcWCN13ag1i({h!0@>@6~~QS5DBH?3BF*^~G#4S0Z$> zkDLLJK}%^C2!=k-nm0OE&!V=99HC1(Q!(;QS6V2jFWP-m%D}GzbX_5Yu{`( zbKM90y{m^L8m*7wzJICO@OK`pT(&t#s?C8e+S&XEa$ffwNRf)Nk$ zt=J{_uCd|P`s|;YBHk=8RW*ycsv){6hY`ZLTFR1znGc+4i?nX+ejob? 
zRDJi=;i9%lj_>hHeZkZ_JOIfxlwi%-+_%Jmajw!h@1Kr(zZZ9R5STa%YTwJ0T*n2C z0f=AW6MnRg>aS>hqRx_ebVBgN8gk09eGkydn<`3)!80T4I9O_)Y;qyclD4vOie?$5 zKhF3B?6dR4^i*3f+j5L1sg-f0vqvRW3nZn2Qd^LftT7?oeXU3YCLv4S-*_tBtR$kb z;62oZO1jxfWEw-3#~4+so@J^gI+Z88%D%ZXd%1T-zw`m=Km65jOD@A)pkc8RDve7% z)NI1}Jp|qG{tTG+>D0)Kd)ijx7KLoyA=?4l((J&&SIh3`0E^QksMB}_`T2oHK`Rz* z(infGouCD)Is-s}!U)D@h%dtWzzj@Qb_LcSX3TRid8M>sdGOkO!_&8K@y#TBa{ z&01jVEn8E8fw+y=>G0#rHqZ)*#EkYH_jJ#>ez_ukdAEvAU^We6f+DWF zR99HM?jco4SQvp*cWJ~tE)JoUl+@vTIYu?XdGN;PpCWtqm#TZ}kB%w*c*ej#47uGG zdu?d=^c!GOXQE@@+~h-NED1yW{eumi#JwH$;28ZhtPjZBNGj^z=<~a(xjrzF5E#xk z^F6t1G}mL{8#5fvxDRO6^-C!)p-DiUnrPfuiFw6C+LFnt zSKcMM72I?j@6Pw;e`+D15NJw5KS~%8@)wC{MSm#6)=GtYoz)E2q8 zCF&)#@^`++}7Cv2b{KF0X$3KEyt`61t7Fe)bqz9I+|cBU$9tOg#F18v&7f?--p!A`FOiRF_el(e zFpzGvcmhnbq~Za)SZ-<3CQupm=pt|Vlgl=yumo#b#+s4}r@+YpiPQrgC8^wu3@c-(an8MmJ!x~^)hke>So76J z)2Mwu!SiUPnXTw9JVq-4$7U}@DDC-M|E^@vPy}^sUrxhHZ zz--<|1PPvNw!jWeVFp+i>3-`3F^nAcjsD-aGp0{7VQQ?@vt&qh&BJ=HMv@Yh;OF(u zqwz2?2+rklFe-p>?ih(HD>5Gb3M23w8Pp3DM=ybGNCmvP&tJm3r`+Z}PzR@N%IYiu z4e>6BnPwU%*?HI)&I+75sQ9_caqx3B@$V5ftPVC$c1U1;$!JI|mt+6ti%Jov+JeWgZLMDOBjdEwLceI^#%enRm;Ki>Kt*vqIr%J$V#$XCLdWK1T0>De9UH`h}k+E%`HzsdWVpBbTQb zTvq>_Oa6K*0u{j+i($`_)BEel?+*kjfy3F2S$6+*`2X*5us#LKvOn%kJ%-Xd4D#k{!kHUPqF885Ht+ z$~xdfuyFjoM%dajfP6F{lsItruR-|d(jc?u3cOzff>+?~HDGK2v;{-$rV}^{n#@d*|2+l+4J>s(6K$rmlt}uaL$;BSD%0)} zXo1<)8kwd1pR={JMXui_Cv`{x6Tv?&7_$JJf1jDBhT(rc30*D;DY)7+kME5C_$XsZ zUmv(**c-6(NWy=<$l+#kj+6d@N-u}8-YigXIgCax}B15X( z0-_MG?FJFP#JhK(xhOv{rKmV-?(-13e8}+Y>L>X8%7}P7pb(%<0!U$~u-hqzf<4hD z*scMuK*6YDph91<0~?h6L5b5Rcj22u@V~Eb2t(ER69f99F~xC17~(EU{B?+0fgl(6 z)nb*n-i?0-y=&_ff7%|HsTG}K3DH)pgAYGL&I5oSR*N$tkq8N*gSM3mm!E?_KWYY` z@s*u%u9@KFK^ZMVYpw0;)*uk^tAn&+MNARMwS<5@>^|PNub>=w(RW8&Gp~p*vM>d7 zhC5`Bs*j^3N{8qE@$#f1cYXVneaxyS?ezpFT)MG^ub}_nA1GfUTV;5eIPC1iNmJ0W1O5XN%g0 z?vN`g2sD2D=)W7{4t7=!_?-rdWEuGCrpa98N)}SMu+f!mFjdtQ zcp29f?*MOgTH@E07N_|dt|6o_3I)k8;F`*|qZF~d;tf1So=OVUjTb(j6{(k(fc-5QVYSr%Wn}D}JAe^mzg(i>_k0p!Z8r(4l*fkFT zXzNy#O%?d)Cs^vAz5+~jZW6yy=bs*3tQ%kllJ;`t?P3}9lEP+ELQcyfW0}@$r)Fny z<{q}aswnyfcIC-3y&#Fld)qz$qJrh~>#1pA@=3P6pvR$2|J#3@4k7233rkZj%d8oq zcc3XqTQDv_j|ytnVIo-C4Oaa;zBWNvUxM18Ct%l!0TsFrZPD`u*)({jcv*Xa^rx*s zjU~Z2hto=0HU)~$U>sB5A6Wb}#^f0MA=wOf&!ftLCm??n#4aj@cG=H3cZzl=Ro>0EG)^gDcXz=EuG2_zqZ-8mHYa#kx z@#f4VBb3D24}2HhV)=6hS_b&k$UkO;F4==ppd!6+%saLg$BZu@6dZwl%t#?Wm4n$K@1foeVC^}JNRybD zD{9(83VoIXKvf!ahECzp@u=W!Gyn@ype}d|6*vlp^h*K|B2@QepLo$C?1N;Wj69e4 zC&)HyoVm{STg4hr=@j_x>~L2!!B&$5vk;IA^(56cfG(glojSo17%!lu3si) zx>nRgs$#(e)aQ2uNZ?Yk28w?SGSu$R-Z9BFgjb8GM;ahO2EagQuzF`-?}!O<(?V9^ zy4H9fCkp8-m%|Yr@j||_VpGJt?rTAShmY+WSJkd z30r+>m-lm_dGXxsk8r#JRnSrM_rACZeQ;JvbZ7Q5GrDE2$QSs{jrqV;Kd;pcGTw8o zWY6TIwiYHz$NdRcDc&NbP>O$v9ou)?9`$9?vO?}CAtlk*A2eJS86 z#vhWAxTeeO<7ITVI|BBGG2g7cVIj02%e;TS>(uVA-A0n)m_f`wSvZhDGVI)#%kY^a zS#T#d=lq$yHU8=ITNaI=s#($S>us)AV(M^Z)gPmu*DhZr$f2!w9 z!Q~4~4M9u+<>iPOTFw`ppem`IcO;EY_u($QwflVe1ibZf3D)D2gZzmFgNRsED*5c^ z%SzA6+PEmVADWLi!gOO|Fm5+z$q5d` z5MJHw!qjpj8<&4%EPspHwO!e(Ur?Y+d`~@{S@t$%AP^?n1nY9{&D%;$gGl9T+Dr4! 
z{ut1?1&a)p)eb9*`wSE^OB43lye^U?<;_Nt=@_ zRL}28EJG4=aaH@0@(VYa~g!nb5P zRm#FCKrv$p$X)G#6seT@fivTTBMVrbwF${8UqbzMl=_bEfpiPFqJaZedE6-T7%MfPuoo^&Gv5mE0F+}uPlOjlRu~~g<6mI(HXw8tBTLBJmapb!HR;}5eAaCKd`nA?>62l+nWTg4YzOB^@M~m zt8i8xJH{C=kX07?<~%RVOBQPn?7*jZZTW#_x&j`Vc{*sLNEhM@#%r56n`XC4&^@;J zh4CAcE%LH6Pty*&ETd>YWbe$1-eYK254>>p)N{u_JjND|g+sjCV#vbO+3A9YLeck% zQ^A}K^B$>;pV->2fc*inh zh+b>taE1$zwVm%#Y|ZwVT9&Xzdn&!77&+Gc4A$ID){#@89t11inw7ye;SFQElvfkU z2fE~X2fPXzj}R2$p7{GiKhr1y(?g!z6{xJT9s0esSSth*_uY#bpeIncsMA=MhTXFu z8jdaXFJLk&>HPKD&?c$HK`R)BMFc7hWWxR#_s%gbNb~N^4i-MwT^dQSuQgx0|0N$S zVZO}6~S@0m8OJEF~-yXn!YkUWR^ zIs4lMm%X0X?;fHE+>+S1ySM{}Ru>Rj;D=_kj+sK+whwq)zxMP)c1RlDZ@olk)Kn_j zeB0C%w9DdwN`cM-4=;&|wOg$I^`d}Ac(*eqM%4ESpUZ~=r5AR#(#RyMCzlf0Y#OCd zLu?zC-T?Ra%{hmR{v~nY35H`fc5WQCjF6wd{J5G(OW$LTg}XTO@A1B+DYTWPhzk$# zJ8=}f0s+dy527l5Otxy|)v5KyGm(s9;{zSaY4>P12J=AXs+so}FG!q@$*Q#xs=C?A zUFM-F%5*!{j(65Oy*)zIXxq4TI15$JTz2F58z$Mm1d&q{2`Kpn!|lj~<=YCx+*rNN zNKXa3AFiI`rsd%9=5u$esy%YrYmSb2q|NVPzW7Z6TjFvq#oy+YLH#LC&&Cru?{+$; zTuleCh;eL2@mT;rxz0Q$J28wPsqthSc{kb5$fc{QIaZ9xKOxgS4yRQW4He`!`nKoR zoli`nPMs)deG|`74})^&>9YeA#O2>2D-W5Z)1%{X@g#Kxq-nMHOUQ(B`J3?Xd&^2+ z9$g-tfMgeirfe18!_79@Yb~gtOIJYTe>gCamN^-fMi5)anXXX3Z@6hx8>sA-cZq`D zIsWT@Lo>Ws8huU~VZ$I?oamtFGqIX>?s$4mY86(*K)n!OR%C2XoWuCY(IxZ5VhE=t ztB;_q3C!q_zIH;8$mhPO^$qa4WL8Tx8+gog_ds{ytD*deob!hd99I0rm)YI>fz;%D zT9ee`F>?SW=wai*Y@q? zWXv9K%P%}H$-Mja>N2OZIc&P}BENbJWaMOH+oj;y1beAnJU05aoJ20@NbUyqaXTu5 zp9br4M`O-xhb(@!jCk6apzM^ko#Ck2zA3D5-jUs?h?fNTNmy(A(|m%1r;^kbC)mXU zRQF##Y9#i_(IQzJv~RjqC}bnnbhqWBk@{DXv?cqvT9DZCTT#?r{TM&Hf-vLM+^X!x zBT!E)*NklFaaDS(GLmEBl&y?m8nt*U{QJzRfeLx%fxx(AS+sWvfqP3_cz;VXqk__7 zIs9)9ch?RYUcF@6O00ckJq;PEp?kTv2>H2priSrE#vK;{�r$)PNu(MM*QeHCao_ z?vgb9$|#bMTx9QLMHZuK?~g5UZ+@gv4~$j}Jd1`#12g|r+cLZ zZJv{jK$cbD0vj%4FfT*X&b&9(;MJ`_DaSp-kFf#r9f(P{b30E@s}rl%nLoW~?bh5H zjX%z_8+c7Xrsa^cnYL?ZT_814On z7NObOC?k`bD4m+)zNZx%T6JFX#%Su3UhC{~WxAQvxlYWB9BI3qFKy4_OkgMl<*0aCysd#5=vk`~^p=nbEeO*8KiJ6Up?$)jbLKBb<_O@awNe9k;<4_1lJ z0vo@}8@U>6&;=w@2us!mRyJ1-Zvo%$j=jM|#GncjkGrv{!jg zIVy%&B@TmS(FM!O9v+(4peHA1J3YXEu8b~NQ)6HKBvWq+#oRqovaq*ir77286658wvvYNaLg{!Y4G%225VX$vo5eZ{o-z*&o5CN?LNE^cP0 zA*7d>xj8=Gf_88)q_Jv-%3mG9vMGBYohoy_206Pe_twl&F_j%Bf7Mb*p)l*Ral_iy z$F!dq)vlrE%#OY^kM51<(q+6m@O=-D05rk+RyyLe_%Vwp`Wv)=jD5CM8j(PJNA$W? 
zRTd?hczy9%oNosTUm^fostYo zTEtwPcl4Eh&MymeiB$3}5m#^8t$g_4)MB!bpKvsuX4-h#P>u;1M;5vml{B!@F38sy z27olvQ&l8;`Kx&N)m{TSH6sT6C7j95SW(U03+YQG@}i9wWjb5gyAzSKWOcOP+4XTy zQO&aRsDLue8HkBTWKYPjKE?KHuMc^1I~il_%(tYp>t@^CVNw#QjLJjvH&CXMWg~sJ z`ei_e_5S?)aVH8zHZZBNTvL@I$DSSwlCu@(6u~5R9 zVxryPD|Jr}(#&$6GgZ1N-}$tM^3s&@VpM?{&bdjkahnM%jo}fOmmd38BG>aDFRO(7 z7uexMJJ%>ChvN6yd{lJid4=^o6-eE)E}_cezpq!j`ITv~wyIX=d;wp)28Wx?@YMW>!of(^asvKA5K( zb}B79TF#q*GuyL#S{ZD>GlE4b3!R@{UOo)-&eL_{NoO+)&9%vhAIX`IZVE*%Xsc_& z>#tO`rry*#v>%0yhSUcr*x`5AgK`18WuMTNKv-2%7-BFib!}nUOP`Kje6k|(#;g;* zngYk09ZdFxd*j2nyclMmB~IH6`X_p{fVN)TrK+o88C6#Hi&Sg2Pl<+=t!}CMrYIW^ zODR3Bw8zlwDzHCMT;iy3GR%0jbxGMjv8OXLzBgSLxREo>POtpLoJCnWQL6^amf3mL ziEu%mNV{wU^{h1?-NNlyKh9dW>l&TIR)O2);I<;nHaYj;LJ8)8T!2YDNj^kZ-7Cfx zw{U-`y1j-;R#@5XNiXfMjj>Bme>Wxo+fm%vO)!MR-Ed2;KLDpjDW~e^1t2$9VLdnO z14288k0`{ZUp_qy6*;$6vKUbo8NO6Tt5%6~^a2@K>UDxX-I=6@HgVV%-Sm?{U%*X6 zKuNA})Y+K?3;yh|R_$}Rnu*TB;7XlZZ@wVM+PFTGY9tO_Z0*|Odg=nho9Y%DvHWx^ zIJf3Q`FvN;vO=#iy}VU!-r%}arHXZq{Ls(r5}@>~Dk*I+)>4()ji60-eCLfI=Q`&@ z=MH=l119rTPAx1O?kZb8;*PS$-kZSmPXItT45{wSMgs+j@I1ubU)4(+Q^>zlRh#sE zTg6&*w}Y?!cPVeD#8p-;GD0j+Q~}DTkJf{@w-JN^GN)WNr7L0(C>XdacSP{`8TT4P zAsVTIz_X`po3Q8hz!D%%Sw-4y4{^*@jnYYue1aC=9RvkV9};x;+HP$@HXh>h;aYX(xdUti>ZR>xOhawazKc5i0h&2mdw6cnv^|LOL`B$3R@*lUL9$%-1+iWR<3 zU{Y)1tliM!RVWUU*)%<*AG1Gb0doxwsMWkd0~anSfyz1Vs0x$5zCF;aFHw)`2Sz(QRV3N3!-bPpVk*TL!)-lk!CKO}3D{%gj0+jUB{f8Nb_X zeYku?Mw0>1!*D*;KE;X2qsq2|%Bk3Qm1kINZT@w<-mG}|uej*Ft|e4e77nQ6)r#%& z#_T4Dub_-5%)l9x1*Tjp<^|1V+&?)|N%A1bUsG>Q!s>3)lyu4vb19y{wNHlL`&uI( z-84EkdS8O{T=*Wy4dr`zK_$;7!qW=@9bq}QW z@aog`|4C{Lnm1e}-#K=)HfAcOpH47SZbGHksu^96bosOccnvbIvDw=#-gM=_BC8NN z6U#(+T%Xz27$Rp!PM`*%_M?g>H5Z})>-aJHtn!ho@!JFGgy+g-G4J&E<>ap=;O2(B zfg6M2!1>K*qP(n=b+pYb~^^#4D|~YBEnFB_sRixcQf2zL2X9 zNPI~1R2Wl#Bl)X&V>wV<-ks1OeBYk=y1KrME>cNVlI-JHZt0$987ZfUlTMzA#kEad z@#ByxsN4qVtTCqSW9^v$ITPz~oM>IWIsDI{=u$Stqf?N0*%~tXTL#FqT!=G$TDV)_J1NFVXR3C9}r!xs> zRu)DL3q4v6euc|Tr~ZPjPnO#7I7k{`6)h~*Os-C7K0ghzH>S;#)-2B{i#<0wvcpJsS;S*NyLcO$OgZu|1LG<4OS zVuIByqQ^M}mC+SOe8ldMJwd08JE1a9(|(V3pVIkj0@xn+{=1cC8SaeoL{mF+ydv8-cC5C+jD+XT-D-E(H+RU)$;sMutL3J1m)DTX z=ZD@6wr;Nv+F`53lJ}`$$rF*JQm>J-cc`e5zy7DK#dT;nW+(adB8Ojsgb3r3TcR8}+@5KG z8Dr7SgNk`%MWD7-+xn!+W%c$8C!I13wl~a46phd4KLfj~jcSn2^=@C$Wp;bEq%Ru? z@^V4G!EU)1TGm)`byxSRLq2*Qk`~41r#=$Nm%wTb*zNb1R`ts?`wUITnmU%T9qsm| z%xF8k%|tvk>Am>`B(^|0@WSWpg}Eumj(OgdoY`B*i}|CPvW>Cuos&LevL9R>7)T$* zL5c{oR7GZEnL}&}m(qU=Vc5eiJ!fN)qrt&E6#J}z7{UCsw&E+@Ky#|lBV4oDvrJ_+ zk~-aWdlf|SsT?#kN9jq)!J6l`{A$+|i5r$gj}D zY3;NA-Q!xF-+M~mCBA2z=$O!^Qk+#mqD`mo<5K)Fp@Atb(mLk{G#R({bUlMlCO|fQ zyTHr`ov6|Pxq56LHi4(YiYxQ;RJobWK@4&8;+KkFRxO3b!Xk$z;G~pXueU0H6LgN* zx85D6^3k4a4y-oXc+^7QBX)3*(c06_I9+gInR_r2J+hrLAxGp1qTOpI7=7u()W3DbiTth0tM<_-n! 
z$WP$wNroq*C4N{*6MG8P9{VgYIQ=mPmbjDdd<&)O50j}sL`nyzAl|_n?2GQL3zfbP zx0OcB%s1S9RZDg3I!eSvf;9l25hw?WMZc>SAfI@t1^|co_u+}|UVtOuI+Ug?um$Hq zX_|K29DK`I6``a8_}YWi=| z0vE)QO))4?@TnYqsB{Tt$t0+)h2_+9gNJoeX*J}L;}m|yz-f4};V zI`jW4H-v;QV&vqH2lImYPc-V(`H;C`A5rnS#Ur&wbiStLqBUNc7M1a5UTk7M3r8ddK=X5Kedl@_xe3V_u0t z4B$5Pkd9nT=ijX9r3H9Pe>F8b{C*Ao1C@Bog2S{&6uSSqSqGuu zX5I7-7d`pson@TG1&5QLKKj2W{l{(j-`#0T-CxC}zi)J6N#E@3EGos63=|steyC<_ zmRJr378Dp&VTfXMoPnnY6wD2O@WKbw;aoPurW01Hn^Z$?b zzA`MUuIrXmQjii*q`OloRuDuffipMi zqdb4lb*}Hc=e+*#(gl0p_g;IhwdNXYj4}CcABoirDPs6U<&27Z|7v!GM@Oj7N2YcX z+l%mt*aDvFN1O9FfnoH%jpXky2QN|?i`-c#aPGgI{kvcOV>-L{I2u?V_5j1L3=9DU z-MG$8-sHcrb{(i_LS+_j7|*8hf3CP}5%_DS&B(=+{pPmFSq(`mD^hc8f0o-HaS2~m zqHae{@j(xmGz(Sy7+=l+v33(Y&k+U`D6VceX>$7&x#qm~5>3$mJ_4wgOSLND{5*?7+)c|8!?01laes+oef3ZWjYu$^b_pku!!$siH17BlG_m zLcQ-H#9#aFalA%zBuxT76Cy(^KikmtXPwy5tGf^cbbcTE-}gSeT8bI7XzEi|O{-1@ zJsL16B@x#~P=aj&VQ(CEmp>L&{e6JcgA`Fs&@&Ia;=K<9y1fs_B~52#p_Wg-BKxyC zIi1qF&;N^@6fRXa{Pzu}$JpYgn zv|Jx~lE0M7zddQvumQV=*QTtLP>V+BV1_wnwmu1RR5gM9PM9tf9hHi0Vp*@Ala z9VAXYi#HdfsXG`;*4OHfQ;UHRn64f^uL@|Wx(c6|{+Y7^53mseG@m^u!KRK>;5r(n zU#-Mim}=B}c;|8%@@<>7lBfqSW$L~^SiQ-tdw*@FwH1*siU|24b-`-I*AFDtnuwzj z!i(Aap7X0+@5-&t`y-{WFdk+_YRvLIrdkD|T@rgc6!p9KD(P~v1VFOGcy+1oXMeGv zqxvmRVV$QaKbKmwp){b=untg_h==5KD1HBqA}-Jud(L*wuAKT1qg|PD$O|ONV<7aO zV0U!)^~X9)bU*Fe&|c1yGDB0^zo0w^E#=?!XiE>x}cfs z11VVthmZJs{B-q%gg-ZV1M%5cK=xips5@XhL%jR$JIUyhr0a zHtDj?X*~@TQZ=27x>enBuIP84pf+FE*)y zhkS&TeVM0fALDZ;G$oIgPjqw zzc{9uQBgY-WH(f0W7MYY(VJnq@e(bFPU@{J_q&I4O284T&oa@L-{)>jOl=>=V-7XyD$p#?8MGtX zU2tQkF+>FTZ3&ZQZN5-~R!&WYO+lOzKz%dWq`g@iAj1 zt`43{NI`24gq@wU5gmBnwyASLabto3{3ZWpv46wGTfVM+-~6kA?wW*XUnF4)i3Ydy=>R+-ZTSVVQ%SFr%aYwzk zASbyV9jPUHykOsa04(A(Se5sd?WzsL*R(X1wNq%`Hrr;EOEX&9 z@eQXzX$_^s1~;a`n|@c<34+Fu+QY#qSBqZ^L!HMf9^5+py0Laz8dj_iRNjK-eJQ&j zE`IsX3gDozES^z^g53ow1nnLn_fM8-@7UU9{fIwX*v$fptd-pVB&N?TnKYKyI!uV- zS%RIG;ysyftU=R=iUK*KaIhL2Lk`VVvcTvZ>$zvYZMhrwaBqFQZESB`z590Cs|$E4 zhCIy9KAKM+cfWyBE z6Pv=@4p45VdpytAv|U*MQt6%ihe)wx3e`}3y`Di*|0XzOOK8|EvNM+{)ecK73aSa2Zzx+8Xo`_4l@hvNT(X|c_wixC|+62 zoLw_NuY?1b%y&n$vVJLyfyXG>uSk}paU1noKI`l`>U{3ku=bW>>oN5(jE2Wg?oMpZ zrCnD^xUjpwS^MzvqTHW_Dg{yi!7!x^DHCi$nRs#PoEpmvDk1d)h1C91JJC`;{EM9& z53(_7%9XJDb=Y_)vJ)O6@H24ndaH#b5{?y)SVCEcJ*atb@@h-O>9>wXhR-j#|H)WX zAY)mn=1Ncu^uSh1;+cXRsAsl^ga(ZC_CRFhIeZP$1S@8a_RkGh5{n*H*g5%5Q4i#S zwzbPj1#=E?!Ap5(mXjG7$@+_!ZGg`XvZwU-_aZkAy0STxiLXN4n%5w`NwJcx8V5SB zxz8KLFe5nV{xz*cfk>;DV`w)?G%`=v29dt zAuR2)x5)bwfQRq932RG4?+Uoo<=M2#4#;d?{!TZM*%A|Yi+m^a*mK}w;e377`%dtp z2~ltNI;0=wf#iZqJYnkjr>x}PBShoS=;EwNa2ZS54Yow)g~p*$wY_;z;G2t5^pAnj z2vQvBTa(kDjdmLju-J7PtuS}41lXz9z$CQ-%V_w$M%5LEFj|WE+@2F@8}1A!^mm}o z#n)-=A3%cO>RrAcjYe$4w>AfhgH%ycchrYuTFX;Vwq{90U($O04J?hVe6)-Md1I2> zcWC`fZ#44_xB@3TVq=M8JkF3G=)JuBIJBb6kpD@Y&|p^QqW&XVL1!_=pGPrvQ}xck zObU(Bc~OxfzO)lMvVhlKmf*g`L&5a?3ZLsIWDc6=>SoNOpG6Eh@UGN3c0agx z|T_4q<9+ zs7^sr44B{24Ei+~=%iZGt#*15Mc3)X(y)9rAf_p(B;fQTl{)TrBYL>RoOiMID+K@~ zvANaCmX{ssW`bCI$0nER%a+EUb0NI5?<@GG?Ykx*@%297cORh;@*D!_kD|(i-p~Mn zdIIZ7Y*(&*Gd1|OqJPRY{@|@lu_rA@1(G;8qOBdT?9-^w)p1}DEN%T~fZ&ikC9nxh zvPLlI>T4SdeQX;EDCify=>&2X|BPXA2GiKlvz2;e?jJC4>{@u7?b8n-eYl{^q%Jq^ z`)SX8tV4+Fygw~x;P1dcgbR8TVRZX?>x;AgGvJY}M8;QWeo%WX;pk>{;WCWXn(CtT zxT81mhUOj<^vk~yh@V4iBOA8fCdN9de`Jx!?G~z`dHzWt{Duz@ zcFJ* z+UbB>n0yoiMZPG(|KJzRWPrFXAtmRx?h3{xt}OQ=Q#uUzKqo?Sai5K?wVA3%3<;pB zR>TZpQclsTEZ-M|{TG81Z|n|%jUwhTMVcBDboY+*qMv=GjJ3rvia>B`ni)}gZzy|G zdf@b|5pAD&bPD&w)M4UPsaKcy+~}qYM+({j=~IH-&7;Ac@gLu*|98Co-s(E=%RuC- z`=AN5lfd2lX9(_q{QMw2#U>{Jts4t;av})h^^-SJ!WYnq?vMSeyVa2IB88wq%)9Yx zlMQ*wXGn@BR+)|Fejvz(-^}Sr(?+=YHlnalx+_#f*H#P5ERrTpKxPxX19b`x5C{wW 
z7ySXxC~z$U2x=CJtRYD<@DVoOK*2^t_mK$tbFSLgAP(>528VJ`&JM$VrMyRUSxcl! zG4l&mkS^kw#4Z5jG6wln z=;`Zsc0rx~z5ANqbtp{UUK|;D%GhfPA&Z#cIDom$*awX!1N`E`bPkj#Fs#V@YCn+W2Vm_mA6_-$pZ#~0UNH8n(9!B{mr7upc@k%T}t~S zYky3_7m3H+VDPy-mIm^0(ex+TvrrWHEaGR3qBY$+x!>vI)dL1>}3ZW*nod#o%-a zL2=4mE{)tsbob-N-Sc4UMgJV!Z?a!q>CO*9!LL35s4+S;B*%>(@o%!Y1MpPiF4Thu zKfF4*TN;l$;`(6sfxr~&(asxS@0jt>W^AYkG1x(>oeadrTKTwJZz@)KI-3DA#}BaG zl;xKr_2xHhe=cZ>u=-zCzI){>cuB1jnpobS%a(eNk%#-3n{EX3qUAtjyDs)ggNj6T z+wr^ufz6rENAj-aI#(qLDBM>|(FH<17kIo2W!U}cYN%eV3-V);C`|8eR;<%DW+>@{l z0)lup#a3fP5I{!f4Jh?i)i{xP*Z{B=gq7E)1s``E;2LV!)Bh5-86!ULa*;8 z_;CaAZI?&+06P=vz8(OegAF}e%arakg5>jdB&ghTp)x%#*;WW26)tm|UH_R4#KnQQ zBa+Att0XU{L?Vh{G-<#$`XOSLh*#P8@HRL#*a4wdq1DkR@ca~*Ggx^ZTTccvON8RI zj{foAA;7sCZ1B>)@4qCGNAJ9*;D5paCRm*12hpiM%NH-)x%3x4!^C}wuf+V zEbuWq9hU~>R*Wf+4HWGm-cQfQbC;`C!PzSW+`K&RsM%qU}*?5b%0Lx89~$6StE zd|tnEt7c#<4|&Zd2}?mlx%8EXiBPXMYI>Q$b{MfO-51QAB9t>1Z-(|mb z@8#R#H1{SH%jPfS__ML2vg^(!zO7!5XDgoi@VzWIG-aqYkqL7VE;u(+?RDQ$j8uon z?WAEsx`Wo8n1GC15)mf5qi^Ys`+b#>_eoE+zz?#NrAZ|EqQS(+z&Az9SNw*PkooB~ zQGsq6dQ@Hwuh1_zLMBNMA_uvaP^bdDxwrc~NRd|NA~vm?6Gj9PJ-&{jW*$0lH(_kJGb>S0!gIo(3;jG<}*)*K}3d!-@#dk5O@(Bee~ zgQKYSs>;V)FQ+hRm}%&(e0~$^`F`+5*@5uz`xI3a6gE8tnG0HHA5~#I_KD>?=`7ox zXS5ApZ7Y*{FSN#T!}&cd6oqj~%s1?NG;e8fQe17Dy8dwYjG$`;MtDtQkl#%c@`2l^ zuHK_e4Gdx#Lg6$Cch6sJ&6Be-S^AEMHOcf;#f>{SIn{D0Bi>mo43^dOqXn{?okcH; z3P4kg;O6dJUrK{p5M%>*Eq6}4dw=+zO{L_bRN^rVw;-raVo)tEFDr7on?HSS(GXU zvFmE`iUk$6^6dh6(Fz4qSq-tp!5T2~bLqcLuxY9>$v;W;AmTYChI`*cyxexuOP(r0%(S>z z@<^P#PdW*jbqZ+q{R>q|%2L77@l>gKiJc{?op>n?I`&ijlw`KL?TUW37n65xu+<uLmr1H+c!Kik>%ckbQ1B!L62RQ27>8Wa=A3BkQ&40kHa%Aju?i+*Weq~PGOwJkJd+#Gg;y$dGy1rG-F4jzU$omdYrzex zR_dg455V`$HsT_!4d^|J=vWla<%>AO4;?|8>T@E|#|%L{D>k|!iJCL|9GojcTdo?cig@}7P>k^gA3j;WvUrbt8eq#nkS2K z)l}stPp^mVoBr@mKK57)42$h@JA@b0*ETTk(*LD(ij;DvfdMsC< zqo}E)FKuyYeQGMHf#=F`Rg{l)nMyuT;4x9RP0Y{g?elAH!JQ>;6(*Pq_@v4-;onJg6SkLtDt~hX?d=X zG|E#{EC@Dk@mUJ^atg#$RQWMf;y!L3iKU$ax#9H=kh~fjd`?i7e#k4EA2#9EbP4=) zl}0RY7iG6p(ph8`J3{nAy3 zW}q3TF_v~Bi%;W>?pb~4-hsR#wL@W6)jm@0qe7hmzG{&^?1EfTnH}GIq1@HSpgUd} zyC${)eo9-gY2|Z#|JX&nqq-k$NOQyNUh6;{0YQ;KZ072MoWy!`>G=eeZ!HJQ!L6XA zD?PvnW$-STsSZVd7e7V zb(_^_vV(F$ecdzt%oe+mO@N~L4sXBg>rXD*kwI%qrKyy!Ro}g=IL&vXw`^?8sP!PX zI?c*tf2&t8k$Qr$Z!S$Ow&0Uurc`#&uUu0cK;_#;xGl-ya_q9JAF|I!Rs#sH`thtZ z99j~ozSFuW0sm*ncuBf9;wAi{J#~>T&pHbyRF1_@*GE)oWc_~Hop3Lud5XA$q_ET5 zlPQT!V$DPw!#5XH^(%OU(Z-DOwPByKmG8Qu)ewg-ym(f}id#rqQ3{u^CEm)^W^UwO z#<|(X%M`{wv)!z@05h& zFX)0k57h$ZoZ3nmudS5Of1kT2wM%2~n>kKMAGjtseZU$xcVsJ(8d zBbcmy3x-5Bj-O)AW_ej#>0mPnSfqz7F}f_H;PhfdQw%MOx!99<^W~75bNP;pW;vuN zVw1mMl+B+iXKs4T%Z!fMhY`Si@`;fxg`p4^-NxgkAlulMsPM$bsDd->qXX;dw_gg4 zNb4HhU%UG*g>9oFM+sk|v*m`Br=1Vu(3QTv`-(kS!gdy_jb403gXu!nartgdZuVVU zM0RQsn8&KKhs*l!Ka*%K9Ey$T8eVwN@Qh7x$(s1u7?v=|mE}sb;9zhQ=p(Cn1J)%Z zvX^pL-MN%uJhn~h^P?s1LW>sXsMaSKh|-zPS#7h(FV~g!_+EEqaPSqDY_M*>*Ng;!iYxiB33=r|`gOwN%a%5rt<+wPH?8 zt`XsRp_+F|8>0}bCTQ?DUlhQbsgJIhQzXt3tbQ74l|pHq*&P=kp)$gX(eHEWn0~-g zJ&?I@PzzR*PThGX%eL1$%Ab)Z#;wRvc_rrbw*keTR#(IzC3NlY?wTJn_1mQ zF_)}7yXxbxn(FEJ1Oo95HA*7>#E{6ryrzzmSi&NNfdaLQsdDeyDVp5`Kj`AHVpjm1 zz+$=LNu;9sF;JF|a%GMvH&Wm%ld7MqMOZclmA*Xb+wcjke3bkVe!-8d-H6zykI6AN zf7^j~R191j^SDY5?%=d*#gyV|O`qh`-cjJDBp>DM7uZb6fG8t<(RZ2;M69;cB{O@h z_*vWSCvGdJLul>t;`^Jo7%jbCyb&Xw@YAl?2)yP-+V3@>TZQM;)up;6u^Pq+G`gpR zJrbaBe6}Fm;bZ(IH<`h;-Rq+LLWGx-r{wo+DL1c~!B^)B&OQCO<^pQUz_3?&N)Z3@*g!bt?E>G@xT(Pl8 zA717-hbc4Svl!l0Zv5i8liZmVG7gPj7;Xy?h7xkQcU^8(#AOJ*THnTU zl54$#6<;zT#_AN4Qj%0`-zUlhirev(FKqP{-?WevN0UCn#U`{+(qapBJx|obdlid) zq|m`I?8;7qqz>I<>Knn))tD?)da35U{d|-@&bnEfZf>%nTpa>?c@oYl511PSM5<@< 
From 0a8979d3ab80374eb4f84d08e060bcec70174c73 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Fri, 25 Mar 2016 16:52:11 -0700
Subject: [PATCH 043/210] ARROW-37: [C++ / Python] Implement BooleanArray and
 BooleanBuilder. Handle Python built-in bool

This patch implements `arrow::Type::BOOL` in bit-packed form. ARROW-59
(Support for boolean in Python lists) is also addressed here.

Author: Wes McKinney

Closes #40 from wesm/ARROW-37 and squashes the following commits:

f6b60f1 [Wes McKinney] Clean up template instantiations to fix clang
2d3b855 [Wes McKinney] Use gcc-4.9 to build thirdparty, too
74f704b [Wes McKinney] Try to fix clang issues, make some PrimitiveBuilder methods protected
dc8d0a4 [Wes McKinney] Fix up pyarrow per C++ API changes. Add boolean builtin conversion / API support
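One note on the representation before the diffs: "bit-packed form" stores one boolean per bit, LSB-first within each byte, the same layout the null bitmaps already use. A self-contained sketch of that layout follows; get_bit and set_bit here are illustrative stand-ins for the util::get_bit and util::set_bit helpers the patch calls, not Arrow code.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-ins for Arrow's util::get_bit / util::set_bit.
    static inline bool get_bit(const uint8_t* bits, int i) {
      return (bits[i / 8] >> (i % 8)) & 1;
    }
    static inline void set_bit(uint8_t* bits, int i) {
      bits[i / 8] |= 1 << (i % 8);
    }

    int main() {
      // Ten booleans fit in ceil(10 / 8) = 2 bytes instead of 10.
      bool values[10] = {true, false, true, true, false,
                         false, true, false, true, true};
      std::vector<uint8_t> packed((10 + 7) / 8, 0);
      for (int i = 0; i < 10; ++i) {
        if (values[i]) set_bit(packed.data(), i);
      }
      for (int i = 0; i < 10; ++i) {
        std::printf("%d", get_bit(packed.data(), i) ? 1 : 0);  // prints 1011001011
      }
      std::printf("\n");
      return 0;
    }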
---
 .travis.yml                                  |   4 +-
 cpp/src/arrow/api.h                          |   1 -
 cpp/src/arrow/array-test.cc                  |   5 +-
 cpp/src/arrow/builder.cc                     |   7 +
 cpp/src/arrow/builder.h                      |   2 +
 cpp/src/arrow/test-util.h                    |  31 +-
 cpp/src/arrow/type.h                         |  15 +-
 cpp/src/arrow/types/CMakeLists.txt           |   1 -
 cpp/src/arrow/types/boolean.h                |  32 --
 cpp/src/arrow/types/construct.cc             |   3 +-
 cpp/src/arrow/types/list-test.cc             |   5 +-
 cpp/src/arrow/types/list.h                   |  11 +-
 cpp/src/arrow/types/primitive-test.cc        | 214 ++++++++-----
 cpp/src/arrow/types/primitive.cc             | 179 ++++++++++-
 cpp/src/arrow/types/primitive.h              | 302 ++++++++++++-------
 cpp/src/arrow/types/string-test.cc           |   2 +-
 cpp/src/arrow/util/CMakeLists.txt            |   3 +-
 cpp/src/arrow/util/bit-util.cc               |  14 +-
 cpp/src/arrow/util/bit-util.h                |  13 +-
 python/pyarrow/includes/libarrow.pxd         |   3 +
 python/pyarrow/scalar.pyx                    |   6 +-
 python/pyarrow/tests/test_convert_builtin.py |   5 +-
 python/pyarrow/tests/test_scalars.py         |  39 ++-
 python/src/pyarrow/adapters/builtin.cc       |  26 +-
 24 files changed, 643 insertions(+), 280 deletions(-)
 delete mode 100644 cpp/src/arrow/types/boolean.h

diff --git a/.travis.yml b/.travis.yml
index 49a956ead3d..d89a200b892 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,10 +20,10 @@ matrix:
     language: cpp
     os: linux
     before_script:
-    - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh
-    script:
     - export CC="gcc-4.9"
     - export CXX="g++-4.9"
+    - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh
+    script:
     - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh
     - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh
   - compiler: clang
diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h
index 7be7f88c22e..2ae80f642f2 100644
--- a/cpp/src/arrow/api.h
+++ b/cpp/src/arrow/api.h
@@ -27,7 +27,6 @@
 #include "arrow/table.h"
 #include "arrow/type.h"
 
-#include "arrow/types/boolean.h"
 #include "arrow/types/construct.h"
 #include "arrow/types/list.h"
 #include "arrow/types/primitive.h"
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 7c6eaf55c0d..121b802d994 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -71,8 +71,7 @@ TEST_F(TestArray, TestIsNull) {
     if (x == 0) ++null_count;
   }
 
-  std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(null_bitmap.data(),
-      null_bitmap.size());
+  std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(null_bitmap);
   std::unique_ptr<Array> arr;
   arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf));
@@ -82,7 +81,7 @@ TEST_F(TestArray, TestIsNull) {
   ASSERT_TRUE(arr->null_bitmap()->Equals(*null_buf.get()));
 
   for (size_t i = 0; i < null_bitmap.size(); ++i) {
-    EXPECT_EQ(static_cast<bool>(null_bitmap[i]), !arr->IsNull(i)) << i;
+    EXPECT_EQ(null_bitmap[i], !arr->IsNull(i)) << i;
   }
 }
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 6a62dc3b0e0..4061f35fd5e 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -54,5 +54,12 @@ Status ArrayBuilder::Advance(int32_t elements) {
   return Status::OK();
 }
 
+Status ArrayBuilder::Reserve(int32_t elements) {
+  if (length_ + elements > capacity_) {
+    int32_t new_capacity = util::next_power2(length_ + elements);
+    return Resize(new_capacity);
+  }
+  return Status::OK();
+}
 
 }  // namespace arrow
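The ArrayBuilder::Reserve added above grows capacity to the next power of two covering length_ + elements rather than to the exact request, so a long run of appends reallocates only O(log n) times. A minimal sketch of that growth rule (this next_power2 is an illustrative stand-in, not Arrow's util::next_power2):

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-in for util::next_power2.
    static int32_t next_power2(int32_t n) {
      int32_t result = 1;
      while (result < n) result <<= 1;
      return result;
    }

    int main() {
      // Reserve-style growth: jump capacity to the next power of two covering
      // length_ + elements, so N appends trigger only O(log N) reallocations.
      int32_t length = 1000;
      int32_t capacity = 1024;
      int32_t elements = 9000;
      if (length + elements > capacity) {
        capacity = next_power2(length + elements);
      }
      assert(capacity == 16384);  // next_power2(10000)
      return 0;
    }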
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 308e54c80d7..d1a49dce799 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -69,6 +69,8 @@ class ArrayBuilder {
   // Resizes the null_bitmap array
   Status Resize(int32_t new_bits);
 
+  Status Reserve(int32_t extra_bits);
+
   // For cases where raw data was memcpy'd into the internal buffers, allows us
   // to advance the length of the builder. It is your responsibility to use
   // this function responsibly.
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index ea3ce5f7f53..b2bce269992 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -98,28 +98,27 @@ void randint(int64_t N, T lower, T upper, std::vector<T>* out) {
 }
 
+template <typename T>
+void random_real(int n, uint32_t seed, T min_value, T max_value,
+    std::vector<T>* out) {
+  std::mt19937 gen(seed);
+  std::uniform_real_distribution<T> d(min_value, max_value);
+  for (int i = 0; i < n; ++i) {
+    out->push_back(d(gen));
+  }
+}
+
+
 template <typename T>
 std::shared_ptr<Buffer> to_buffer(const std::vector<T>& values) {
   return std::make_shared<Buffer>(reinterpret_cast<const uint8_t*>(values.data()),
       values.size() * sizeof(T));
 }
 
-void random_null_bitmap(int64_t n, double pct_null, std::vector<uint8_t>* null_bitmap) {
-  Random rng(random_seed());
-  for (int i = 0; i < n; ++i) {
-    if (rng.NextDoubleFraction() > pct_null) {
-      null_bitmap->push_back(1);
-    } else {
-      // null
-      null_bitmap->push_back(0);
-    }
-  }
-}
-
-void random_null_bitmap(int64_t n, double pct_null, std::vector<bool>* null_bitmap) {
+void random_null_bitmap(int64_t n, double pct_null, uint8_t* null_bitmap) {
   Random rng(random_seed());
   for (int i = 0; i < n; ++i) {
-    null_bitmap->push_back(rng.NextDoubleFraction() > pct_null);
+    null_bitmap[i] = rng.NextDoubleFraction() > pct_null;
   }
 }
@@ -160,11 +159,11 @@ static inline int null_count(const std::vector<uint8_t>& valid_bytes) {
   return result;
 }
 
-std::shared_ptr<Buffer> bytes_to_null_buffer(uint8_t* bytes, int length) {
+std::shared_ptr<Buffer> bytes_to_null_buffer(const std::vector<uint8_t>& bytes) {
   std::shared_ptr<Buffer> out;
 
   // TODO(wesm): error checking
-  util::bytes_to_bits(bytes, length, &out);
+  util::bytes_to_bits(bytes, &out);
   return out;
 }
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 5984b6718dd..86e47791b7c 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -132,6 +132,10 @@ struct DataType {
     return children_.size();
   }
 
+  virtual int value_size() const {
+    return -1;
+  }
+
   virtual std::string ToString() const = 0;
 };
@@ -191,11 +195,14 @@ inline std::string PrimitiveType<Derived>::ToString() const {
 #define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \
   typedef C_TYPE c_type; \
   static constexpr Type::type type_enum = Type::ENUM; \
-  static constexpr int size = SIZE; \
   \
   TYPENAME() \
       : PrimitiveType<TYPENAME>() {} \
   \
+  virtual int value_size() const { \
+    return SIZE; \
+  } \
+  \
   static const char* name() { \
     return NAME; \
   }
@@ -295,6 +302,12 @@ struct StructType : public DataType {
   std::string ToString() const override;
 };
 
+// These will be defined elsewhere
+template <typename T>
+struct type_traits {
+};
+
+
 } // namespace arrow
 
 #endif // ARROW_TYPE_H
diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt
index 595b3be6e16..f3e41289bfe 100644
--- a/cpp/src/arrow/types/CMakeLists.txt
+++ b/cpp/src/arrow/types/CMakeLists.txt
@@ -21,7 +21,6 @@
 # Headers: top level
 install(FILES
-  boolean.h
   collection.h
   construct.h
   datetime.h
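The type_traits template added to type.h is deliberately empty ("defined elsewhere"); the diffs below call type_traits<T>::bytes_required(n) to size buffers. A hedged sketch of what two specializations might look like, assuming only the contract implied by those call sites, with BooleanType bit-packed at one bit per value:

    #include <cstdint>
    #include <iostream>

    struct Int32Type {};
    struct BooleanType {};

    template <typename T>
    struct type_traits {};

    template <>
    struct type_traits<Int32Type> {
      // Fixed-width type: one 4-byte slot per element.
      static int64_t bytes_required(int64_t elements) {
        return elements * static_cast<int64_t>(sizeof(int32_t));
      }
    };

    template <>
    struct type_traits<BooleanType> {
      // Bit-packed type: one bit per element, rounded up to whole bytes.
      static int64_t bytes_required(int64_t elements) {
        return (elements + 7) / 8;
      }
    };

    int main() {
      std::cout << type_traits<Int32Type>::bytes_required(1000) << "\n";    // 4000
      std::cout << type_traits<BooleanType>::bytes_required(1000) << "\n";  // 125
      return 0;
    }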
diff --git a/cpp/src/arrow/types/boolean.h b/cpp/src/arrow/types/boolean.h
deleted file mode 100644
index 1cb91f9ba49..00000000000
--- a/cpp/src/arrow/types/boolean.h
+++ /dev/null
@@ -1,32 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_TYPES_BOOLEAN_H
-#define ARROW_TYPES_BOOLEAN_H
-
-#include "arrow/types/primitive.h"
-
-namespace arrow {
-
-// typedef PrimitiveArrayImpl<BooleanType> BooleanArray;
-
-class BooleanBuilder : public ArrayBuilder {
-};
-
-} // namespace arrow
-
-#endif // ARROW_TYPES_BOOLEAN_H
diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc
index df2317c340b..34647a5005b 100644
--- a/cpp/src/arrow/types/construct.cc
+++ b/cpp/src/arrow/types/construct.cc
@@ -51,7 +51,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
     BUILDER_CASE(UINT64, UInt64Builder);
     BUILDER_CASE(INT64, Int64Builder);
 
-    // BUILDER_CASE(BOOL, BooleanBuilder);
+    BUILDER_CASE(BOOL, BooleanBuilder);
 
     BUILDER_CASE(FLOAT, FloatBuilder);
     BUILDER_CASE(DOUBLE, DoubleBuilder);
@@ -83,6 +83,7 @@ Status MakePrimitiveArray(const std::shared_ptr<DataType>& type,
     int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap,
     std::shared_ptr<Array>* out) {
   switch (type->type) {
+    MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray);
     MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array);
     MAKE_PRIMITIVE_ARRAY_CASE(INT8, Int8Array);
     MAKE_PRIMITIVE_ARRAY_CASE(UINT16, UInt16Array);
diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc
index eb55ca868ee..4eb560ea522 100644
--- a/cpp/src/arrow/types/list-test.cc
+++ b/cpp/src/arrow/types/list-test.cc
@@ -116,11 +116,14 @@ TEST_F(TestListBuilder, TestBasics) {
 
   Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());
 
+  EXPECT_OK(builder_->Reserve(lengths.size()));
+  EXPECT_OK(vb->Reserve(values.size()));
+
   int pos = 0;
   for (size_t i = 0; i < lengths.size(); ++i) {
     ASSERT_OK(builder_->Append(is_null[i] > 0));
     for (int j = 0; j < lengths[i]; ++j) {
-      ASSERT_OK(vb->Append(values[pos++]));
+      vb->Append(values[pos++]);
     }
   }
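Uncommenting BUILDER_CASE(BOOL, BooleanBuilder) wires the BOOL enum into MakeBuilder's type dispatch. The macro's body is not shown in this patch, so the following is a reduced, runnable toy of the dispatch shape only; the names shadow Arrow's purely for illustration:

    #include <iostream>
    #include <memory>
    #include <string>

    struct Type { enum type { BOOL, INT64 }; };

    struct ArrayBuilder {
      virtual ~ArrayBuilder() = default;
      virtual std::string name() const = 0;
    };
    struct BooleanBuilder : ArrayBuilder {
      std::string name() const override { return "BooleanBuilder"; }
    };
    struct Int64Builder : ArrayBuilder {
      std::string name() const override { return "Int64Builder"; }
    };

    // Each BUILDER_CASE presumably expands to a case like the ones below.
    bool MakeBuilder(Type::type t, std::unique_ptr<ArrayBuilder>* out) {
      switch (t) {
        case Type::BOOL: out->reset(new BooleanBuilder()); return true;
        case Type::INT64: out->reset(new Int64Builder()); return true;
        default: return false;  // stands in for Status::NotImplemented
      }
    }

    int main() {
      std::unique_ptr<ArrayBuilder> builder;
      if (MakeBuilder(Type::BOOL, &builder)) {
        std::cout << builder->name() << "\n";  // BooleanBuilder
      }
      return 0;
    }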
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index 72e20e943c3..8073b512176 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -116,7 +116,8 @@ class ListBuilder : public Int32Builder {
       int32_t new_capacity = util::next_power2(length_ + length);
       RETURN_NOT_OK(Resize(new_capacity));
     }
-    memcpy(raw_buffer() + length_, values, length * elsize_);
+    memcpy(raw_data_ + length_, values,
+        type_traits<Int32Type>::bytes_required(length));
 
     if (valid_bytes != nullptr) {
       AppendNulls(valid_bytes, length);
@@ -132,13 +133,13 @@ class ListBuilder : public Int32Builder {
 
     // Add final offset if the length is non-zero
     if (length_) {
-      raw_buffer()[length_] = items->length();
+      raw_data_[length_] = items->length();
    }
 
-    auto result = std::make_shared<ListArray>(type_, length_, values_, items,
+    auto result = std::make_shared<ListArray>(type_, length_, data_, items,
        null_count_, null_bitmap_);
 
-    values_ = null_bitmap_ = nullptr;
+    data_ = null_bitmap_ = nullptr;
    capacity_ = length_ = null_count_ = 0;
 
    return result;
@@ -162,7 +163,7 @@ class ListBuilder : public Int32Builder {
    } else {
      util::set_bit(null_bitmap_data_, length_);
    }
-    raw_buffer()[length_++] = value_builder_->length();
+    raw_data_[length_++] = value_builder_->length();
    return Status::OK();
  }
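For context on the "final offset" written above: a list array is a child values array plus an int32 offsets array, where list i spans [offsets[i], offsets[i+1]). Writing items->length() as the last offset makes the final list's end well defined. A minimal sketch with hard-coded data, not Arrow code:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // Three lists over one child array: [1,2], [3], [4,5,6].
      std::vector<int32_t> values = {1, 2, 3, 4, 5, 6};
      // List i spans [offsets[i], offsets[i+1]); the trailing offset is
      // values.size(), which is what Finish() records as the final offset.
      std::vector<int32_t> offsets = {0, 2, 3, 6};

      for (std::size_t i = 0; i + 1 < offsets.size(); ++i) {
        std::cout << "list " << i << ":";
        for (int32_t j = offsets[i]; j < offsets[i + 1]; ++j) {
          std::cout << " " << values[j];
        }
        std::cout << "\n";
      }
      return 0;
    }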
diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc
index 10ba113c591..761845d9381 100644
--- a/cpp/src/arrow/types/primitive-test.cc
+++ b/cpp/src/arrow/types/primitive-test.cc
@@ -69,11 +69,11 @@ PRIMITIVE_TEST(BooleanType, BOOL, "bool");
 // ----------------------------------------------------------------------
 // Primitive type tests
 
-TEST_F(TestBuilder, TestResize) {
+TEST_F(TestBuilder, TestReserve) {
   builder_->Init(10);
   ASSERT_EQ(2, builder_->null_bitmap()->size());
 
-  builder_->Resize(30);
+  builder_->Reserve(30);
   ASSERT_EQ(4, builder_->null_bitmap()->size());
 }
 
@@ -83,8 +83,9 @@ class TestPrimitiveBuilder : public TestBuilder {
   typedef typename Attrs::ArrayType ArrayType;
   typedef typename Attrs::BuilderType BuilderType;
   typedef typename Attrs::T T;
+  typedef typename Attrs::Type Type;
 
-  void SetUp() {
+  virtual void SetUp() {
     TestBuilder::SetUp();
 
     type_ = Attrs::type();
@@ -99,58 +100,44 @@ class TestPrimitiveBuilder : public TestBuilder {
 
   void RandomData(int N, double pct_null = 0.1) {
     Attrs::draw(N, &draws_);
-    test::random_null_bitmap(N, pct_null, &valid_bytes_);
+
+    valid_bytes_.resize(N);
+    test::random_null_bitmap(N, pct_null, valid_bytes_.data());
   }
 
-  void CheckNullable() {
-    int size = builder_->length();
+  void Check(const std::shared_ptr<BuilderType>& builder, bool nullable) {
+    int size = builder->length();
 
-    auto ex_data = std::make_shared<Buffer>(
-        reinterpret_cast<uint8_t*>(draws_.data()),
+    auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()),
         size * sizeof(T));
 
-    auto ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_.data(), size);
-    int32_t ex_null_count = test::null_count(valid_bytes_);
+    std::shared_ptr<Buffer> ex_null_bitmap;
+    int32_t ex_null_count = 0;
+
+    if (nullable) {
+      ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_);
+      ex_null_count = test::null_count(valid_bytes_);
+    } else {
+      ex_null_bitmap = nullptr;
+    }
 
     auto expected = std::make_shared<ArrayType>(size, ex_data, ex_null_count,
         ex_null_bitmap);
-
     std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(
-        builder_->Finish());
+        builder->Finish());
 
     // Builder is now reset
-    ASSERT_EQ(0, builder_->length());
-    ASSERT_EQ(0, builder_->capacity());
-    ASSERT_EQ(0, builder_->null_count());
-    ASSERT_EQ(nullptr, builder_->buffer());
+    ASSERT_EQ(0, builder->length());
+    ASSERT_EQ(0, builder->capacity());
+    ASSERT_EQ(0, builder->null_count());
+    ASSERT_EQ(nullptr, builder->data());
 
     ASSERT_EQ(ex_null_count, result->null_count());
     ASSERT_TRUE(result->EqualsExact(*expected.get()));
   }
 
-  void CheckNonNullable() {
-    int size = builder_nn_->length();
-
-    auto ex_data = std::make_shared<Buffer>(reinterpret_cast<uint8_t*>(draws_.data()),
-        size * sizeof(T));
-
-    auto expected = std::make_shared<ArrayType>(size, ex_data);
-
-    std::shared_ptr<ArrayType> result = std::dynamic_pointer_cast<ArrayType>(
-        builder_nn_->Finish());
-
-    // Builder is now reset
-    ASSERT_EQ(0, builder_nn_->length());
-    ASSERT_EQ(0, builder_nn_->capacity());
-    ASSERT_EQ(nullptr, builder_nn_->buffer());
-
-    ASSERT_TRUE(result->EqualsExact(*expected.get()));
-    ASSERT_EQ(0, result->null_count());
-  }
-
  protected:
-  TypePtr type_;
-  TypePtr type_nn_;
+  std::shared_ptr<DataType> type_;
 
   shared_ptr<BuilderType> builder_;
   shared_ptr<BuilderType> builder_nn_;
 
   vector<T> draws_;
   vector<uint8_t> valid_bytes_;
 };
 
-#define PTYPE_DECL(CapType, c_type) \
-  typedef CapType##Array ArrayType; \
-  typedef CapType##Builder BuilderType; \
-  typedef CapType##Type Type; \
-  typedef c_type T; \
-  \
-  static TypePtr type() { \
-    return TypePtr(new Type()); \
+#define PTYPE_DECL(CapType, c_type) \
+  typedef CapType##Array ArrayType; \
+  typedef CapType##Builder BuilderType; \
+  typedef CapType##Type Type; \
+  typedef c_type T; \
+  \
+  static std::shared_ptr<DataType> type() { \
+    return std::shared_ptr<DataType>(new Type()); \
   }
 
 #define PINT_DECL(CapType, c_type, LOWER, UPPER) \
   struct P##CapType { \
     PTYPE_DECL(CapType, c_type); \
     static void draw(int N, vector<c_type>* draws) { \
       test::randint(N, LOWER, UPPER, draws); \
     } \
   }
 
@@ -176,6 +163,14 @@
+#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \
+  struct P##CapType { \
+    PTYPE_DECL(CapType, c_type); \
+    static void draw(int N, vector<c_type>* draws) { \
+      test::random_real(N, 0, LOWER, UPPER, draws); \
+    } \
+  }
+
 PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX);
 PINT_DECL(UInt16, uint16_t, 0, UINT16_MAX);
 PINT_DECL(UInt32, uint32_t, 0, UINT32_MAX);
 PINT_DECL(UInt64, uint64_t, 0, UINT64_MAX);
 
 PINT_DECL(Int8, int8_t, INT8_MIN, INT8_MAX);
 PINT_DECL(Int16, int16_t, INT16_MIN, INT16_MAX);
 PINT_DECL(Int32, int32_t, INT32_MIN, INT32_MAX);
 PINT_DECL(Int64, int64_t, INT64_MIN, INT64_MAX);
 
-typedef ::testing::Types<PUInt8, PUInt16, PUInt32, PUInt64,
-    PInt8, PInt16, PInt32, PInt64> Primitives;
+PFLOAT_DECL(Float, float, -1000, 1000);
+PFLOAT_DECL(Double, double, -1000, 1000);
+
+struct PBoolean {
+  PTYPE_DECL(Boolean, uint8_t);
+};
+
+template <>
+void TestPrimitiveBuilder<PBoolean>::RandomData(int N, double pct_null) {
+  draws_.resize(N);
+  valid_bytes_.resize(N);
+
+  test::random_null_bitmap(N, 0.5, draws_.data());
+  test::random_null_bitmap(N, pct_null, valid_bytes_.data());
+}
+
+template <>
+void TestPrimitiveBuilder<PBoolean>::Check(
+    const std::shared_ptr<BooleanBuilder>& builder, bool nullable) {
+  int size = builder->length();
+
+  auto ex_data = test::bytes_to_null_buffer(draws_);
+
+  std::shared_ptr<Buffer> ex_null_bitmap;
+  int32_t ex_null_count = 0;
+
+  if (nullable) {
+    ex_null_bitmap = test::bytes_to_null_buffer(valid_bytes_);
+    ex_null_count = test::null_count(valid_bytes_);
+  } else {
+    ex_null_bitmap = nullptr;
+  }
+
+  auto expected = std::make_shared<BooleanArray>(size, ex_data, ex_null_count,
+      ex_null_bitmap);
+  std::shared_ptr<BooleanArray> result = std::dynamic_pointer_cast<BooleanArray>(
+      builder->Finish());
+
+  // Builder is now reset
+  ASSERT_EQ(0, builder->length());
+  ASSERT_EQ(0, builder->capacity());
+  ASSERT_EQ(0, builder->null_count());
+  ASSERT_EQ(nullptr, builder->data());
+
+  ASSERT_EQ(ex_null_count, result->null_count());
+
+  ASSERT_EQ(expected->length(), result->length());
+
+  for (int i = 0; i < result->length(); ++i) {
+    if (nullable) {
+      ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i;
+    }
+    bool actual = util::get_bit(result->raw_data(), i);
+    ASSERT_EQ(static_cast<bool>(draws_[i]), actual) << i;
+  }
+  ASSERT_TRUE(result->EqualsExact(*expected.get()));
+}
+
+typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64,
+    PInt8, PInt16, PInt32, PInt64, PFloat, PDouble> Primitives;
 
 TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives);
 
 #define DECL_T() \
   typedef typename TestFixture::T T;
 
+#define DECL_TYPE() \
+  typedef typename TestFixture::Type Type;
+
 #define DECL_ARRAYTYPE() \
   typedef typename TestFixture::ArrayType ArrayType;
 
 TYPED_TEST(TestPrimitiveBuilder, TestInit) {
-  DECL_T();
+  DECL_TYPE();
 
   int n = 1000;
-  ASSERT_OK(this->builder_->Init(n));
-  ASSERT_EQ(n, this->builder_->capacity());
-  ASSERT_EQ(n * sizeof(T), this->builder_->buffer()->size());
+  ASSERT_OK(this->builder_->Reserve(n));
+  ASSERT_EQ(util::next_power2(n), this->builder_->capacity());
+  ASSERT_EQ(util::next_power2(type_traits<Type>::bytes_required(n)),
+      this->builder_->data()->size());
 
   // unsure if this should go in all builder classes
   ASSERT_EQ(0, this->builder_->num_children());
this->builder_->num_children()); @@ -235,12 +294,14 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { this->RandomData(size); + this->builder_->Reserve(size); + int i; for (i = 0; i < size; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); } } @@ -261,31 +322,41 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { this->RandomData(size); + this->builder_->Reserve(1000); + this->builder_nn_->Reserve(1000); + int i; + int null_count = 0; // Append the first 1000 for (i = 0; i < 1000; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); + ++null_count; } - ASSERT_OK(this->builder_nn_->Append(draws[i])); + this->builder_nn_->Append(draws[i]); } + ASSERT_EQ(null_count, this->builder_->null_count()); + ASSERT_EQ(1000, this->builder_->length()); ASSERT_EQ(1024, this->builder_->capacity()); ASSERT_EQ(1000, this->builder_nn_->length()); ASSERT_EQ(1024, this->builder_nn_->capacity()); + this->builder_->Reserve(size - 1000); + this->builder_nn_->Reserve(size - 1000); + // Append the next 9000 for (i = 1000; i < size; ++i) { if (valid_bytes[i] > 0) { - ASSERT_OK(this->builder_->Append(draws[i])); + this->builder_->Append(draws[i]); } else { - ASSERT_OK(this->builder_->AppendNull()); + this->builder_->AppendNull(); } - ASSERT_OK(this->builder_nn_->Append(draws[i])); + this->builder_nn_->Append(draws[i]); } ASSERT_EQ(size, this->builder_->length()); @@ -294,8 +365,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { ASSERT_EQ(size, this->builder_nn_->length()); ASSERT_EQ(util::next_power2(size), this->builder_nn_->capacity()); - this->CheckNullable(); - this->CheckNonNullable(); + this->Check(this->builder_, true); + this->Check(this->builder_nn_, false); } @@ -327,31 +398,34 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { ASSERT_EQ(size, this->builder_->length()); ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); - this->CheckNullable(); - this->CheckNonNullable(); + this->Check(this->builder_, true); + this->Check(this->builder_nn_, false); } TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { int n = 1000; - ASSERT_OK(this->builder_->Init(n)); + ASSERT_OK(this->builder_->Reserve(n)); ASSERT_OK(this->builder_->Advance(100)); ASSERT_EQ(100, this->builder_->length()); ASSERT_OK(this->builder_->Advance(900)); - ASSERT_RAISES(Invalid, this->builder_->Advance(1)); + + int too_many = this->builder_->capacity() - 1000 + 1; + ASSERT_RAISES(Invalid, this->builder_->Advance(too_many)); } TYPED_TEST(TestPrimitiveBuilder, TestResize) { - DECL_T(); + DECL_TYPE(); int cap = MIN_BUILDER_CAPACITY * 2; - ASSERT_OK(this->builder_->Resize(cap)); + ASSERT_OK(this->builder_->Reserve(cap)); ASSERT_EQ(cap, this->builder_->capacity()); - ASSERT_EQ(cap * sizeof(T), this->builder_->buffer()->size()); - ASSERT_EQ(util::ceil_byte(cap) / 8, this->builder_->null_bitmap()->size()); + ASSERT_EQ(type_traits::bytes_required(cap), this->builder_->data()->size()); + ASSERT_EQ(util::bytes_for_bits(cap), + this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index ecd5d68ff45..c54d0757c47 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -20,20 +20,20 @@ 
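(Aside on the capacity assertions in the tests above: they all reduce to two pieces of arithmetic. util::next_power2 rounds a requested element count up to the next power of two, while type_traits<T>::bytes_required and util::bytes_for_bits translate element counts into value-buffer and null-bitmap sizes. A minimal standalone sketch of that arithmetic, reimplemented here for illustration only; the names mirror the patch but this is not the library code:

#include <cstdint>
#include <iostream>

// Illustrative stand-in for util::next_power2: round up to a power of two.
int64_t next_power2(int64_t n) {
  n--;
  n |= n >> 1; n |= n >> 2; n |= n >> 4;
  n |= n >> 8; n |= n >> 16; n |= n >> 32;
  return n + 1;
}

// Illustrative stand-in for util::bytes_for_bits: bytes needed to hold n bits.
int64_t bytes_for_bits(int64_t n) { return (n + 7) / 8; }

int main() {
  // Reserve(1000) on an int32 builder, per the tests above:
  std::cout << next_power2(1000) << "\n";                  // capacity: 1024
  std::cout << next_power2(1000) * sizeof(int32_t) << "\n"; // value buffer: 4096 bytes
  std::cout << bytes_for_bits(next_power2(1000)) << "\n";   // null bitmap: 128 bytes
  return 0;
}
)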
#include #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" namespace arrow { // ---------------------------------------------------------------------- // Primitive array base -PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, int value_size, +PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) : Array(type, length, null_count, null_bitmap) { data_ = data; raw_data_ = data == nullptr? nullptr : data_->data(); - value_size_ = value_size; } bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { @@ -52,12 +52,15 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; + int value_size = type_->value_size(); + DCHECK_GT(value_size, 0); + for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_size_)) { + if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; } - this_data += value_size_; - other_data += value_size_; + this_data += value_size; + other_data += value_size; } return true; } else { @@ -73,4 +76,170 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +template +Status PrimitiveBuilder::Init(int32_t capacity) { + RETURN_NOT_OK(ArrayBuilder::Init(capacity)); + data_ = std::make_shared(pool_); + + int64_t nbytes = type_traits::bytes_required(capacity); + RETURN_NOT_OK(data_->Resize(nbytes)); + memset(data_->mutable_data(), 0, nbytes); + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return Status::OK(); +} + +template +Status PrimitiveBuilder::Resize(int32_t capacity) { + // XXX: Set floor size for now + if (capacity < MIN_BUILDER_CAPACITY) { + capacity = MIN_BUILDER_CAPACITY; + } + + if (capacity_ == 0) { + RETURN_NOT_OK(Init(capacity)); + } else { + RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); + + int64_t old_bytes = data_->size(); + int64_t new_bytes = type_traits::bytes_required(capacity); + RETURN_NOT_OK(data_->Resize(new_bytes)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } + capacity_ = capacity; + return Status::OK(); +} + +template +Status PrimitiveBuilder::Reserve(int32_t elements) { + if (length_ + elements > capacity_) { + int32_t new_capacity = util::next_power2(length_ + elements); + return Resize(new_capacity); + } + return Status::OK(); +} + +template +Status PrimitiveBuilder::Append(const value_type* values, int32_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(PrimitiveBuilder::Reserve(length)); + + if (length > 0) { + memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); + } + + if (valid_bytes != nullptr) { + PrimitiveBuilder::AppendNulls(valid_bytes, length); + } else { + for (int i = 0; i < length; ++i) { + util::set_bit(null_bitmap_data_, length_ + i); + } + } + + length_ += length; + return Status::OK(); +} + +template +void PrimitiveBuilder::AppendNulls(const uint8_t* valid_bytes, int32_t length) { + // If valid_bytes is all not null, then none of the values are null + for (int i = 0; i < length; ++i) { + if (valid_bytes[i] == 0) { + ++null_count_; + } else { + util::set_bit(null_bitmap_data_, length_ + i); + } + } +} + +template +std::shared_ptr PrimitiveBuilder::Finish() { + std::shared_ptr result = std::make_shared< + typename type_traits::ArrayType>( + type_, length_, 
data_, null_count_, null_bitmap_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return result; +} + +template <> +Status PrimitiveBuilder::Append(const uint8_t* values, int32_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + for (int i = 0; i < length; ++i) { + if (values[i] > 0) { + util::set_bit(raw_data_, length_ + i); + } else { + util::clear_bit(raw_data_, length_ + i); + } + } + + if (valid_bytes != nullptr) { + PrimitiveBuilder::AppendNulls(valid_bytes, length); + } else { + for (int i = 0; i < length; ++i) { + util::set_bit(null_bitmap_data_, length_ + i); + } + } + length_ += length; + return Status::OK(); +} + +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + +BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, + int32_t null_count, + const std::shared_ptr& null_bitmap) : + PrimitiveArray(std::make_shared(), length, + data, null_count, null_bitmap) {} + +bool BooleanArray::EqualsExact(const BooleanArray& other) const { + if (this == &other) return true; + if (null_count_ != other.null_count_) { + return false; + } + + if (null_count_ > 0) { + bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_, + util::bytes_for_bits(length_)); + if (!equal_bitmap) { + return false; + } + + const uint8_t* this_data = raw_data_; + const uint8_t* other_data = other.raw_data_; + + for (int i = 0; i < length_; ++i) { + if (!IsNull(i) && util::get_bit(this_data, i) != util::get_bit(other_data, i)) { + return false; + } + } + return true; + } else { + return data_->Equals(*other.data_, util::bytes_for_bits(length_)); + } +} + +bool BooleanArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) return true; + if (Type::BOOL != arr->type_enum()) { + return false; + } + return EqualsExact(*static_cast(arr.get())); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 4eaff433229..ec6fee35513 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "arrow/array.h" #include "arrow/builder.h" @@ -37,7 +38,7 @@ class MemoryPool; // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray(const TypePtr& type, int32_t length, int value_size, + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -51,25 +52,19 @@ class PrimitiveArray : public Array { protected: std::shared_ptr data_; const uint8_t* raw_data_; - int value_size_; }; #define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ class NAME : public PrimitiveArray { \ public: \ using value_type = T; \ - NAME(const TypePtr& type, int32_t length, \ - const std::shared_ptr& data, \ - int32_t null_count = 0, \ - const std::shared_ptr& null_bitmap = nullptr) : \ - PrimitiveArray(std::make_shared(), length, \ - sizeof(T), data, null_count, null_bitmap) {} \ + using PrimitiveArray::PrimitiveArray; \ \ NAME(int32_t length, const std::shared_ptr& data, \ int32_t null_count = 0, \ const std::shared_ptr& null_bitmap = nullptr) : \ 
PrimitiveArray(std::make_shared(), length, \ - sizeof(T), data, null_count, null_bitmap) {} \ + data, null_count, null_bitmap) {} \ \ bool EqualsExact(const NAME& other) const { \ return PrimitiveArray::EqualsExact( \ @@ -96,148 +91,241 @@ NUMERIC_ARRAY_DECL(Int64Array, Int64Type, int64_t); NUMERIC_ARRAY_DECL(FloatArray, FloatType, float); NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double); -template +template class PrimitiveBuilder : public ArrayBuilder { public: typedef typename Type::c_type value_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), - values_(nullptr) { - elsize_ = sizeof(value_type); - } + data_(nullptr) {} virtual ~PrimitiveBuilder() {} - Status Resize(int32_t capacity) { - // XXX: Set floor size for now - if (capacity < MIN_BUILDER_CAPACITY) { - capacity = MIN_BUILDER_CAPACITY; - } - - if (capacity_ == 0) { - RETURN_NOT_OK(Init(capacity)); - } else { - RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); - RETURN_NOT_OK(values_->Resize(capacity * elsize_)); - } - capacity_ = capacity; - return Status::OK(); - } - - Status Init(int32_t capacity) { - RETURN_NOT_OK(ArrayBuilder::Init(capacity)); - values_ = std::make_shared(pool_); - return values_->Resize(capacity * elsize_); - } - - Status Reserve(int32_t elements) { - if (length_ + elements > capacity_) { - int32_t new_capacity = util::next_power2(length_ + elements); - return Resize(new_capacity); - } - return Status::OK(); - } + using ArrayBuilder::Advance; - Status Advance(int32_t elements) { - return ArrayBuilder::Advance(elements); - } + // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + void AppendNulls(const uint8_t* valid_bytes, int32_t length); - // Scalar append - Status Append(value_type val) { + Status AppendNull() { if (length_ == capacity_) { // If the capacity was not already a multiple of 2, do so here RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); } - util::set_bit(null_bitmap_data_, length_); - raw_buffer()[length_++] = val; + ++null_count_; + ++length_; return Status::OK(); } + std::shared_ptr data() const { + return data_; + } + // Vector append // // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot Status Append(const value_type* values, int32_t length, - const uint8_t* valid_bytes = nullptr) { - if (length_ + length > capacity_) { - int32_t new_capacity = util::next_power2(length_ + length); - RETURN_NOT_OK(Resize(new_capacity)); - } - if (length > 0) { - memcpy(raw_buffer() + length_, values, length * elsize_); - } + const uint8_t* valid_bytes = nullptr); - if (valid_bytes != nullptr) { - AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + // Ensure that builder can accommodate an additional number of + // elements. Resizes if the current capacity is not sufficient + Status Reserve(int32_t elements); - length_ += length; - return Status::OK(); + std::shared_ptr Finish() override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; + + Status Init(int32_t capacity); + + // Increase the capacity of the builder to accommodate at least the indicated + // number of elements + Status Resize(int32_t capacity); +}; + +template +class NumericBuilder : public PrimitiveBuilder { + public: + using typename PrimitiveBuilder::value_type; + using PrimitiveBuilder::PrimitiveBuilder; + + using PrimitiveBuilder::Append; + + // Scalar append. 
Does not capacity-check; make sure to call Reserve beforehand + void Append(value_type val) { + util::set_bit(null_bitmap_data_, length_); + raw_data_[length_++] = val; } - // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length) { - // If valid_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - if (valid_bytes[i] == 0) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + protected: + using PrimitiveBuilder::length_; + using PrimitiveBuilder::null_bitmap_data_; + using PrimitiveBuilder::raw_data_; + + using PrimitiveBuilder::Init; + using PrimitiveBuilder::Resize; +}; + +template <> +struct type_traits { + typedef UInt8Array ArrayType; + + static inline int bytes_required(int elements) { + return elements; } +}; - Status AppendNull() { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - ++null_count_; - ++length_; - return Status::OK(); +template <> +struct type_traits { + typedef Int8Array ArrayType; + + static inline int bytes_required(int elements) { + return elements; } +}; - std::shared_ptr Finish() override { - std::shared_ptr result = std::make_shared( - type_, length_, values_, null_count_, null_bitmap_); +template <> +struct type_traits { + typedef UInt16Array ArrayType; - values_ = null_bitmap_ = nullptr; - capacity_ = length_ = null_count_ = 0; - return result; + static inline int bytes_required(int elements) { + return elements * sizeof(uint16_t); } +}; + +template <> +struct type_traits { + typedef Int16Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int16_t); + } +}; - value_type* raw_buffer() { - return reinterpret_cast(values_->mutable_data()); +template <> +struct type_traits { + typedef UInt32Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(uint32_t); } +}; - std::shared_ptr buffer() const { - return values_; +template <> +struct type_traits { + typedef Int32Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int32_t); } +}; - protected: - std::shared_ptr values_; - int elsize_; +template <> +struct type_traits { + typedef UInt64Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(uint64_t); + } +}; + +template <> +struct type_traits { + typedef Int64Array ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(int64_t); + } +}; +template <> +struct type_traits { + typedef FloatArray ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(float); + } +}; + +template <> +struct type_traits { + typedef DoubleArray ArrayType; + + static inline int bytes_required(int elements) { + return elements * sizeof(double); + } }; // Builders -typedef PrimitiveBuilder UInt8Builder; -typedef PrimitiveBuilder UInt16Builder; -typedef PrimitiveBuilder UInt32Builder; -typedef PrimitiveBuilder UInt64Builder; +typedef NumericBuilder UInt8Builder; +typedef NumericBuilder UInt16Builder; +typedef NumericBuilder UInt32Builder; +typedef NumericBuilder UInt64Builder; + +typedef NumericBuilder Int8Builder; +typedef NumericBuilder Int16Builder; +typedef NumericBuilder Int32Builder; +typedef NumericBuilder Int64Builder; + +typedef NumericBuilder FloatBuilder; +typedef NumericBuilder 
DoubleBuilder; + -typedef PrimitiveBuilder Int8Builder; -typedef PrimitiveBuilder Int16Builder; -typedef PrimitiveBuilder Int32Builder; -typedef PrimitiveBuilder Int64Builder; +class BooleanArray : public PrimitiveArray { + public: + using PrimitiveArray::PrimitiveArray; + + BooleanArray(int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr); + + bool EqualsExact(const BooleanArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + + const uint8_t* raw_data() const { + return reinterpret_cast(raw_data_); + } + + bool Value(int i) const { + return util::get_bit(raw_data(), i); + } +}; + +template <> +struct type_traits { + typedef BooleanArray ArrayType; + + static inline int bytes_required(int elements) { + return util::bytes_for_bits(elements); + } +}; + +class BooleanBuilder : public PrimitiveBuilder { + public: + explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) : + PrimitiveBuilder(pool, type) {} + + virtual ~BooleanBuilder() {} + + using PrimitiveBuilder::Append; + + // Scalar append + void Append(bool val) { + util::set_bit(null_bitmap_data_, length_); + if (val) { + util::set_bit(raw_data_, length_); + } else { + util::clear_bit(raw_data_, length_); + } + ++length_; + } -typedef PrimitiveBuilder FloatBuilder; -typedef PrimitiveBuilder DoubleBuilder; + void Append(uint8_t val) { + Append(static_cast(val)); + } +}; } // namespace arrow diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index b329b4f0ef7..d3a4cc37f9c 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -92,7 +92,7 @@ class TestStringContainer : public ::testing::Test { offsets_buf_ = test::to_buffer(offsets_); - null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_.data(), valid_bytes_.size()); + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared(length_, offsets_buf_, values_, diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index fed05e3690c..d2a4b091fad 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -23,6 +23,7 @@ install(FILES bit-util.h buffer.h + logging.h macros.h memory-pool.h status.h @@ -59,7 +60,7 @@ if (ARROW_BUILD_BENCHMARKS) ) else() target_link_libraries(arrow_benchmark_main - benchmark + benchmark pthread ) endif() diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 292cb33887f..6c6d5330eab 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -16,6 +16,7 @@ // under the License. 
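(The builder split earlier in this patch separates a capacity-checked vector Append, kept in PrimitiveBuilder, from an unchecked scalar Append in NumericBuilder and BooleanBuilder that writes straight into raw_data_; that is why every append loop in this patch now calls Reserve first. A minimal usage sketch, assuming the API exactly as declared in the header above; Status returns from Reserve and AppendNull are ignored for brevity, and the MemoryPool header path is taken from the install list elsewhere in this patch:

#include <memory>

#include "arrow/types/primitive.h"   // Int32Builder, as defined above
#include "arrow/util/memory-pool.h"  // assumed location of MemoryPool

using namespace arrow;

// Builds [1, 2, 3, null]. Sketch only; not part of the patch.
std::shared_ptr<Array> BuildExample(MemoryPool* pool) {
  Int32Builder builder(pool, std::make_shared<Int32Type>());

  builder.Reserve(3);       // must precede the unchecked scalar Appends
  builder.Append(1);        // NumericBuilder::Append: no capacity check
  builder.Append(2);
  builder.Append(3);
  builder.AppendNull();     // AppendNull resizes on its own when full
  return builder.Finish();  // resets length/capacity/null_count to 0
}
)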
#include +#include #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -23,25 +24,24 @@ namespace arrow { -void util::bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits) { - for (int i = 0; i < length; ++i) { - if (static_cast(bytes[i])) { +void util::bytes_to_bits(const std::vector& bytes, uint8_t* bits) { + for (size_t i = 0; i < bytes.size(); ++i) { + if (bytes[i] > 0) { set_bit(bits, i); } } } -Status util::bytes_to_bits(uint8_t* bytes, int length, +Status util::bytes_to_bits(const std::vector& bytes, std::shared_ptr* out) { - int bit_length = ceil_byte(length) / 8; + int bit_length = util::bytes_for_bits(bytes.size()); auto buffer = std::make_shared(); RETURN_NOT_OK(buffer->Resize(bit_length)); memset(buffer->mutable_data(), 0, bit_length); - bytes_to_bits(bytes, length, buffer->mutable_data()); + bytes_to_bits(bytes, buffer->mutable_data()); *out = buffer; - return Status::OK(); } diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 08222d50894..8d6287130dd 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -20,6 +20,7 @@ #include #include +#include namespace arrow { @@ -43,15 +44,19 @@ static inline int64_t ceil_2bytes(int64_t size) { static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128}; static inline bool get_bit(const uint8_t* bits, int i) { - return bits[i / 8] & BITMASK[i % 8]; + return static_cast(bits[i / 8] & BITMASK[i % 8]); } static inline bool bit_not_set(const uint8_t* bits, int i) { return (bits[i / 8] & BITMASK[i % 8]) == 0; } +static inline void clear_bit(uint8_t* bits, int i) { + bits[i / 8] &= ~BITMASK[i % 8]; +} + static inline void set_bit(uint8_t* bits, int i) { - bits[i / 8] |= 1 << (i % 8); + bits[i / 8] |= BITMASK[i % 8]; } static inline int64_t next_power2(int64_t n) { @@ -66,8 +71,8 @@ static inline int64_t next_power2(int64_t n) { return n; } -void bytes_to_bits(uint8_t* bytes, int length, uint8_t* bits); -Status bytes_to_bits(uint8_t*, int, std::shared_ptr*); +void bytes_to_bits(const std::vector& bytes, uint8_t* bits); +Status bytes_to_bits(const std::vector&, std::shared_ptr*); } // namespace util diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index e6afcbd79b6..943a08f84a0 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -86,6 +86,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsNull(int i) + cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): + c_bool Value(int i) + cdef cppclass CUInt8Array" arrow::UInt8Array"(CArray): uint8_t Value(int i) diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 04f013d6ca7..0d391e5f26b 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -58,7 +58,10 @@ cdef class ArrayValue(Scalar): cdef class BooleanValue(ArrayValue): - pass + + def as_py(self): + cdef CBooleanArray* ap = self.sp_array.get() + return ap.Value(self.index) cdef class Int8Value(ArrayValue): @@ -172,6 +175,7 @@ cdef class ListValue(ArrayValue): cdef dict _scalar_classes = { + Type_BOOL: BooleanValue, Type_UINT8: Int8Value, Type_UINT16: Int16Value, Type_UINT32: Int32Value, diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 25f69691210..2beb6b39d73 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -22,7 +22,10 @@ class TestConvertList(unittest.TestCase): def test_boolean(self): - pass + arr = 
pyarrow.from_pylist([True, None, False, None]) + assert len(arr) == 4 + assert arr.null_count == 2 + assert arr.type == pyarrow.bool_() def test_empty_list(self): arr = pyarrow.from_pylist([]) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 021737db672..4fb850a4d47 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -16,67 +16,74 @@ # under the License. from pyarrow.compat import unittest, u -import pyarrow as arrow +import pyarrow as A class TestScalars(unittest.TestCase): def test_null_singleton(self): with self.assertRaises(Exception): - arrow.NAType() + A.NAType() def test_bool(self): - pass + arr = A.from_pylist([True, None, False, None]) + + v = arr[0] + assert isinstance(v, A.BooleanValue) + assert repr(v) == "True" + assert v.as_py() == True + + assert arr[1] is A.NA def test_int64(self): - arr = arrow.from_pylist([1, 2, None]) + arr = A.from_pylist([1, 2, None]) v = arr[0] - assert isinstance(v, arrow.Int64Value) + assert isinstance(v, A.Int64Value) assert repr(v) == "1" assert v.as_py() == 1 - assert arr[2] is arrow.NA + assert arr[2] is A.NA def test_double(self): - arr = arrow.from_pylist([1.5, None, 3]) + arr = A.from_pylist([1.5, None, 3]) v = arr[0] - assert isinstance(v, arrow.DoubleValue) + assert isinstance(v, A.DoubleValue) assert repr(v) == "1.5" assert v.as_py() == 1.5 - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[2] assert v.as_py() == 3.0 def test_string(self): - arr = arrow.from_pylist(['foo', None, u('bar')]) + arr = A.from_pylist(['foo', None, u('bar')]) v = arr[0] - assert isinstance(v, arrow.StringValue) + assert isinstance(v, A.StringValue) assert repr(v) == "'foo'" assert v.as_py() == 'foo' - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[2].as_py() assert v == 'bar' assert isinstance(v, str) def test_list(self): - arr = arrow.from_pylist([['foo', None], None, ['bar'], []]) + arr = A.from_pylist([['foo', None], None, ['bar'], []]) v = arr[0] assert len(v) == 2 - assert isinstance(v, arrow.ListValue) + assert isinstance(v, A.ListValue) assert repr(v) == "['foo', None]" assert v.as_py() == ['foo', None] assert v[0].as_py() == 'foo' - assert v[1] is arrow.NA + assert v[1] is A.NA - assert arr[1] is arrow.NA + assert arr[1] is A.NA v = arr[3] assert len(v) == 0 diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index acb13acecaf..78ef1b31f34 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -61,6 +61,8 @@ class ScalarVisitor { ++total_count_; if (obj == Py_None) { ++none_count_; + } else if (PyBool_Check(obj)) { + ++bool_count_; } else if (PyFloat_Check(obj)) { ++float_count_; } else if (IsPyInteger(obj)) { @@ -256,6 +258,20 @@ class TypedConverter : public SeqConverter { class BoolConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + if (item.obj() == Py_True) { + typed_builder_->Append(true); + } else { + typed_builder_->Append(false); + } + } + } return Status::OK(); } }; @@ -265,14 +281,15 @@ class Int64Converter : public TypedConverter { Status AppendData(PyObject* seq) override { int64_t val; Py_ssize_t size = PySequence_Size(seq); + 
RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + typed_builder_->AppendNull(); } else { val = PyLong_AsLongLong(item.obj()); RETURN_IF_PYERROR(); - RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + typed_builder_->Append(val); } } return Status::OK(); @@ -284,14 +301,15 @@ class DoubleConverter : public TypedConverter { Status AppendData(PyObject* seq) override { double val; Py_ssize_t size = PySequence_Size(seq); + RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + typed_builder_->AppendNull(); } else { val = PyFloat_AsDouble(item.obj()); RETURN_IF_PYERROR(); - RETURN_ARROW_NOT_OK(typed_builder_->Append(val)); + typed_builder_->Append(val); } } return Status::OK(); From d3cb6b47fde2935522b73c7150d83e364f4e19c9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 26 Mar 2016 17:07:40 -0700 Subject: [PATCH 044/210] ARROW-22: [C++] Convert flat Parquet schemas to Arrow schemas I'm going to limit the amount of nested data (especially repeated fields) cases in this patch as I haven't yet thought through the nested data reassembly from repetition / definition levels. Since the effective Arrow schemas may "collapse" multiple levels of nesting (for example: 3-level array encoding -- see https://github.com/apache/parquet-cpp/blob/master/src/parquet/schema/types.h), we'll need to track the logical correspondence between repetition and definition levels so that the right null bits can be set easily during reassembly. Closes #37. Closes #38. Closes #39 Author: Wes McKinney Author: Uwe L. Korn Closes #41 from wesm/ARROW-22 and squashes the following commits: f388210 [Wes McKinney] Correct typo in Layout.md (thanks @takahirox) e5c429a [Wes McKinney] Test for some unsupported Parquet schema types, add unannotated FIXED_LEN_BYTE_ARRAY to List 54daa9b [Wes McKinney] Refactor tests to invoke FromParquetSchema 74d6bae [Wes McKinney] Convert BYTE_ARRAY to StringType or List depending on the logical type b7b9ca9 [Uwe L. Korn] Add basic conversion for primitive types 0e2a7f1 [Uwe L. Korn] Add macro for adding dependencies to tests 0dd1109 [Uwe L. Korn] ARROW-78: Add constructor for DecimalType --- cpp/CMakeLists.txt | 11 ++ cpp/src/arrow/parquet/CMakeLists.txt | 8 +- cpp/src/arrow/parquet/parquet-schema-test.cc | 147 +++++++++++++++ cpp/src/arrow/parquet/schema.cc | 178 +++++++++++++++++++ cpp/src/arrow/parquet/schema.h | 44 +++++ cpp/src/arrow/types/decimal.cc | 32 ++++ cpp/src/arrow/types/decimal.h | 11 ++ cpp/src/arrow/util/status.h | 1 + format/Layout.md | 2 +- 9 files changed, 432 insertions(+), 2 deletions(-) create mode 100644 cpp/src/arrow/parquet/parquet-schema-test.cc create mode 100644 cpp/src/arrow/parquet/schema.cc create mode 100644 cpp/src/arrow/parquet/schema.h create mode 100644 cpp/src/arrow/types/decimal.cc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d701079b48..6ed2768d139 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,16 @@ function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARGN}) endfunction() +# A wrapper for target_link_libraries() that is compatible with NO_TESTS. 
+function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + target_link_libraries(${TEST_NAME} ${ARGN}) +endfunction() + enable_testing() ############################################################ @@ -528,6 +538,7 @@ set(ARROW_SRCS src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc + src/arrow/types/decimal.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 7b449affab0..0d5cf263ec3 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,17 +19,23 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + schema.cc ) set(PARQUET_LIBS + arrow + ${PARQUET_SHARED_LIB} ) -add_library(arrow_parquet STATIC +add_library(arrow_parquet SHARED ${PARQUET_SRCS} ) target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +ADD_ARROW_TEST(parquet-schema-test) +ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) + # Headers: top level install(FILES DESTINATION include/arrow/parquet) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc new file mode 100644 index 00000000000..9c3093d9ff7 --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
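(Context for the "3-level array encoding" that the ARROW-22 commit message references, before the body of the new schema test file: Parquet's standard list encoding wraps each element in two enclosing group levels, and schema conversion may collapse those levels into a single Arrow List type. A hedged sketch of the three levels using the same parquet_cpp node API the tests below use; whether GroupNode::Make accepts a LogicalType argument at this revision is an assumption:

// Parquet 3-level encoding of a nullable list<int32> column. Sketch only.
using parquet_cpp::Repetition;
using parquet_cpp::schema::GroupNode;
using parquet_cpp::schema::NodePtr;
using parquet_cpp::schema::PrimitiveNode;

// Level 3: the element leaf (OPTIONAL, so elements may be null)
NodePtr element = PrimitiveNode::Make(
    "element", Repetition::OPTIONAL, parquet_cpp::Type::INT32);
// Level 2: the repeated group, one repetition per list entry
NodePtr list = GroupNode::Make("list", Repetition::REPEATED, {element});
// Level 1: the LIST-annotated optional group (the column itself)
NodePtr my_list = GroupNode::Make(
    "my_list", Repetition::OPTIONAL, {list}, parquet_cpp::LogicalType::LIST);
)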
+ +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/util/status.h" + +#include "arrow/parquet/schema.h" + +namespace arrow { + +namespace parquet { + +using parquet_cpp::Repetition; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); + +class TestConvertParquetSchema : public ::testing::Test { + public: + virtual void SetUp() {} + + void CheckFlatSchema(const std::shared_ptr& expected_schema) { + ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); + for (int i = 0; i < expected_schema->num_fields(); ++i) { + auto lhs = result_schema_->field(i); + auto rhs = expected_schema->field(i); + EXPECT_TRUE(lhs->Equals(rhs)) + << i << " " << lhs->ToString() << " != " << rhs->ToString(); + } + } + + Status ConvertSchema(const std::vector& nodes) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + descr_.Init(schema); + return FromParquetSchema(&descr_, &result_schema_); + } + + protected: + parquet_cpp::SchemaDescriptor descr_; + std::shared_ptr result_schema_; +}; + +TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { + std::vector parquet_fields; + std::vector> arrow_fields; + + parquet_fields.push_back( + PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); + arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32)); + arrow_fields.push_back(std::make_shared("int32", INT32, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64)); + arrow_fields.push_back(std::make_shared("int64", INT64, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("float", Repetition::OPTIONAL, parquet_cpp::Type::FLOAT)); + arrow_fields.push_back(std::make_shared("float", FLOAT)); + + parquet_fields.push_back( + PrimitiveNode::Make("double", Repetition::OPTIONAL, parquet_cpp::Type::DOUBLE)); + arrow_fields.push_back(std::make_shared("double", DOUBLE)); + + parquet_fields.push_back( + PrimitiveNode::Make("binary", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY)); + arrow_fields.push_back(std::make_shared("binary", BINARY)); + + parquet_fields.push_back( + PrimitiveNode::Make("string", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY, + parquet_cpp::LogicalType::UTF8)); + arrow_fields.push_back(std::make_shared("string", UTF8)); + + parquet_fields.push_back( + PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, + parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY, + parquet_cpp::LogicalType::NONE, 12)); + arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); + + auto arrow_schema = std::make_shared(arrow_fields); + ASSERT_OK(ConvertSchema(parquet_fields)); + + CheckFlatSchema(arrow_schema); +} + +TEST_F(TestConvertParquetSchema, UnsupportedThings) { + std::vector unsupported_nodes; + + unsupported_nodes.push_back( + PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96)); + + unsupported_nodes.push_back( + 
GroupNode::Make("repeated-group", Repetition::REPEATED, {})); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int32", Repetition::OPTIONAL, + parquet_cpp::Type::INT32, parquet_cpp::LogicalType::DATE)); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int64", Repetition::OPTIONAL, + parquet_cpp::Type::INT64, parquet_cpp::LogicalType::TIMESTAMP_MILLIS)); + + for (const NodePtr& node : unsupported_nodes) { + ASSERT_RAISES(NotImplemented, ConvertSchema({node})); + } +} + +TEST(TestNodeConversion, DateAndTime) { +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc new file mode 100644 index 00000000000..6b1de572617 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/parquet/schema.h" + +#include + +#include "parquet/api/schema.h" + +#include "arrow/util/status.h" +#include "arrow/types/decimal.h" + +using parquet_cpp::schema::Node; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +using parquet_cpp::LogicalType; + +namespace arrow { + +namespace parquet { + +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); + +TypePtr MakeDecimalType(const PrimitiveNode* node) { + int precision = node->decimal_metadata().precision; + int scale = node->decimal_metadata().scale; + return std::make_shared(precision, scale); +} + +static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::UTF8: + *out = UTF8; + break; + default: + // BINARY + *out = BINARY; + break; + } + return Status::OK(); +} + +static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = BINARY; + break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; + default: + return Status::NotImplemented("unhandled type"); + break; + } + + return Status::OK(); +} + +static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT32; + break; + default: + return Status::NotImplemented("Unhandled logical type for int32"); + break; + } + return Status::OK(); +} + +static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT64; + break; + default: + return 
Status::NotImplemented("Unhandled logical type for int64"); + break; + } + return Status::OK(); +} + +// TODO: Logical Type Handling +Status NodeToField(const NodePtr& node, std::shared_ptr* out) { + std::shared_ptr type; + + if (node->is_repeated()) { + return Status::NotImplemented("No support yet for repeated node types"); + } + + if (node->is_group()) { + const GroupNode* group = static_cast(node.get()); + std::vector> fields(group->field_count()); + for (int i = 0; i < group->field_count(); i++) { + RETURN_NOT_OK(NodeToField(group->field(i), &fields[i])); + } + type = std::make_shared(fields); + } else { + // Primitive (leaf) node + const PrimitiveNode* primitive = static_cast(node.get()); + + switch (primitive->physical_type()) { + case parquet_cpp::Type::BOOLEAN: + type = BOOL; + break; + case parquet_cpp::Type::INT32: + RETURN_NOT_OK(FromInt32(primitive, &type)); + break; + case parquet_cpp::Type::INT64: + RETURN_NOT_OK(FromInt64(primitive, &type)); + break; + case parquet_cpp::Type::INT96: + // TODO: Do we have that type in Arrow? + // type = TypePtr(new Int96Type()); + return Status::NotImplemented("int96"); + case parquet_cpp::Type::FLOAT: + type = FLOAT; + break; + case parquet_cpp::Type::DOUBLE: + type = DOUBLE; + break; + case parquet_cpp::Type::BYTE_ARRAY: + // TODO: Do we have that type in Arrow? + RETURN_NOT_OK(FromByteArray(primitive, &type)); + break; + case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: + RETURN_NOT_OK(FromFLBA(primitive, &type)); + break; + } + } + + *out = std::make_shared(node->name(), type, !node->is_required()); + return Status::OK(); +} + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out) { + // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes + // from the root Parquet node + const GroupNode* schema_node = static_cast( + parquet_schema->schema().get()); + + std::vector> fields(schema_node->field_count()); + for (int i = 0; i < schema_node->field_count(); i++) { + RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); + } + + *out = std::make_shared(fields); + return Status::OK(); +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h new file mode 100644 index 00000000000..61de193a338 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
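(A short, hedged sketch of how a caller drives the conversion declared in the header that follows, mirroring TestConvertParquetSchema::ConvertSchema above; `parquet_fields` stands for a std::vector of parquet_cpp NodePtr, and error handling is abbreviated:

// Sketch only, based on the declarations below and the test above.
parquet_cpp::SchemaDescriptor descr;
descr.Init(GroupNode::Make("schema", Repetition::REPEATED, parquet_fields));

std::shared_ptr<arrow::Schema> arrow_schema;
arrow::Status s = arrow::parquet::FromParquetSchema(&descr, &arrow_schema);
if (!s.ok()) {
  // e.g. Status::NotImplemented for INT96 or repeated groups (see tests)
}
)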
+ +#ifndef ARROW_PARQUET_SCHEMA_H +#define ARROW_PARQUET_SCHEMA_H + +#include + +#include "parquet/api/schema.h" + +#include "arrow/schema.h" +#include "arrow/type.h" + +namespace arrow { + +class Status; + +namespace parquet { + +Status NodeToField(const parquet_cpp::schema::NodePtr& node, + std::shared_ptr* out); + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out); + +} // namespace parquet + +} // namespace arrow + +#endif diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc new file mode 100644 index 00000000000..f120c1a9dfd --- /dev/null +++ b/cpp/src/arrow/types/decimal.cc @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/decimal.h" + +#include +#include + +namespace arrow { + +std::string DecimalType::ToString() const { + std::stringstream s; + s << "decimal(" << precision << ", " << scale << ")"; + return s.str(); +} + +} // namespace arrow + diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 464c3ff8da9..26243b42b0e 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -18,13 +18,24 @@ #ifndef ARROW_TYPES_DECIMAL_H #define ARROW_TYPES_DECIMAL_H +#include + #include "arrow/type.h" namespace arrow { struct DecimalType : public DataType { + explicit DecimalType(int precision_, int scale_) + : DataType(Type::DECIMAL), precision(precision_), + scale(scale_) { } int precision; int scale; + + static char const *name() { + return "decimal"; + } + + std::string ToString() const override; }; } // namespace arrow diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index b5931232dbd..4e273edcb8f 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -109,6 +109,7 @@ class Status { bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. diff --git a/format/Layout.md b/format/Layout.md index 2d46ece606e..1b532c6b381 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -58,7 +58,7 @@ Base requirements * Memory layout and random access patterns for each relative type * Null value representation -## Non-goals (for this document +## Non-goals (for this document) * To enumerate or specify logical types that can be implemented as primitive (fixed-width) value types. 
For example: signed and unsigned integers, From d6d53b25ef4e8cd7d8c34df56661817366906bbf Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 27 Mar 2016 12:28:18 -0700 Subject: [PATCH 045/210] ARROW-63: [C++] Enable ctest to work on systems with Python 3 as the default Python Author: Wes McKinney Closes #42 from wesm/ARROW-63 and squashes the following commits: 9840308 [Wes McKinney] Make asan_symbolize.py work on both Python 2.7 and 3.x --- cpp/build-support/asan_symbolize.py | 36 ++++++++++++++++++----------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/cpp/build-support/asan_symbolize.py b/cpp/build-support/asan_symbolize.py index 839a1984bd3..1108044d7d6 100755 --- a/cpp/build-support/asan_symbolize.py +++ b/cpp/build-support/asan_symbolize.py @@ -64,7 +64,7 @@ def open_llvm_symbolizer(self): '--functions=true', '--inlining=true'] if DEBUG: - print ' '.join(cmd) + print(' '.join(cmd)) return subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) @@ -76,8 +76,9 @@ def symbolize(self, addr, binary, offset): try: symbolizer_input = '%s %s' % (binary, offset) if DEBUG: - print symbolizer_input - print >> self.pipe.stdin, symbolizer_input + print(symbolizer_input) + self.pipe.stdin.write(symbolizer_input) + self.pipe.stdin.write('\n') while True: function_name = self.pipe.stdout.readline().rstrip() if not function_name: @@ -113,7 +114,7 @@ def __init__(self, binary): def open_addr2line(self): cmd = ['addr2line', '-f', '-e', self.binary] if DEBUG: - print ' '.join(cmd) + print(' '.join(cmd)) return subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE) @@ -122,7 +123,8 @@ def symbolize(self, addr, binary, offset): if self.binary != binary: return None try: - print >> self.pipe.stdin, offset + self.pipe.stdin.write(offset) + self.pipe.stdin.write('\n') function_name = self.pipe.stdout.readline().rstrip() file_name = self.pipe.stdout.readline().rstrip() except Exception: @@ -145,11 +147,12 @@ def __init__(self, addr, binary): self.pipe = None def write_addr_to_pipe(self, offset): - print >> self.pipe.stdin, '0x%x' % int(offset, 16) + self.pipe.stdin.write('0x%x' % int(offset, 16)) + self.pipe.stdin.write('\n') def open_atos(self): if DEBUG: - print 'atos -o %s -arch %s' % (self.binary, self.arch) + print('atos -o %s -arch %s' % (self.binary, self.arch)) cmdline = ['atos', '-o', self.binary, '-arch', self.arch] self.pipe = subprocess.Popen(cmdline, stdin=subprocess.PIPE, @@ -168,7 +171,7 @@ def symbolize(self, addr, binary, offset): # foo(type1, type2) (in object.name) (filename.cc:80) match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line) if DEBUG: - print 'atos_line: ', atos_line + print('atos_line: {0}'.format(atos_line)) if match: function_name = match.group(1) function_name = re.sub('\(.*?\)', '', function_name) @@ -282,7 +285,7 @@ def symbolize(self, addr, binary, offset): function_name, file_name, line_no = res result = ['%s in %s %s:%d' % ( addr, function_name, file_name, line_no)] - print result + print(result) return result else: return None @@ -318,15 +321,20 @@ def symbolize_address(self, addr, binary, offset): def print_symbolized_lines(self, symbolized_lines): if not symbolized_lines: - print self.current_line + print(self.current_line) else: for symbolized_frame in symbolized_lines: - print ' #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip() + print(' #' + str(self.frame_no) + ' ' + symbolized_frame.rstrip()) self.frame_no += 1 def process_stdin(self): self.frame_no = 0 - sys.stdout = os.fdopen(sys.stdout.fileno(), 
'w', 0) + + if sys.version_info[0] == 2: + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + else: + # Unbuffered output is not supported in Python 3 + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w') while True: line = sys.stdin.readline() @@ -337,10 +345,10 @@ def process_stdin(self): '^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)') match = re.match(stack_trace_line_format, line) if not match: - print self.current_line + print(self.current_line) continue if DEBUG: - print line + print(line) _, frameno_str, addr, binary, offset = match.groups() if frameno_str == '0': # Assume that frame #0 is the first frame of new stack trace. From 017187749f3916e589015a4db2409258a0b3c03c Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 27 Mar 2016 12:30:58 -0700 Subject: [PATCH 046/210] ARROW-65: Be less restrictive on PYTHON_LIBRARY search paths Current CMake FindPythonLibs also uses this option instead of NO_DEFAULT_PATH. Author: Uwe L. Korn Closes #43 from xhochy/arrow-65 and squashes the following commits: 10eb9e0 [Uwe L. Korn] ARROW-65: Be less restrictive on PYTHON_LIBRARY search paths --- python/cmake_modules/FindPythonLibsNew.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cmake_modules/FindPythonLibsNew.cmake b/python/cmake_modules/FindPythonLibsNew.cmake index c70e6bc26a7..0f2295aa43b 100644 --- a/python/cmake_modules/FindPythonLibsNew.cmake +++ b/python/cmake_modules/FindPythonLibsNew.cmake @@ -166,7 +166,7 @@ else() find_library(PYTHON_LIBRARY NAMES "python${PYTHON_LIBRARY_SUFFIX}" PATHS ${_PYTHON_LIBS_SEARCH} - NO_DEFAULT_PATH) + NO_SYSTEM_ENVIRONMENT_PATH) message(STATUS "Found Python lib ${PYTHON_LIBRARY}") endif() From 1fd0668a1330e72b1b137d90d00906bc188243e0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 28 Mar 2016 09:36:20 -0700 Subject: [PATCH 047/210] ARROW-30: [Python] Routines for converting between arrow::Array/Table and pandas.DataFrame There is a lot to do here for maximum compatibility, but this gets things started. Author: Wes McKinney Closes #46 from wesm/ARROW-30 and squashes the following commits: 0a9e747 [Wes McKinney] Invoke py.test with python -m pytest 4c9f766 [Wes McKinney] More scaffolding. Table wrapper. 
Initial unit tests passing 8475a0e [Wes McKinney] More pandas conversion scaffolding, enable libpyarrow to use the NumPy C API globally d1f05c5 [Wes McKinney] cpplint f0cc451 [Wes McKinney] Give libpyarrow a reference to numpy.nan 5e09bfe [Wes McKinney] Compiling, but untested draft of pandas <-> arrow converters --- ci/travis_script_python.sh | 8 +- cpp/README.md | 6 +- cpp/src/arrow/array.h | 13 +- cpp/src/arrow/types/string.cc | 10 + cpp/src/arrow/types/string.h | 4 +- cpp/src/arrow/util/buffer.h | 42 ++ python/CMakeLists.txt | 6 +- python/pyarrow/__init__.py | 8 +- python/pyarrow/array.pyx | 135 ++++ python/pyarrow/config.pyx | 13 +- python/pyarrow/includes/common.pxd | 6 + python/pyarrow/includes/libarrow.pxd | 52 +- python/pyarrow/includes/pyarrow.pxd | 9 +- python/pyarrow/tests/test_convert_pandas.py | 172 +++++ python/src/pyarrow/adapters/pandas.cc | 714 ++++++++++++++++++ python/src/pyarrow/adapters/pandas.h | 21 + python/src/pyarrow/common.h | 23 +- python/src/pyarrow/{init.cc => config.cc} | 11 +- python/src/pyarrow/config.h | 39 + .../src/pyarrow/{init.h => do_import_numpy.h} | 12 +- python/src/pyarrow/numpy_interop.h | 58 ++ 21 files changed, 1313 insertions(+), 49 deletions(-) create mode 100644 python/pyarrow/tests/test_convert_pandas.py create mode 100644 python/src/pyarrow/adapters/pandas.cc rename python/src/pyarrow/{init.cc => config.cc} (84%) create mode 100644 python/src/pyarrow/config.h rename python/src/pyarrow/{init.h => do_import_numpy.h} (83%) create mode 100644 python/src/pyarrow/numpy_interop.h diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index af6b0085724..d45b895d8cf 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -48,17 +48,11 @@ python_version_tests() { python setup.py build_ext --inplace - py.test -vv -r sxX pyarrow + python -m pytest -vv -r sxX pyarrow } # run tests for python 2.7 and 3.5 python_version_tests 2.7 python_version_tests 3.5 -# if [ $TRAVIS_OS_NAME == "linux" ]; then -# valgrind --tool=memcheck py.test -vv -r sxX arrow -# else -# py.test -vv -r sxX arrow -# fi - popd diff --git a/cpp/README.md b/cpp/README.md index 542cce43a13..9026cf963f8 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -42,12 +42,12 @@ Detailed unit test logs will be placed in the build directory under `build/test- ### Building/Running benchmarks -Follow the directions for simple build except run cmake +Follow the directions for simple build except run cmake with the `--ARROW_BUILD_BENCHMARKS` parameter set correctly: cmake -DARROW_BUILD_BENCHMARKS=ON .. -and instead of make unittest run either `make; ctest` to run both unit tests +and instead of make unittest run either `make; ctest` to run both unit tests and benchmarks or `make runbenchmark` to run only the benchmark tests. Benchmark logs will be placed in the build directory under `build/benchmark-logs`. @@ -60,4 +60,4 @@ variables * Googletest: `GTEST_HOME` (only required to build the unit tests) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) - +* Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions) diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 133adf32cbd..097634d74f8 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -34,13 +34,10 @@ class Buffer; // // The base class is only required to have a null bitmap buffer if the null // count is greater than 0 -// -// Any buffers used to initialize the array have their references "stolen". 
If -// you wish to use the buffer beyond the lifetime of the array, you need to -// explicitly increment its reference count class Array { public: - Array(const TypePtr& type, int32_t length, int32_t null_count = 0, + Array(const std::shared_ptr& type, int32_t length, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); virtual ~Array() {} @@ -60,11 +57,15 @@ class Array { return null_bitmap_; } + const uint8_t* null_bitmap_data() const { + return null_bitmap_data_; + } + bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; protected: - TypePtr type_; + std::shared_ptr type_; int32_t null_count_; int32_t length_; diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index dea42e102b0..80b075cdfbb 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -20,8 +20,18 @@ #include #include +#include "arrow/type.h" + namespace arrow { +const std::shared_ptr STRING(new StringType()); + +StringArray::StringArray(int32_t length, + const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) : + StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + std::string CharType::ToString() const { std::stringstream s; s << "char(" << size << ")"; diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index fda722ba6de..84cd0326ec8 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -79,9 +79,7 @@ class StringArray : public ListArray { const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) : - StringArray(std::make_shared(), length, offsets, values, - null_count, null_bitmap) {} + const std::shared_ptr& null_bitmap = nullptr); // Compute the pointer t const uint8_t* GetValue(int i, int32_t* out_length) const { diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 0c3e210abd9..c15f9b630cd 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -18,11 +18,13 @@ #ifndef ARROW_UTIL_BUFFER_H #define ARROW_UTIL_BUFFER_H +#include #include #include #include #include "arrow/util/macros.h" +#include "arrow/util/status.h" namespace arrow { @@ -146,6 +148,46 @@ class PoolBuffer : public ResizableBuffer { MemoryPool* pool_; }; +static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; + +class BufferBuilder { + public: + explicit BufferBuilder(MemoryPool* pool) : + pool_(pool), + capacity_(0), + size_(0) {} + + Status Append(const uint8_t* data, int length) { + if (capacity_ < length + size_) { + if (capacity_ == 0) { + buffer_ = std::make_shared(pool_); + } + capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_); + while (capacity_ < length + size_) { + capacity_ *= 2; + } + RETURN_NOT_OK(buffer_->Resize(capacity_)); + data_ = buffer_->mutable_data(); + } + memcpy(data_ + size_, data, length); + size_ += length; + return Status::OK(); + } + + std::shared_ptr Finish() { + auto result = buffer_; + buffer_ = nullptr; + return result; + } + + private: + std::shared_ptr buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; +}; + } // namespace arrow #endif // ARROW_UTIL_BUFFER_H diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 0ecafc7202e..ebe825f65c4 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -220,9 +220,12 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") ## Python and libraries 
find_package(PythonLibsNew REQUIRED) +find_package(NumPy REQUIRED) include(UseCython) include_directories(SYSTEM + ${NUMPY_INCLUDE_DIRS} + ${PYTHON_INCLUDE_DIRS} src) ############################################################ @@ -409,11 +412,12 @@ add_subdirectory(src/pyarrow/util) set(PYARROW_SRCS src/pyarrow/common.cc + src/pyarrow/config.cc src/pyarrow/helpers.cc - src/pyarrow/init.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc + src/pyarrow/adapters/pandas.cc ) set(LINK_LIBS diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9a080709beb..c343f5ba5f1 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -17,7 +17,11 @@ # flake8: noqa -from pyarrow.array import (Array, from_pylist, total_allocated_bytes, +import pyarrow.config + +from pyarrow.array import (Array, + from_pandas_series, from_pylist, + total_allocated_bytes, BooleanArray, NumericArray, Int8Array, UInt8Array, ListArray, StringArray) @@ -37,4 +41,4 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch +from pyarrow.array import RowBatch, Table, from_pandas_dataframe diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index c5d40ddd7a4..88770cdaa96 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -22,6 +22,8 @@ from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow +import pyarrow.config + from pyarrow.compat import frombytes, tobytes from pyarrow.error cimport check_status @@ -44,6 +46,10 @@ cdef class Array: self.type = DataType() self.type.init(self.sp_array.get().type()) + @staticmethod + def from_pandas(obj, mask=None): + return from_pandas_series(obj, mask) + property null_count: def __get__(self): @@ -160,7 +166,15 @@ cdef class StringArray(Array): cdef dict _array_classes = { Type_NA: NullArray, Type_BOOL: BooleanArray, + Type_UINT8: UInt8Array, + Type_UINT16: UInt16Array, + Type_UINT32: UInt32Array, + Type_UINT64: UInt64Array, + Type_INT8: Int8Array, + Type_INT16: Int16Array, + Type_INT32: Int32Array, Type_INT64: Int64Array, + Type_FLOAT: FloatArray, Type_DOUBLE: DoubleArray, Type_LIST: ListArray, Type_STRING: StringArray, @@ -194,6 +208,49 @@ def from_pylist(object list_obj, DataType type=None): return box_arrow_array(sp_array) + +def from_pandas_series(object series, object mask=None): + cdef: + shared_ptr[CArray] out + + series_values = series_as_ndarray(series) + + if mask is None: + check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(), + series_values, &out)) + else: + mask = series_as_ndarray(mask) + check_status(pyarrow.PandasMaskedToArrow( + pyarrow.GetMemoryPool(), series_values, mask, &out)) + + return box_arrow_array(out) + + +def from_pandas_dataframe(object df, name=None): + cdef: + list names = [] + list arrays = [] + + for name in df.columns: + col = df[name] + arr = from_pandas_series(col) + + names.append(name) + arrays.append(arr) + + return Table.from_arrays(names, arrays, name=name) + + +cdef object series_as_ndarray(object obj): + import pandas as pd + + if isinstance(obj, pd.Series): + result = obj.values + else: + result = obj + + return result + #---------------------------------------------------------------------- # Table-like data structures @@ -225,3 +282,81 @@ cdef class RowBatch: def __getitem__(self, i): return self.arrays[i] + + +cdef class Table: + ''' + Do not call this class's constructor directly. 
+ ''' + cdef: + shared_ptr[CTable] sp_table + CTable* table + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CTable]& table): + self.sp_table = table + self.table = table.get() + + @staticmethod + def from_pandas(df, name=None): + pass + + @staticmethod + def from_arrays(names, arrays, name=None): + cdef: + Array arr + Table result + c_string c_name + vector[shared_ptr[CField]] fields + vector[shared_ptr[CColumn]] columns + shared_ptr[CSchema] schema + shared_ptr[CTable] table + + cdef int K = len(arrays) + + fields.resize(K) + columns.resize(K) + for i in range(K): + arr = arrays[i] + c_name = tobytes(names[i]) + + fields[i].reset(new CField(c_name, arr.type.sp_type, True)) + columns[i].reset(new CColumn(fields[i], arr.sp_array)) + + if name is None: + c_name = '' + else: + c_name = tobytes(name) + + schema.reset(new CSchema(fields)) + table.reset(new CTable(c_name, schema, columns)) + + result = Table() + result.init(table) + + return result + + def to_pandas(self): + """ + Convert the arrow::Table to a pandas DataFrame + """ + cdef: + PyObject* arr + shared_ptr[CColumn] col + + import pandas as pd + + names = [] + data = [] + for i in range(self.table.num_columns()): + col = self.table.column(i) + check_status(pyarrow.ArrowToPandas(col, &arr)) + names.append(frombytes(col.get().name())) + data.append( arr) + + # One ref count too many + Py_XDECREF(arr) + + return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx index 521bc066cd4..1047a472fe3 100644 --- a/python/pyarrow/config.pyx +++ b/python/pyarrow/config.pyx @@ -2,7 +2,18 @@ # distutils: language = c++ # cython: embedsignature = True -cdef extern from 'pyarrow/init.h' namespace 'pyarrow': +cdef extern from 'pyarrow/do_import_numpy.h': + pass + +cdef extern from 'pyarrow/numpy_interop.h' namespace 'pyarrow': + int import_numpy() + +cdef extern from 'pyarrow/config.h' namespace 'pyarrow': void pyarrow_init() + void pyarrow_set_numpy_nan(object o) +import_numpy() pyarrow_init() + +import numpy as np +pyarrow_set_numpy_nan(np.nan) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 839427a6990..e86d5d77e8b 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -22,10 +22,16 @@ from libcpp cimport bool as c_bool from libcpp.string cimport string as c_string from libcpp.vector cimport vector +from cpython cimport PyObject +cimport cpython + # This must be included for cerr and other things to work cdef extern from "": pass +cdef extern from "": + void Py_XDECREF(PyObject* o) + cdef extern from "" namespace "std" nogil: cdef cppclass shared_ptr[T]: diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 943a08f84a0..42f1f25073d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -19,6 +19,25 @@ from pyarrow.includes.common cimport * +cdef extern from "arrow/api.h" namespace "arrow" nogil: + # We can later add more of the common status factory methods as needed + cdef CStatus CStatus_OK "Status::OK"() + + cdef cppclass CStatus "arrow::Status": + CStatus() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsNotImplemented() + c_bool IsInvalid() + + cdef cppclass Buffer: + uint8_t* data() + int64_t size() + cdef extern from "arrow/api.h" namespace "arrow" nogil: enum Type" arrow::Type::type": @@ -129,25 +148,30 @@ cdef extern from "arrow/api.h" namespace 
"arrow" nogil: cdef cppclass CStringArray" arrow::StringArray"(CListArray): c_string GetString(int i) + cdef cppclass CChunkedArray" arrow::ChunkedArray": + pass -cdef extern from "arrow/api.h" namespace "arrow" nogil: - # We can later add more of the common status factory methods as needed - cdef CStatus CStatus_OK "Status::OK"() + cdef cppclass CColumn" arrow::Column": + CColumn(const shared_ptr[CField]& field, + const shared_ptr[CArray]& data) - cdef cppclass CStatus "arrow::Status": - CStatus() + int64_t length() + int64_t null_count() + const c_string& name() + const shared_ptr[CDataType]& type() + const shared_ptr[CChunkedArray]& data() - c_string ToString() + cdef cppclass CTable" arrow::Table": + CTable(const c_string& name, const shared_ptr[CSchema]& schema, + const vector[shared_ptr[CColumn]]& columns) - c_bool ok() - c_bool IsOutOfMemory() - c_bool IsKeyError() - c_bool IsNotImplemented() - c_bool IsInvalid() + int num_columns() + int num_rows() - cdef cppclass Buffer: - uint8_t* data() - int64_t size() + const c_string& name() + + const shared_ptr[CSchema]& schema() + const shared_ptr[CColumn]& column(int i) cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index eedfc854468..1066b8034be 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,7 +18,8 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport CArray, CDataType, Type, MemoryPool +from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, + Type, MemoryPool) cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed @@ -41,4 +42,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CDataType] GetPrimitiveType(Type type) Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + Status PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CArray]* out) + Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, + shared_ptr[CArray]* out) + + Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out) + MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py new file mode 100644 index 00000000000..6dc9c689e24 --- /dev/null +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -0,0 +1,172 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import unittest + +import numpy as np + +import pandas as pd +import pandas.util.testing as tm + +import pyarrow as A + + +class TestPandasConversion(unittest.TestCase): + + def setUp(self): + pass + + def tearDown(self): + pass + + def _check_pandas_roundtrip(self, df, expected=None): + table = A.from_pandas_dataframe(df) + result = table.to_pandas() + if expected is None: + expected = df + tm.assert_frame_equal(result, expected) + + def test_float_no_nulls(self): + data = {} + numpy_dtypes = ['f4', 'f8'] + num_values = 100 + + for dtype in numpy_dtypes: + values = np.random.randn(num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df) + + def test_float_nulls(self): + num_values = 100 + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + dtypes = ['f4', 'f8'] + expected_cols = [] + + arrays = [] + for name in dtypes: + values = np.random.randn(num_values).astype(name) + + arr = A.from_pandas_series(values, null_mask) + arrays.append(arr) + + values[null_mask] = np.nan + + expected_cols.append(values) + + ex_frame = pd.DataFrame(dict(zip(dtypes, expected_cols)), + columns=dtypes) + + table = A.Table.from_arrays(dtypes, arrays) + result = table.to_pandas() + tm.assert_frame_equal(result, ex_frame) + + def test_integer_no_nulls(self): + data = {} + + numpy_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + for dtype in numpy_dtypes: + info = np.iinfo(dtype) + values = np.random.randint(info.min, + min(info.max, np.iinfo('i8').max), + size=num_values) + data[dtype] = values.astype(dtype) + + df = pd.DataFrame(data) + self._check_pandas_roundtrip(df) + + def test_integer_with_nulls(self): + # pandas requires upcast to float dtype + + int_dtypes = ['i1', 'i2', 'i4', 'i8', 'u1', 'u2', 'u4', 'u8'] + num_values = 100 + + null_mask = np.random.randint(0, 10, size=num_values) < 3 + + expected_cols = [] + arrays = [] + for name in int_dtypes: + values = np.random.randint(0, 100, size=num_values) + + arr = A.from_pandas_series(values, null_mask) + arrays.append(arr) + + expected = values.astype('f8') + expected[null_mask] = np.nan + + expected_cols.append(expected) + + ex_frame = pd.DataFrame(dict(zip(int_dtypes, expected_cols)), + columns=int_dtypes) + + table = A.Table.from_arrays(int_dtypes, arrays) + result = table.to_pandas() + + tm.assert_frame_equal(result, ex_frame) + + def test_boolean_no_nulls(self): + num_values = 100 + + np.random.seed(0) + + df = pd.DataFrame({'bools': np.random.randn(num_values) > 0}) + self._check_pandas_roundtrip(df) + + def test_boolean_nulls(self): + # pandas requires upcast to object dtype + num_values = 100 + np.random.seed(0) + + mask = np.random.randint(0, 10, size=num_values) < 3 + values = np.random.randint(0, 10, size=num_values) < 5 + + arr = A.from_pandas_series(values, mask) + + expected = values.astype(object) + expected[mask] = None + + ex_frame = pd.DataFrame({'bools': expected}) + + table = A.Table.from_arrays(['bools'], [arr]) + result = table.to_pandas() + + tm.assert_frame_equal(result, ex_frame) + + def test_boolean_object_nulls(self): + arr = np.array([False, None, True] * 100, dtype=object) + df = pd.DataFrame({'bools': arr}) + self._check_pandas_roundtrip(df) + + def test_strings(self): + repeats = 1000 + values = [b'foo', None, u'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values * repeats}) + + values = ['foo', None, u'bar', 'qux', None] + expected = pd.DataFrame({'strings': values * repeats}) + self._check_pandas_roundtrip(df, expected) 
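Taken together, these tests exercise the whole conversion surface the patch adds. A condensed usage sketch, written only against names defined earlier in this patch (`from_pandas_series`, `Table.from_arrays`, `from_pandas_dataframe`, `to_pandas`); note the mask convention — `True` marks a null slot, matching `MaskToBitmap` in `pandas.cc` below:

    import numpy as np
    import pandas as pd
    import pyarrow as A

    values = np.array([1.0, 2.0, 3.0], dtype='f8')
    mask = np.array([False, True, False])  # True == null
    arr = A.from_pandas_series(values, mask)
    assert arr.null_count == 1

    # Nulls in floating-point columns read back as NaN ...
    table = A.Table.from_arrays(['c0'], [arr])
    assert np.isnan(table.to_pandas()['c0'][1])

    # ... while object columns of strings normalize bytes and unicode to
    # UTF-8 strings, and both None and NaN to nulls that read back as
    # None, as test_strings above pins down.
    t = A.from_pandas_dataframe(pd.DataFrame({'s': [u'foo', None, b'bar']}))
    assert t.to_pandas()['s'][1] is None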
+ + # def test_category(self): + # repeats = 1000 + # values = [b'foo', None, u'bar', 'qux', np.nan] + # df = pd.DataFrame({'strings': values * repeats}) + # df['strings'] = df['strings'].astype('category') + # self._check_pandas_roundtrip(df) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc new file mode 100644 index 00000000000..22f1d7575f8 --- /dev/null +++ b/python/src/pyarrow/adapters/pandas.cc @@ -0,0 +1,714 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for pandas conversion via NumPy + +#include + +#include "pyarrow/numpy_interop.h" + +#include +#include +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/util/bit-util.h" + +#include "pyarrow/common.h" +#include "pyarrow/config.h" +#include "pyarrow/status.h" + +namespace pyarrow { + +using arrow::Array; +using arrow::Column; +namespace util = arrow::util; + +// ---------------------------------------------------------------------- +// Serialization + +template +struct npy_traits { +}; + +template <> +struct npy_traits { + typedef uint8_t value_type; + using ArrayType = arrow::BooleanArray; + + static constexpr bool supports_nulls = false; + static inline bool isnull(uint8_t v) { + return false; + } +}; + +#define NPY_INT_DECL(TYPE, CapType, T) \ + template <> \ + struct npy_traits { \ + typedef T value_type; \ + using ArrayType = arrow::CapType##Array; \ + \ + static constexpr bool supports_nulls = false; \ + static inline bool isnull(T v) { \ + return false; \ + } \ + }; + +NPY_INT_DECL(INT8, Int8, int8_t); +NPY_INT_DECL(INT16, Int16, int16_t); +NPY_INT_DECL(INT32, Int32, int32_t); +NPY_INT_DECL(INT64, Int64, int64_t); +NPY_INT_DECL(UINT8, UInt8, uint8_t); +NPY_INT_DECL(UINT16, UInt16, uint16_t); +NPY_INT_DECL(UINT32, UInt32, uint32_t); +NPY_INT_DECL(UINT64, UInt64, uint64_t); + +template <> +struct npy_traits { + typedef float value_type; + using ArrayType = arrow::FloatArray; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(float v) { + return v != v; + } +}; + +template <> +struct npy_traits { + typedef double value_type; + using ArrayType = arrow::DoubleArray; + + static constexpr bool supports_nulls = true; + + static inline bool isnull(double v) { + return v != v; + } +}; + +template <> +struct npy_traits { + typedef PyObject* value_type; + static constexpr bool supports_nulls = true; +}; + +template +class ArrowSerializer { + public: + ArrowSerializer(arrow::MemoryPool* pool, PyArrayObject* arr, PyArrayObject* mask) : + pool_(pool), + arr_(arr), + mask_(mask) { + length_ = PyArray_SIZE(arr_); + } + + Status Convert(std::shared_ptr* out); + + int stride() const { + return PyArray_STRIDES(arr_)[0]; + } + + Status InitNullBitmap() { + int null_bytes = 
util::bytes_for_bits(length_); + + null_bitmap_ = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes)); + + null_bitmap_data_ = null_bitmap_->mutable_data(); + memset(null_bitmap_data_, 0, null_bytes); + + return Status::OK(); + } + + bool is_strided() const { + npy_intp* astrides = PyArray_STRIDES(arr_); + return astrides[0] != PyArray_DESCR(arr_)->elsize; + } + + private: + Status ConvertData(); + + Status ConvertObjectStrings(std::shared_ptr* out) { + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + auto offsets_buffer = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1))); + int32_t* offsets = reinterpret_cast(offsets_buffer->mutable_data()); + + arrow::BufferBuilder data_builder(pool_); + arrow::Status s; + PyObject* obj; + int length; + int offset = 0; + int64_t null_count = 0; + for (int64_t i = 0; i < length_; ++i) { + obj = objects[i]; + if (PyUnicode_Check(obj)) { + obj = PyUnicode_AsUTF8String(obj); + if (obj == NULL) { + PyErr_Clear(); + return Status::TypeError("failed converting unicode to UTF8"); + } + length = PyBytes_GET_SIZE(obj); + s = data_builder.Append( + reinterpret_cast(PyBytes_AS_STRING(obj)), length); + Py_DECREF(obj); + if (!s.ok()) { + return Status::ArrowError(s.ToString()); + } + util::set_bit(null_bitmap_data_, i); + } else if (PyBytes_Check(obj)) { + length = PyBytes_GET_SIZE(obj); + RETURN_ARROW_NOT_OK(data_builder.Append( + reinterpret_cast(PyBytes_AS_STRING(obj)), length)); + util::set_bit(null_bitmap_data_, i); + } else { + // NULL + // No change to offset + length = 0; + ++null_count; + } + offsets[i] = offset; + offset += length; + } + // End offset + offsets[length_] = offset; + + std::shared_ptr data_buffer = data_builder.Finish(); + + auto values = std::make_shared(data_buffer->size(), + data_buffer); + *out = std::shared_ptr( + new arrow::StringArray(length_, offsets_buffer, values, null_count, + null_bitmap_)); + + return Status::OK(); + } + + Status ConvertBooleans(std::shared_ptr* out) { + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + int nbytes = util::bytes_for_bits(length_); + auto data = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(data->Resize(nbytes)); + uint8_t* bitmap = data->mutable_data(); + memset(bitmap, 0, nbytes); + + int64_t null_count = 0; + for (int64_t i = 0; i < length_; ++i) { + if (objects[i] == Py_True) { + util::set_bit(bitmap, i); + util::set_bit(null_bitmap_data_, i); + } else if (objects[i] != Py_False) { + ++null_count; + } else { + util::set_bit(null_bitmap_data_, i); + } + } + + *out = std::make_shared(length_, data, null_count, + null_bitmap_); + + return Status::OK(); + } + + arrow::MemoryPool* pool_; + + PyArrayObject* arr_; + PyArrayObject* mask_; + + int64_t length_; + + std::shared_ptr data_; + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; +}; + +// Returns null count +static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { + int64_t null_count = 0; + const uint8_t* mask_values = static_cast(PyArray_DATA(mask)); + // TODO(wesm): strided null mask + for (int i = 0; i < length; ++i) { + if (mask_values[i]) { + ++null_count; + } else { + util::set_bit(bitmap, i); + } + } + return null_count; +} + +template +static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) { + typedef npy_traits traits; + typedef typename traits::value_type T; + + int64_t null_count = 0; + const T* values = reinterpret_cast(data); + + // TODO(wesm): striding + for 
(int i = 0; i < length; ++i) { + if (traits::isnull(values[i])) { + ++null_count; + } else { + util::set_bit(bitmap, i); + } + } + + return null_count; +} + +template +inline Status ArrowSerializer::Convert(std::shared_ptr* out) { + typedef npy_traits traits; + + if (mask_ != nullptr || traits::supports_nulls) { + RETURN_NOT_OK(InitNullBitmap()); + } + + int64_t null_count = 0; + if (mask_ != nullptr) { + null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); + } else if (traits::supports_nulls) { + null_count = ValuesToBitmap(PyArray_DATA(arr_), length_, null_bitmap_data_); + } + + RETURN_NOT_OK(ConvertData()); + *out = std::make_shared(length_, data_, null_count, + null_bitmap_); + + return Status::OK(); +} + +static inline bool PyObject_is_null(const PyObject* obj) { + return obj == Py_None || obj == numpy_nan; +} + +static inline bool PyObject_is_string(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_Check(obj) || PyBytes_Check(obj); +#else + return PyString_Check(obj) || PyUnicode_Check(obj); +#endif +} + +static inline bool PyObject_is_bool(const PyObject* obj) { +#if PY_MAJOR_VERSION >= 3 + return PyString_Check(obj) || PyBytes_Check(obj); +#else + return PyString_Check(obj) || PyUnicode_Check(obj); +#endif +} + +template <> +inline Status ArrowSerializer::Convert(std::shared_ptr* out) { + // Python object arrays are annoying, since we could have one of: + // + // * Strings + // * Booleans with nulls + // * Mixed type (not supported at the moment by arrow format) + // + // Additionally, nulls may be encoded either as np.nan or None. So we have to + // do some type inference and conversion + + RETURN_NOT_OK(InitNullBitmap()); + + // TODO: mask not supported here + const PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); + + for (int64_t i = 0; i < length_; ++i) { + if (PyObject_is_null(objects[i])) { + continue; + } else if (PyObject_is_string(objects[i])) { + return ConvertObjectStrings(out); + } else if (PyBool_Check(objects[i])) { + return ConvertBooleans(out); + } else { + return Status::TypeError("unhandled python type"); + } + } + + return Status::TypeError("Unable to infer type of object array, were all null"); +} + +template +inline Status ArrowSerializer::ConvertData() { + // TODO(wesm): strided arrays + if (is_strided()) { + return Status::ValueError("no support for strided data yet"); + } + + data_ = std::make_shared(arr_); + return Status::OK(); +} + +template <> +inline Status ArrowSerializer::ConvertData() { + if (is_strided()) { + return Status::ValueError("no support for strided data yet"); + } + + int nbytes = util::bytes_for_bits(length_); + auto buffer = std::make_shared(pool_); + RETURN_ARROW_NOT_OK(buffer->Resize(nbytes)); + + const uint8_t* values = reinterpret_cast(PyArray_DATA(arr_)); + + uint8_t* bitmap = buffer->mutable_data(); + + memset(bitmap, 0, nbytes); + for (int i = 0; i < length_; ++i) { + if (values[i] > 0) { + util::set_bit(bitmap, i); + } + } + + data_ = buffer; + + return Status::OK(); +} + +template <> +inline Status ArrowSerializer::ConvertData() { + return Status::TypeError("NYI"); +} + + +#define TO_ARROW_CASE(TYPE) \ + case NPY_##TYPE: \ + { \ + ArrowSerializer converter(pool, arr, mask); \ + RETURN_NOT_OK(converter.Convert(out)); \ + } \ + break; + +Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, + std::shared_ptr* out) { + PyArrayObject* arr = reinterpret_cast(ao); + PyArrayObject* mask = nullptr; + + if (mo != nullptr) { + mask = reinterpret_cast(mo); + } + + if 
(PyArray_NDIM(arr) != 1) { + return Status::ValueError("only handle 1-dimensional arrays"); + } + + switch(PyArray_DESCR(arr)->type_num) { + TO_ARROW_CASE(BOOL); + TO_ARROW_CASE(INT8); + TO_ARROW_CASE(INT16); + TO_ARROW_CASE(INT32); + TO_ARROW_CASE(INT64); + TO_ARROW_CASE(UINT8); + TO_ARROW_CASE(UINT16); + TO_ARROW_CASE(UINT32); + TO_ARROW_CASE(UINT64); + TO_ARROW_CASE(FLOAT32); + TO_ARROW_CASE(FLOAT64); + TO_ARROW_CASE(OBJECT); + default: + std::stringstream ss; + ss << "unsupported type " << PyArray_DESCR(arr)->type_num + << std::endl; + return Status::NotImplemented(ss.str()); + } + return Status::OK(); +} + +Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, + std::shared_ptr* out) { + return PandasMaskedToArrow(pool, ao, nullptr, out); +} + +// ---------------------------------------------------------------------- +// Deserialization + +template +struct arrow_traits { +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_BOOL; + static constexpr bool supports_nulls = false; + static constexpr bool is_boolean = true; + static constexpr bool is_integer = false; + static constexpr bool is_floating = false; +}; + +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + static constexpr bool is_boolean = false; \ + static constexpr bool is_integer = true; \ + static constexpr bool is_floating = false; \ + typedef typename npy_traits::value_type T; \ + }; + +INT_DECL(INT8); +INT_DECL(INT16); +INT_DECL(INT32); +INT_DECL(INT64); +INT_DECL(UINT8); +INT_DECL(UINT16); +INT_DECL(UINT32); +INT_DECL(UINT64); + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT32; + static constexpr bool supports_nulls = true; + static constexpr float na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_FLOAT64; + static constexpr bool supports_nulls = true; + static constexpr double na_value = NAN; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = true; + typedef typename npy_traits::value_type T; +}; + +template <> +struct arrow_traits { + static constexpr int npy_type = NPY_OBJECT; + static constexpr bool supports_nulls = true; + static constexpr bool is_boolean = false; + static constexpr bool is_integer = false; + static constexpr bool is_floating = false; +}; + + +static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { +#if PY_MAJOR_VERSION >= 3 + return PyUnicode_FromStringAndSize(reinterpret_cast(data), length); +#else + return PyString_FromStringAndSize(reinterpret_cast(data), length); +#endif +} + +template +class ArrowDeserializer { + public: + ArrowDeserializer(const std::shared_ptr& col) : + col_(col) {} + + Status Convert(PyObject** out) { + const std::shared_ptr data = col_->data(); + if (data->num_chunks() > 1) { + return Status::NotImplemented("Chunked column conversion NYI"); + } + + auto chunk = data->chunk(0); + + RETURN_NOT_OK(ConvertValues(chunk)); + *out = reinterpret_cast(out_); + return Status::OK(); + } + + Status AllocateOutput(int type) { + npy_intp dims[1] = {col_->length()}; + out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, type)); + + if (out_ == NULL) { + 
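+      // PyArray_SimpleNew returns NULL on failure and sets the CPython
+      // error indicator, so the NULL result (not a pyarrow::Status) is
+      // the failure signal that surfaces at the Python boundary.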
// Error occurred, trust that SimpleNew set the error state + return Status::OK(); + } + + return Status::OK(); + } + + template + inline typename std::enable_if< + arrow_traits::is_floating, Status>::type + ConvertValues(const std::shared_ptr& arr) { + typedef typename arrow_traits::T T; + + arrow::PrimitiveArray* prim_arr = static_cast( + arr.get()); + + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + + if (arr->null_count() > 0) { + T* out_values = reinterpret_cast(PyArray_DATA(out_)); + const T* in_values = reinterpret_cast(prim_arr->data()->data()); + for (int64_t i = 0; i < arr->length(); ++i) { + out_values[i] = arr->IsNull(i) ? NAN : in_values[i]; + } + } else { + memcpy(PyArray_DATA(out_), prim_arr->data()->data(), + arr->length() * arr->type()->value_size()); + } + + return Status::OK(); + } + + // Integer specialization + template + inline typename std::enable_if< + arrow_traits::is_integer, Status>::type + ConvertValues(const std::shared_ptr& arr) { + typedef typename arrow_traits::T T; + + arrow::PrimitiveArray* prim_arr = static_cast( + arr.get()); + + const T* in_values = reinterpret_cast(prim_arr->data()->data()); + + if (arr->null_count() > 0) { + RETURN_NOT_OK(AllocateOutput(NPY_FLOAT64)); + + // Upcast to double, set NaN as appropriate + double* out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int i = 0; i < arr->length(); ++i) { + out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; + } + } else { + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + + memcpy(PyArray_DATA(out_), in_values, + arr->length() * arr->type()->value_size()); + } + + return Status::OK(); + } + + // Boolean specialization + template + inline typename std::enable_if< + arrow_traits::is_boolean, Status>::type + ConvertValues(const std::shared_ptr& arr) { + arrow::BooleanArray* bool_arr = static_cast(arr.get()); + + if (arr->null_count() > 0) { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int64_t i = 0; i < arr->length(); ++i) { + if (bool_arr->IsNull(i)) { + Py_INCREF(Py_None); + out_values[i] = Py_None; + } else if (bool_arr->Value(i)) { + // True + Py_INCREF(Py_True); + out_values[i] = Py_True; + } else { + // False + Py_INCREF(Py_False); + out_values[i] = Py_False; + } + } + } else { + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + + uint8_t* out_values = reinterpret_cast(PyArray_DATA(out_)); + for (int64_t i = 0; i < arr->length(); ++i) { + out_values[i] = static_cast(bool_arr->Value(i)); + } + } + + return Status::OK(); + } + + // UTF8 + template + inline typename std::enable_if< + T2 == arrow::Type::STRING, Status>::type + ConvertValues(const std::shared_ptr& arr) { + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + + PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); + + arrow::StringArray* string_arr = static_cast(arr.get()); + + const uint8_t* data; + int32_t length; + if (arr->null_count() > 0) { + for (int64_t i = 0; i < arr->length(); ++i) { + if (string_arr->IsNull(i)) { + Py_INCREF(Py_None); + out_values[i] = Py_None; + } else { + data = string_arr->GetValue(i, &length); + + out_values[i] = make_pystring(data, length); + if (out_values[i] == nullptr) { + return Status::OK(); + } + } + } + } else { + for (int64_t i = 0; i < arr->length(); ++i) { + data = string_arr->GetValue(i, &length); + out_values[i] = make_pystring(data, length); + if (out_values[i] == nullptr) { + return Status::OK(); + } + } + } + return Status::OK(); + } + private: + std::shared_ptr col_; + 
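+  // out_ holds the NumPy array being built; Convert() hands the new
+  // reference from PyArray_SimpleNew straight to the caller.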
PyArrayObject* out_; +}; + +#define FROM_ARROW_CASE(TYPE) \ + case arrow::Type::TYPE: \ + { \ + ArrowDeserializer converter(col); \ + return converter.Convert(out); \ + } \ + break; + +Status ArrowToPandas(const std::shared_ptr& col, PyObject** out) { + switch(col->type()->type) { + FROM_ARROW_CASE(BOOL); + FROM_ARROW_CASE(INT8); + FROM_ARROW_CASE(INT16); + FROM_ARROW_CASE(INT32); + FROM_ARROW_CASE(INT64); + FROM_ARROW_CASE(UINT8); + FROM_ARROW_CASE(UINT16); + FROM_ARROW_CASE(UINT32); + FROM_ARROW_CASE(UINT64); + FROM_ARROW_CASE(FLOAT); + FROM_ARROW_CASE(DOUBLE); + FROM_ARROW_CASE(STRING); + default: + return Status::NotImplemented("Arrow type reading not implemented"); + } + return Status::OK(); +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index a4f41638087..58eb3ca61cd 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -21,8 +21,29 @@ #ifndef PYARROW_ADAPTERS_PANDAS_H #define PYARROW_ADAPTERS_PANDAS_H +#include + +#include + +namespace arrow { + +class Array; +class Column; + +} // namespace arrow + namespace pyarrow { +class Status; + +Status ArrowToPandas(const std::shared_ptr& col, PyObject** out); + +Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, + std::shared_ptr* out); + +Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, + std::shared_ptr* out); + } // namespace pyarrow #endif // PYARROW_ADAPTERS_PANDAS_H diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index db6361384c1..cc9ad9ec5bb 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -18,7 +18,9 @@ #ifndef PYARROW_COMMON_H #define PYARROW_COMMON_H -#include +#include "pyarrow/config.h" + +#include "arrow/util/buffer.h" namespace arrow { class MemoryPool; } @@ -90,6 +92,25 @@ struct PyObjectStringify { arrow::MemoryPool* GetMemoryPool(); +class NumPyBuffer : public arrow::Buffer { + public: + NumPyBuffer(PyArrayObject* arr) : + Buffer(nullptr, 0) { + arr_ = arr; + Py_INCREF(arr); + + data_ = reinterpret_cast(PyArray_DATA(arr_)); + size_ = PyArray_SIZE(arr_); + } + + virtual ~NumPyBuffer() { + Py_XDECREF(arr_); + } + + private: + PyArrayObject* arr_; +}; + } // namespace pyarrow #endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/init.cc b/python/src/pyarrow/config.cc similarity index 84% rename from python/src/pyarrow/init.cc rename to python/src/pyarrow/config.cc index acd851e1687..730d2db99a5 100644 --- a/python/src/pyarrow/init.cc +++ b/python/src/pyarrow/config.cc @@ -15,11 +15,20 @@ // specific language governing permissions and limitations // under the License. -#include "pyarrow/init.h" +#include + +#include "pyarrow/config.h" namespace pyarrow { void pyarrow_init() { } +PyObject* numpy_nan = nullptr; + +void pyarrow_set_numpy_nan(PyObject* obj) { + Py_INCREF(obj); + numpy_nan = obj; +} + } // namespace pyarrow diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h new file mode 100644 index 00000000000..48ae715d842 --- /dev/null +++ b/python/src/pyarrow/config.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_CONFIG_H +#define PYARROW_CONFIG_H + +#include + +#include "pyarrow/numpy_interop.h" + +#if PY_MAJOR_VERSION >= 3 + #define PyString_Check PyUnicode_Check +#endif + +namespace pyarrow { + +extern PyObject* numpy_nan; + +void pyarrow_init(); + +void pyarrow_set_numpy_nan(PyObject* obj); + +} // namespace pyarrow + +#endif // PYARROW_CONFIG_H diff --git a/python/src/pyarrow/init.h b/python/src/pyarrow/do_import_numpy.h similarity index 83% rename from python/src/pyarrow/init.h rename to python/src/pyarrow/do_import_numpy.h index 71e67a20c1c..bb4a3829591 100644 --- a/python/src/pyarrow/init.h +++ b/python/src/pyarrow/do_import_numpy.h @@ -15,13 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef PYARROW_INIT_H -#define PYARROW_INIT_H +// Trick borrowed from dynd-python for initializing the NumPy array API -namespace pyarrow { - -void pyarrow_init(); - -} // namespace pyarrow - -#endif // PYARROW_INIT_H +// Trigger the array import (inversion of NO_IMPORT_ARRAY) +#define NUMPY_IMPORT_ARRAY diff --git a/python/src/pyarrow/numpy_interop.h b/python/src/pyarrow/numpy_interop.h new file mode 100644 index 00000000000..882d287c7c5 --- /dev/null +++ b/python/src/pyarrow/numpy_interop.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_NUMPY_INTEROP_H +#define PYARROW_NUMPY_INTEROP_H + +#include + +#include + +// Don't use the deprecated Numpy functions +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +// This is required to be able to access the NumPy C API properly in C++ files +// other than this main one +#define PY_ARRAY_UNIQUE_SYMBOL pyarrow_ARRAY_API +#ifndef NUMPY_IMPORT_ARRAY +#define NO_IMPORT_ARRAY +#endif + +#include +#include + +namespace pyarrow { + +inline int import_numpy() { +#ifdef NUMPY_IMPORT_ARRAY + import_array1(-1); + import_umath1(-1); +#endif + + return 0; +} + +} // namespace pyarrow + +#endif // PYARROW_NUMPY_INTEROP_H From ecadd0bcb9f022a5067826ed564f513ffd0c578e Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 28 Mar 2016 09:38:13 -0700 Subject: [PATCH 048/210] ARROW-80: Handle len call for pre-init arrays Author: Uwe L. Korn Closes #45 from xhochy/arrow-80 and squashes the following commits: d9a1160 [Uwe L. Korn] Add unit test for repr on pre-init Array 6208d7d [Uwe L. Korn] ARROW-80: Handle len call for pre-init arrays --- python/pyarrow/array.pyx | 5 ++++- python/pyarrow/tests/test_array.py | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 88770cdaa96..155c965f3e8 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -67,7 +67,10 @@ cdef class Array: return '{0}\n{1}'.format(type_format, values) def __len__(self): - return self.sp_array.get().length() + if self.sp_array.get(): + return self.sp_array.get().length() + else: + return 0 def isnull(self): raise NotImplemented diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 36aaaa4f93d..d608f8167df 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -22,6 +22,10 @@ class TestArrayAPI(unittest.TestCase): + def test_repr_on_pre_init_array(self): + arr = pyarrow.array.Array() + assert len(repr(arr)) > 0 + def test_getitem_NA(self): arr = pyarrow.from_pylist([1, None, 2]) assert arr[1] is pyarrow.NA From 80ec2c17fccac484993868f951d95362cb75cea9 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 28 Mar 2016 09:39:55 -0700 Subject: [PATCH 049/210] ARROW-79: [Python] Add benchmarks Run them using `asv run --python=same` or `asv dev`. Author: Uwe L. Korn Closes #44 from xhochy/arrow-79 and squashes the following commits: d3c6401 [Uwe L. Korn] Move benchmarks to toplevel folder 2737f18 [Uwe L. Korn] ARROW-79: [Python] Add benchmarks --- python/.gitignore | 3 ++ python/asv.conf.json | 73 +++++++++++++++++++++++++++++++++++ python/benchmarks/__init__.py | 17 ++++++++ python/benchmarks/array.py | 38 ++++++++++++++++++ python/doc/Benchmarks.md | 11 ++++++ 5 files changed, 142 insertions(+) create mode 100644 python/asv.conf.json create mode 100644 python/benchmarks/__init__.py create mode 100644 python/benchmarks/array.py create mode 100644 python/doc/Benchmarks.md diff --git a/python/.gitignore b/python/.gitignore index 80103a1a529..3cb591ea766 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -35,3 +35,6 @@ dist # coverage .coverage coverage.xml + +# benchmark working dir +.asv diff --git a/python/asv.conf.json b/python/asv.conf.json new file mode 100644 index 00000000000..96beba64c2e --- /dev/null +++ b/python/asv.conf.json @@ -0,0 +1,73 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "pyarrow", + + // The project's homepage + "project_url": "https://arrow.apache.org/", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": "https://github.com/apache/arrow/", + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "tip" (for mercurial). + // "branches": ["master"], // for git + // "branches": ["tip"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + "dvcs": "git", + + // The tool to use to create environments. 
May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "virtualenv", + + // the base URL to show a commit for the project. + "show_commit_url": "https://github.com/apache/arrow/commit/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["2.7", "3.3"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list indicates to just test against the default (latest) + // version. + // "matrix": { + // "numpy": ["1.6", "1.7"] + // }, + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": "build/benchmarks/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. + // "wheel_cache_size": 0 +} diff --git a/python/benchmarks/__init__.py b/python/benchmarks/__init__.py new file mode 100644 index 00000000000..245692337bc --- /dev/null +++ b/python/benchmarks/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + diff --git a/python/benchmarks/array.py b/python/benchmarks/array.py new file mode 100644 index 00000000000..6ab73d18d1f --- /dev/null +++ b/python/benchmarks/array.py @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pyarrow + +class Conversions(object): + params = (1, 10 ** 5, 10 ** 6, 10 ** 7) + + def time_from_pylist(self, n): + pyarrow.from_pylist(list(range(n))) + + def peakmem_from_pylist(self, n): + pyarrow.from_pylist(list(range(n))) + +class ScalarAccess(object): + params = (1, 10 ** 5, 10 ** 6, 10 ** 7) + + def setUp(self, n): + self._array = pyarrow.from_pylist(list(range(n))) + + def time_as_py(self, n): + for i in range(n): + self._array[i].as_py() + diff --git a/python/doc/Benchmarks.md b/python/doc/Benchmarks.md new file mode 100644 index 00000000000..8edfb6209e4 --- /dev/null +++ b/python/doc/Benchmarks.md @@ -0,0 +1,11 @@ +## Benchmark Requirements + +The benchmarks are run using [asv][1] which is also their only requirement. + +## Running the benchmarks + +To run the benchmarks, call `asv run --python=same`. You cannot use the +plain `asv run` command at the moment as asv cannot handle python packages +in subdirectories of a repository. + +[1]: https://asv.readthedocs.org/ From df7726d44ab59828aacc20a1786287ba7ade2562 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 28 Mar 2016 10:39:25 -0700 Subject: [PATCH 050/210] ARROW-88: [C++] Refactor usages of parquet_cpp namespace I also removed an unneeded `Py_XDECREF` from ARROW-30; didn't want to create a separate patch for that. Author: Wes McKinney Closes #49 from wesm/ARROW-88 and squashes the following commits: c4d81dc [Wes McKinney] Refactor usages of parquet_cpp namespace --- cpp/src/arrow/parquet/parquet-schema-test.cc | 40 ++++++++++---------- cpp/src/arrow/parquet/schema.cc | 29 +++++++------- cpp/src/arrow/parquet/schema.h | 4 +- python/pyarrow/array.pyx | 3 -- python/pyarrow/includes/parquet.pxd | 2 +- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index 9c3093d9ff7..02a8caf03c9 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -26,15 +26,17 @@ #include "arrow/parquet/schema.h" +using ParquetType = parquet::Type; +using parquet::LogicalType; +using parquet::Repetition; +using parquet::schema::NodePtr; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; + namespace arrow { namespace parquet { -using parquet_cpp::Repetition; -using parquet_cpp::schema::NodePtr; -using parquet_cpp::schema::GroupNode; -using parquet_cpp::schema::PrimitiveNode; - const auto BOOL = std::make_shared(); const auto UINT8 = std::make_shared(); const auto INT32 = std::make_shared(); @@ -66,7 +68,7 @@ class TestConvertParquetSchema : public ::testing::Test { } protected: - parquet_cpp::SchemaDescriptor descr_; + ::parquet::SchemaDescriptor descr_; std::shared_ptr result_schema_; }; @@ -75,40 +77,40 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { std::vector> arrow_fields; parquet_fields.push_back( - PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); + PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN)); arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); 
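The ARROW-89 diff is cut off here; per its commit message it splits the benchmark classes and adds a case for a float64 column containing NaNs. A sketch of what such an asv benchmark could look like, following the conventions of the existing `Conversions` and `ScalarAccess` classes in `benchmarks/array.py` (the class and method names below are hypothetical, not taken from the actual patch):

    import numpy as np
    import pandas as pd
    import pyarrow

    class PandasConversionsWithNulls(object):
        """Illustrative asv benchmark: convert a float64 column
        containing NaNs (asv discovers time_*/peakmem_* methods)."""

        def setup(self):
            n = 10 ** 6
            values = np.random.randn(n)
            values[::3] = np.nan  # sprinkle nulls through the column
            self.df = pd.DataFrame({'c0': values})

        def time_from_pandas(self):
            pyarrow.from_pandas_dataframe(self.df)

        def peakmem_from_pandas(self):
            pyarrow.from_pandas_dataframe(self.df)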
parquet_fields.push_back( - PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32)); + PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32)); arrow_fields.push_back(std::make_shared("int32", INT32, false)); parquet_fields.push_back( - PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64)); + PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); arrow_fields.push_back(std::make_shared("int64", INT64, false)); parquet_fields.push_back( - PrimitiveNode::Make("float", Repetition::OPTIONAL, parquet_cpp::Type::FLOAT)); + PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); arrow_fields.push_back(std::make_shared("float", FLOAT)); parquet_fields.push_back( - PrimitiveNode::Make("double", Repetition::OPTIONAL, parquet_cpp::Type::DOUBLE)); + PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE)); arrow_fields.push_back(std::make_shared("double", DOUBLE)); parquet_fields.push_back( PrimitiveNode::Make("binary", Repetition::OPTIONAL, - parquet_cpp::Type::BYTE_ARRAY)); + ParquetType::BYTE_ARRAY)); arrow_fields.push_back(std::make_shared("binary", BINARY)); parquet_fields.push_back( PrimitiveNode::Make("string", Repetition::OPTIONAL, - parquet_cpp::Type::BYTE_ARRAY, - parquet_cpp::LogicalType::UTF8)); + ParquetType::BYTE_ARRAY, + LogicalType::UTF8)); arrow_fields.push_back(std::make_shared("string", UTF8)); parquet_fields.push_back( PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, - parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY, - parquet_cpp::LogicalType::NONE, 12)); + ParquetType::FIXED_LEN_BYTE_ARRAY, + LogicalType::NONE, 12)); arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); auto arrow_schema = std::make_shared(arrow_fields); @@ -121,18 +123,18 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) { std::vector unsupported_nodes; unsupported_nodes.push_back( - PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96)); + PrimitiveNode::Make("int96", Repetition::REQUIRED, ParquetType::INT96)); unsupported_nodes.push_back( GroupNode::Make("repeated-group", Repetition::REPEATED, {})); unsupported_nodes.push_back( PrimitiveNode::Make("int32", Repetition::OPTIONAL, - parquet_cpp::Type::INT32, parquet_cpp::LogicalType::DATE)); + ParquetType::INT32, LogicalType::DATE)); unsupported_nodes.push_back( PrimitiveNode::Make("int64", Repetition::OPTIONAL, - parquet_cpp::Type::INT64, parquet_cpp::LogicalType::TIMESTAMP_MILLIS)); + ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); for (const NodePtr& node : unsupported_nodes) { ASSERT_RAISES(NotImplemented, ConvertSchema({node})); diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 6b1de572617..d8eb2addb0a 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -24,12 +24,13 @@ #include "arrow/util/status.h" #include "arrow/types/decimal.h" -using parquet_cpp::schema::Node; -using parquet_cpp::schema::NodePtr; -using parquet_cpp::schema::GroupNode; -using parquet_cpp::schema::PrimitiveNode; +using parquet::schema::Node; +using parquet::schema::NodePtr; +using parquet::schema::GroupNode; +using parquet::schema::PrimitiveNode; -using parquet_cpp::LogicalType; +using ParquetType = parquet::Type; +using parquet::LogicalType; namespace arrow { @@ -124,30 +125,30 @@ Status NodeToField(const NodePtr& node, std::shared_ptr* out) { const PrimitiveNode* primitive = static_cast(node.get()); switch (primitive->physical_type()) { - case 
parquet_cpp::Type::BOOLEAN: + case ParquetType::BOOLEAN: type = BOOL; break; - case parquet_cpp::Type::INT32: + case ParquetType::INT32: RETURN_NOT_OK(FromInt32(primitive, &type)); break; - case parquet_cpp::Type::INT64: + case ParquetType::INT64: RETURN_NOT_OK(FromInt64(primitive, &type)); break; - case parquet_cpp::Type::INT96: + case ParquetType::INT96: // TODO: Do we have that type in Arrow? // type = TypePtr(new Int96Type()); return Status::NotImplemented("int96"); - case parquet_cpp::Type::FLOAT: + case ParquetType::FLOAT: type = FLOAT; break; - case parquet_cpp::Type::DOUBLE: + case ParquetType::DOUBLE: type = DOUBLE; break; - case parquet_cpp::Type::BYTE_ARRAY: + case ParquetType::BYTE_ARRAY: // TODO: Do we have that type in Arrow? RETURN_NOT_OK(FromByteArray(primitive, &type)); break; - case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: + case ParquetType::FIXED_LEN_BYTE_ARRAY: RETURN_NOT_OK(FromFLBA(primitive, &type)); break; } @@ -157,7 +158,7 @@ Status NodeToField(const NodePtr& node, std::shared_ptr* out) { return Status::OK(); } -Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, +Status FromParquetSchema(const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index 61de193a338..a8408970ede 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -31,10 +31,10 @@ class Status; namespace parquet { -Status NodeToField(const parquet_cpp::schema::NodePtr& node, +Status NodeToField(const ::parquet::schema::NodePtr& node, std::shared_ptr* out); -Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, +Status FromParquetSchema(const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out); } // namespace parquet diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 155c965f3e8..255efc268fe 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -359,7 +359,4 @@ cdef class Table: names.append(frombytes(col.get().name())) data.append( arr) - # One ref count too many - Py_XDECREF(arr) - return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index 99a2d423d9c..ffdc5d48706 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -19,7 +19,7 @@ from pyarrow.includes.common cimport * -cdef extern from "parquet/api/reader.h" namespace "parquet_cpp" nogil: +cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: pass From 38897ee29f85765f7646e90237fa85f98ccb55f5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 28 Mar 2016 10:42:14 -0700 Subject: [PATCH 051/210] ARROW-83: [C++] Add basic test infrastructure for DecimalType Author: Uwe L. Korn Closes #47 from xhochy/arrow-83 and squashes the following commits: 6eabd7a [Uwe L. Korn] Remove unused forward decl e1854e9 [Uwe L. 
Korn] ARROW-83: [C++] Add basic test infrastructure for DecimalType --- cpp/src/arrow/types/CMakeLists.txt | 1 + cpp/src/arrow/types/decimal-test.cc | 40 +++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 cpp/src/arrow/types/decimal-test.cc diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index f3e41289bfe..72a8e776646 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -34,6 +34,7 @@ install(FILES DESTINATION include/arrow/types) +ADD_ARROW_TEST(decimal-test) ADD_ARROW_TEST(list-test) ADD_ARROW_TEST(primitive-test) ADD_ARROW_TEST(string-test) diff --git a/cpp/src/arrow/types/decimal-test.cc b/cpp/src/arrow/types/decimal-test.cc new file mode 100644 index 00000000000..89896c8b425 --- /dev/null +++ b/cpp/src/arrow/types/decimal-test.cc @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include "arrow/types/decimal.h" + +namespace arrow { + +TEST(TypesTest, TestDecimalType) { + DecimalType t1(8, 4); + + ASSERT_EQ(t1.type, Type::DECIMAL); + ASSERT_EQ(t1.precision, 8); + ASSERT_EQ(t1.scale, 4); + + ASSERT_EQ(t1.ToString(), std::string("decimal(8, 4)")); + + // Test copy constructor + DecimalType t2 = t1; + ASSERT_EQ(t2.type, Type::DECIMAL); + ASSERT_EQ(t2.precision, 8); + ASSERT_EQ(t2.scale, 4); +} + +} // namespace arrow From 2d8627cd81f83783b0ceb01d137a46b581ecba26 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 28 Mar 2016 10:49:08 -0700 Subject: [PATCH 052/210] ARROW-87: [C++] Add all four possible ways to encode Decimals in Parquet to schema conversion See also: https://github.com/Parquet/parquet-format/blob/master/LogicalTypes.md#decimal Author: Uwe L. Korn Closes #48 from xhochy/arrow-87 and squashes the following commits: 05ca3be [Uwe L. Korn] Use parquet:: namespace instead of parquet_cpp 6bafc5f [Uwe L. 
Korn] ARROW-87: [C++] Add all four possible ways to encode Decimals in Parquet to schema conversion See also: https://github.com/Parquet/parquet-format/blob/master/LogicalTypes.md#decimal Author: Uwe L. Korn Closes #48 from xhochy/arrow-87 and squashes the following commits: 05ca3be [Uwe L. Korn] Use parquet:: namespace instead of parquet_cpp 6bafc5f [Uwe L. Korn] ARROW-87: [C++] Add all four possible ways to encode Decimals in Parquet to schema conversion --- cpp/src/arrow/parquet/parquet-schema-test.cc | 36 ++++++++++++++++++++ cpp/src/arrow/parquet/schema.cc | 9 +++++ 2 files changed, 45 insertions(+) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index 02a8caf03c9..a289ddbfde6 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -22,6 +22,7 @@ #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/types/decimal.h" #include "arrow/util/status.h" #include "arrow/parquet/schema.h" @@ -46,6 +47,7 @@ const auto DOUBLE = std::make_shared<DoubleType>(); const auto UTF8 = std::make_shared<StringType>(); const auto BINARY = std::make_shared<ListType>( std::make_shared<Field>("", UINT8)); +const auto DECIMAL_8_4 = std::make_shared<DecimalType>(8, 4); class TestConvertParquetSchema : public ::testing::Test { public: @@ -119,6 +121,40 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { CheckFlatSchema(arrow_schema); } +TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { + std::vector<NodePtr> parquet_fields; + std::vector<std::shared_ptr<Field>> arrow_fields; + + parquet_fields.push_back( + PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL, + ParquetType::FIXED_LEN_BYTE_ARRAY, + LogicalType::DECIMAL, 4, 8, 4)); + arrow_fields.push_back(std::make_shared<Field>("flba-decimal", DECIMAL_8_4)); + + parquet_fields.push_back( + PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, + LogicalType::DECIMAL, -1, 8, 4)); + arrow_fields.push_back(std::make_shared<Field>("binary-decimal", DECIMAL_8_4)); + + parquet_fields.push_back( + PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL, + ParquetType::INT32, + LogicalType::DECIMAL, -1, 8, 4)); + arrow_fields.push_back(std::make_shared<Field>("int32-decimal", DECIMAL_8_4)); + + parquet_fields.push_back( + PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL, + ParquetType::INT64, + LogicalType::DECIMAL, -1, 8, 4)); + arrow_fields.push_back(std::make_shared<Field>("int64-decimal", DECIMAL_8_4)); + + auto arrow_schema = std::make_shared<Schema>(arrow_fields); + ASSERT_OK(ConvertSchema(parquet_fields)); + + CheckFlatSchema(arrow_schema); +} + TEST_F(TestConvertParquetSchema, UnsupportedThings) { std::vector<NodePtr> unsupported_nodes; diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index d8eb2addb0a..14f4f5be53c 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -57,6 +57,9 @@ static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { case LogicalType::UTF8: *out = UTF8; break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; default: // BINARY *out = BINARY; @@ -86,6 +89,9 @@ static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { case LogicalType::NONE: *out = INT32; break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; default: return Status::NotImplemented("Unhandled logical type for int32"); break; @@ -98,6 +104,9 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { case LogicalType::NONE: *out = INT64; break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; default: return Status::NotImplemented("Unhandled logical type for int64"); break; From 5a68f8d737aa94ff3d09dae4e5b29883e798e9c4 Mon Sep 17 00:00:00 2001 From: Dan Robinson Date: Thu, 31 Mar 2016 10:02:54 -0700 Subject: [PATCH 053/210] ARROW-93: Fix builds when using XCode 7.3 Author: Dan Robinson Closes #54 from
danrobinson/ARROW-93 and squashes the following commits: ddff5b0 [Dan Robinson] ARROW-93: Fix builds when using XCode 7.3 --- cpp/cmake_modules/CompilerInfo.cmake | 2 +- python/cmake_modules/CompilerInfo.cmake | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake_modules/CompilerInfo.cmake b/cpp/cmake_modules/CompilerInfo.cmake index 07860682f9b..e1c821cca5d 100644 --- a/cpp/cmake_modules/CompilerInfo.cmake +++ b/cpp/cmake_modules/CompilerInfo.cmake @@ -31,7 +31,7 @@ elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*") # clang on Mac OS X, XCode 7. No version replacement is done # because Apple no longer advertises the upstream LLVM version. -elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-700\\..*") +elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-70[0-9]\\..*") set(COMPILER_FAMILY "clang") # gcc diff --git a/python/cmake_modules/CompilerInfo.cmake b/python/cmake_modules/CompilerInfo.cmake index e66bc2693ee..55f989a1a6c 100644 --- a/python/cmake_modules/CompilerInfo.cmake +++ b/python/cmake_modules/CompilerInfo.cmake @@ -34,7 +34,7 @@ elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*") # clang on Mac OS X, XCode 7. No version replacement is done # because Apple no longer advertises the upstream LLVM version. -elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-700\\..*") +elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-70[0-9]\\..*") set(COMPILER_FAMILY "clang") # gcc From b3ebce1b3471abbdc4516ff86014aa26bcc99a24 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 31 Mar 2016 17:27:56 -0700 Subject: [PATCH 054/210] ARROW-89: [Python] Add benchmarks for Arrow<->Pandas conversion Author: Uwe L. Korn Closes #51 from xhochy/arrow-89 and squashes the following commits: bd6a7cb [Uwe L. Korn] Split benchmarks and add one for a float64 column with NaNs 8f74528 [Uwe L. Korn] ARROW-89: [Python] Add benchmarks for Arrow<->Pandas conversion --- python/benchmarks/array.py | 55 ++++++++++++++++++++++++++++++++++---- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/python/benchmarks/array.py b/python/benchmarks/array.py index 6ab73d18d1f..4268f0073f2 100644 --- a/python/benchmarks/array.py +++ b/python/benchmarks/array.py @@ -15,22 +15,67 @@ # specific language governing permissions and limitations # under the License. 
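# A note on the benchmark classes below: they follow asv (airspeed velocity)
# conventions, where `params`/`param_names` define the parameter grid, asv
# calls `setup(*params)` before timing, and methods named `time_*` and
# `peakmem_*` are what get measured. A minimal sketch of that calling
# protocol (a hypothetical manual invocation, not part of this patch):
#
#     bench = PyListConversions()
#     bench.setup(10 ** 5)              # asv invokes setup once per param tuple
#     bench.time_from_pylist(10 ** 5)   # asv then times repeated calls to this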
-import pyarrow +import numpy as np +import pandas as pd +import pyarrow as A -class Conversions(object): + +class PyListConversions(object): + param_names = ('size',) params = (1, 10 ** 5, 10 ** 6, 10 ** 7) + def setup(self, n): + self.data = list(range(n)) + def time_from_pylist(self, n): - pyarrow.from_pylist(list(range(n))) + A.from_pylist(self.data) def peakmem_from_pylist(self, n): - pyarrow.from_pylist(list(range(n))) + A.from_pylist(self.data) + + +class PandasConversionsBase(object): + def setup(self, n, dtype): + if dtype == 'float64_nans': + arr = np.arange(n).astype('float64') + arr[arr % 10 == 0] = np.nan + else: + arr = np.arange(n).astype(dtype) + self.data = pd.DataFrame({'column': arr}) + + +class PandasConversionsToArrow(PandasConversionsBase): + param_names = ('size', 'dtype') + params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'float64_nans', 'str')) + + def time_from_series(self, n, dtype): + A.from_pandas_dataframe(self.data) + + def peakmem_from_series(self, n, dtype): + A.from_pandas_dataframe(self.data) + + +class PandasConversionsFromArrow(PandasConversionsBase): + param_names = ('size', 'dtype') + params = ((1, 10 ** 5, 10 ** 6, 10 ** 7), ('int64', 'float64', 'float64_nans', 'str')) + + def setup(self, n, dtype): + super(PandasConversionsFromArrow, self).setup(n, dtype) + self.arrow_data = A.from_pandas_dataframe(self.data) + + def time_to_series(self, n, dtype): + self.arrow_data.to_pandas() + + def peakmem_to_series(self, n, dtype): + self.arrow_data.to_pandas() + class ScalarAccess(object): + param_names = ('size',) params = (1, 10 ** 5, 10 ** 6, 10 ** 7) def setUp(self, n): - self._array = pyarrow.from_pylist(list(range(n))) + self._array = A.from_pylist(list(range(n))) def time_as_py(self, n): for i in range(n): From 6d31d5928f4ec5ced14a105b5b05d46a7dab5264 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 31 Mar 2016 17:47:42 -0700 Subject: [PATCH 055/210] ARROW-49: [Python] Add Column and Table wrapper interface After https://github.com/apache/arrow/pull/52 is merged, I'd like to split Column and Table into separate .pyx files; array.pyx seems a bit overcrowded. Author: Uwe L. Korn Closes #53 from xhochy/arrow-49 and squashes the following commits: b01b201 [Uwe L. Korn] Use correct number of chunks e422faf [Uwe L. Korn] Incorporate PR feedback, Add ChunkedArray interface e8f84a9 [Uwe L.
Korn] ARROW-49: [Python] Add Column and Table wrapper interface --- python/CMakeLists.txt | 1 + python/pyarrow/__init__.py | 4 +- python/pyarrow/array.pxd | 2 + python/pyarrow/array.pyx | 75 +------- python/pyarrow/includes/libarrow.pxd | 5 +- python/pyarrow/schema.pxd | 2 + python/pyarrow/schema.pyx | 9 + python/pyarrow/table.pxd | 46 +++++ python/pyarrow/table.pyx | 264 +++++++++++++++++++++++++++ python/pyarrow/tests/test_column.py | 49 +++++ python/pyarrow/tests/test_table.py | 39 ++++ python/setup.py | 2 +- 12 files changed, 422 insertions(+), 76 deletions(-) create mode 100644 python/pyarrow/table.pxd create mode 100644 python/pyarrow/table.pyx create mode 100644 python/pyarrow/tests/test_column.py diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ebe825f65c4..2173232d4ef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -444,6 +444,7 @@ set(CYTHON_EXTENSIONS error scalar schema + table ) foreach(module ${CYTHON_EXTENSIONS}) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index c343f5ba5f1..40a09c2feae 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -41,4 +41,6 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch, Table, from_pandas_dataframe +from pyarrow.array import RowBatch, from_pandas_dataframe + +from pyarrow.table import Column, Table diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index de3c7741962..8cd15cd4502 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -36,6 +36,8 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array) cdef getitem(self, int i) +cdef object box_arrow_array(const shared_ptr[CArray]& sp_array) + cdef class BooleanArray(Array): pass diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 255efc268fe..456bf6d1da8 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -33,6 +33,8 @@ from pyarrow.scalar import NA from pyarrow.schema cimport Schema import pyarrow.schema as schema +from pyarrow.table cimport Table + def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() return pool.bytes_allocated() @@ -287,76 +289,3 @@ cdef class RowBatch: return self.arrays[i] -cdef class Table: - ''' - Do not call this class's constructor directly. 
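# (The Table class removed from array.pyx here is relocated to the new
# pyarrow/table.pyx later in this patch.) As exercised by the tests added
# below, the user-facing API is roughly:
#
#     import pyarrow as A
#     data = [A.from_pylist(range(5)), A.from_pylist([-10, -5, 0, 5, 10])]
#     table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
#     df = table.to_pandas()   # pandas DataFrame with columns 'a' and 'b'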
- ''' - cdef: - shared_ptr[CTable] sp_table - CTable* table - - def __cinit__(self): - pass - - cdef init(self, const shared_ptr[CTable]& table): - self.sp_table = table - self.table = table.get() - - @staticmethod - def from_pandas(df, name=None): - pass - - @staticmethod - def from_arrays(names, arrays, name=None): - cdef: - Array arr - Table result - c_string c_name - vector[shared_ptr[CField]] fields - vector[shared_ptr[CColumn]] columns - shared_ptr[CSchema] schema - shared_ptr[CTable] table - - cdef int K = len(arrays) - - fields.resize(K) - columns.resize(K) - for i in range(K): - arr = arrays[i] - c_name = tobytes(names[i]) - - fields[i].reset(new CField(c_name, arr.type.sp_type, True)) - columns[i].reset(new CColumn(fields[i], arr.sp_array)) - - if name is None: - c_name = '' - else: - c_name = tobytes(name) - - schema.reset(new CSchema(fields)) - table.reset(new CTable(c_name, schema, columns)) - - result = Table() - result.init(table) - - return result - - def to_pandas(self): - """ - Convert the arrow::Table to a pandas DataFrame - """ - cdef: - PyObject* arr - shared_ptr[CColumn] col - - import pandas as pd - - names = [] - data = [] - for i in range(self.table.num_columns()): - col = self.table.column(i) - check_status(pyarrow.ArrowToPandas(col, &arr)) - names.append(frombytes(col.get().name())) - data.append( arr) - - return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 42f1f25073d..b2ef45a347b 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -149,7 +149,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string GetString(int i) cdef cppclass CChunkedArray" arrow::ChunkedArray": - pass + int64_t length() + int64_t null_count() + int num_chunks() + const shared_ptr[CArray]& chunk(int i) cdef cppclass CColumn" arrow::Column": CColumn(const shared_ptr[CField]& field, diff --git a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 61458b765c7..f2cb776eb2e 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -41,5 +41,7 @@ cdef class Schema: CSchema* schema cdef init(self, const vector[shared_ptr[CField]]& fields) + cdef init_schema(self, const shared_ptr[CSchema]& schema) cdef DataType box_data_type(const shared_ptr[CDataType]& type) +cdef Schema box_schema(const shared_ptr[CSchema]& schema) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index b3bf02aad76..22ddf0cf17e 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -106,6 +106,10 @@ cdef class Schema: self.schema = new CSchema(fields) self.sp_schema.reset(self.schema) + cdef init_schema(self, const shared_ptr[CSchema]& schema): + self.schema = schema.get() + self.sp_schema = schema + @classmethod def from_fields(cls, fields): cdef: @@ -223,3 +227,8 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type): cdef DataType out = DataType() out.init(type) return out + +cdef Schema box_schema(const shared_ptr[CSchema]& type): + cdef Schema out = Schema() + out.init_schema(type) + return out diff --git a/python/pyarrow/table.pxd b/python/pyarrow/table.pxd new file mode 100644 index 00000000000..0a5c122c95c --- /dev/null +++ b/python/pyarrow/table.pxd @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.includes.common cimport shared_ptr +from pyarrow.includes.libarrow cimport CChunkedArray, CColumn, CTable + + +cdef class ChunkedArray: + cdef: + shared_ptr[CChunkedArray] sp_chunked_array + CChunkedArray* chunked_array + + cdef init(self, const shared_ptr[CChunkedArray]& chunked_array) + cdef _check_nullptr(self) + + +cdef class Column: + cdef: + shared_ptr[CColumn] sp_column + CColumn* column + + cdef init(self, const shared_ptr[CColumn]& column) + cdef _check_nullptr(self) + + +cdef class Table: + cdef: + shared_ptr[CTable] sp_table + CTable* table + + cdef init(self, const shared_ptr[CTable]& table) + cdef _check_nullptr(self) diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx new file mode 100644 index 00000000000..4c4816f0c7e --- /dev/null +++ b/python/pyarrow/table.pyx @@ -0,0 +1,264 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow + +import pyarrow.config + +from pyarrow.array cimport Array, box_arrow_array +from pyarrow.compat import frombytes, tobytes +from pyarrow.error cimport check_status +from pyarrow.schema cimport box_data_type, box_schema + +cdef class ChunkedArray: + ''' + Do not call this class's constructor directly. + ''' + + def __cinit__(self): + self.chunked_array = NULL + + cdef init(self, const shared_ptr[CChunkedArray]& chunked_array): + self.sp_chunked_array = chunked_array + self.chunked_array = chunked_array.get() + + cdef _check_nullptr(self): + if self.chunked_array == NULL: + raise ReferenceError("ChunkedArray object references a NULL pointer." 
+ " Not initialized.") + + def length(self): + self._check_nullptr() + return self.chunked_array.length() + + def __len__(self): + return self.length() + + property null_count: + + def __get__(self): + self._check_nullptr() + return self.chunked_array.null_count() + + property num_chunks: + + def __get__(self): + self._check_nullptr() + return self.chunked_array.num_chunks() + + def chunk(self, i): + self._check_nullptr() + return box_arrow_array(self.chunked_array.chunk(i)) + + + def iterchunks(self): + for i in range(self.num_chunks): + yield self.chunk(i) + + +cdef class Column: + ''' + Do not call this class's constructor directly. + ''' + + def __cinit__(self): + self.column = NULL + + cdef init(self, const shared_ptr[CColumn]& column): + self.sp_column = column + self.column = column.get() + + def to_pandas(self): + """ + Convert the arrow::Column to a pandas Series + """ + cdef: + PyObject* arr + + import pandas as pd + + check_status(pyarrow.ArrowToPandas(self.sp_column, &arr)) + return pd.Series(<object> arr, name=self.name) + + cdef _check_nullptr(self): + if self.column == NULL: + raise ReferenceError("Column object references a NULL pointer." + " Not initialized.") + + def __len__(self): + self._check_nullptr() + return self.column.length() + + def length(self): + self._check_nullptr() + return self.column.length() + + property shape: + + def __get__(self): + self._check_nullptr() + return (self.length(),) + + property null_count: + + def __get__(self): + self._check_nullptr() + return self.column.null_count() + + property name: + + def __get__(self): + return frombytes(self.column.name()) + + property type: + + def __get__(self): + return box_data_type(self.column.type()) + + property data: + + def __get__(self): + cdef ChunkedArray chunked_array = ChunkedArray() + chunked_array.init(self.column.data()) + return chunked_array + + +cdef class Table: + ''' + Do not call this class's constructor directly. + ''' + + def __cinit__(self): + self.table = NULL + + cdef init(self, const shared_ptr[CTable]& table): + self.sp_table = table + self.table = table.get() + + cdef _check_nullptr(self): + if self.table == NULL: + raise ReferenceError("Table object references a NULL pointer."
+ " Not initialized.") + + @staticmethod + def from_pandas(df, name=None): + pass + + @staticmethod + def from_arrays(names, arrays, name=None): + cdef: + Array arr + Table result + c_string c_name + vector[shared_ptr[CField]] fields + vector[shared_ptr[CColumn]] columns + shared_ptr[CSchema] schema + shared_ptr[CTable] table + + cdef int K = len(arrays) + + fields.resize(K) + columns.resize(K) + for i in range(K): + arr = arrays[i] + c_name = tobytes(names[i]) + + fields[i].reset(new CField(c_name, arr.type.sp_type, True)) + columns[i].reset(new CColumn(fields[i], arr.sp_array)) + + if name is None: + c_name = '' + else: + c_name = tobytes(name) + + schema.reset(new CSchema(fields)) + table.reset(new CTable(c_name, schema, columns)) + + result = Table() + result.init(table) + + return result + + def to_pandas(self): + """ + Convert the arrow::Table to a pandas DataFrame + """ + cdef: + PyObject* arr + shared_ptr[CColumn] col + + import pandas as pd + + names = [] + data = [] + for i in range(self.table.num_columns()): + col = self.table.column(i) + check_status(pyarrow.ArrowToPandas(col, &arr)) + names.append(frombytes(col.get().name())) + data.append(<object> arr) + + return pd.DataFrame(dict(zip(names, data)), columns=names) + + property name: + + def __get__(self): + self._check_nullptr() + return frombytes(self.table.name()) + + property schema: + + def __get__(self): + return box_schema(self.table.schema()) + + def column(self, index): + self._check_nullptr() + cdef Column column = Column() + column.init(self.table.column(index)) + return column + + def __getitem__(self, i): + return self.column(i) + + def itercolumns(self): + for i in range(self.num_columns): + yield self.column(i) + + property num_columns: + + def __get__(self): + self._check_nullptr() + return self.table.num_columns() + + property num_rows: + + def __get__(self): + self._check_nullptr() + return self.table.num_rows() + + def __len__(self): + return self.num_rows + + property shape: + + def __get__(self): + return (self.num_rows, self.num_columns) + diff --git a/python/pyarrow/tests/test_column.py b/python/pyarrow/tests/test_column.py new file mode 100644 index 00000000000..b62f58236e0 --- /dev/null +++ b/python/pyarrow/tests/test_column.py @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
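# The tests below obtain a Column by building a single-column Table. A compact
# sketch of the Column surface they cover (assuming the Column API introduced
# in table.pyx above; `A` is the pyarrow module):
#
#     col = A.Table.from_arrays(('a',), [A.from_pylist([-10, -5, 0, 5, 10])],
#                               'table_name').column(0)
#     assert col.name == 'a' and col.shape == (5,)
#     series = col.to_pandas()   # a pandas Series named 'a'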
+ +from pyarrow.compat import unittest +import pyarrow as arrow + +A = arrow + +import pandas as pd + + +class TestColumn(unittest.TestCase): + + def test_basics(self): + data = [ + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a'), data, 'table_name') + column = table.column(0) + assert column.name == 'a' + assert column.length() == 5 + assert len(column) == 5 + assert column.shape == (5,) + + def test_pandas(self): + data = [ + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a'), data, 'table_name') + column = table.column(0) + series = column.to_pandas() + assert series.name == 'a' + assert series.shape == (5,) + assert series.iloc[0] == -10 + diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 2e24445bd0c..83fcbb8faff 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -20,6 +20,8 @@ A = arrow +import pandas as pd + class TestRowBatch(unittest.TestCase): @@ -38,3 +40,40 @@ def test_basics(self): assert len(batch) == num_rows assert batch.num_rows == num_rows assert batch.num_columns == len(data) + + +class TestTable(unittest.TestCase): + + def test_basics(self): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + assert table.name == 'table_name' + assert len(table) == 5 + assert table.num_rows == 5 + assert table.num_columns == 2 + assert table.shape == (5, 2) + + for col in table.itercolumns(): + for chunk in col.data.iterchunks(): + assert chunk is not None + + def test_pandas(self): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + + # TODO: Use this part once from_pandas is implemented + # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]} + # df = pd.DataFrame(data) + # A.Table.from_pandas(df) + + df = table.to_pandas() + assert set(df.columns) == set(('a', 'b')) + assert df.shape == (5, 2) + assert df.ix[0, 'b'] == -10 + diff --git a/python/setup.py b/python/setup.py index 5cc871aba9f..ebd80de46b4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -214,7 +214,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'scalar', 'schema'] + return ['array', 'config', 'error', 'scalar', 'schema', 'table'] def get_names(self): return self._found_names From 79fddd1138ff69953e943f5980533dc01eabbb97 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 31 Mar 2016 17:48:38 -0700 Subject: [PATCH 056/210] ARROW-90: [C++] Check for SIMD instruction set support This also adds an option to disable the usage of a specific instruction set, e.g. you compile on a machine that supports SSE3 but you want to use the binary also on machines without SSE3. (Distribution packagers will love that option!) Author: Uwe L. Korn Closes #50 from xhochy/arrow-90 and squashes the following commits: 6fd80d3 [Uwe L. 
Korn] ARROW-90: Check for SIMD instruction set support --- cpp/CMakeLists.txt | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6ed2768d139..26d12d24247 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -66,6 +66,14 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow IPC extensions" ON) + option(ARROW_SSE3 + "Build Arrow with SSE3" + ON) + + option(ARROW_ALTIVEC + "Build Arrow with Altivec" + ON) + endif() if(NOT ARROW_BUILD_TESTS) @@ -81,9 +89,25 @@ endif() # Compiler flags ############################################################ +# Check if the target architecture and compiler support some special +# instruction sets that would boost performance. +include(CheckCXXCompilerFlag) +# x86/amd64 compiler flags +CHECK_CXX_COMPILER_FLAG("-msse3" CXX_SUPPORTS_SSE3) +# power compiler flags +CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) + # compiler flags that are common across debug/release builds # - Wall: Enable all warnings. -set(CXX_COMMON_FLAGS "-std=c++11 -msse3 -Wall") +set(CXX_COMMON_FLAGS "-std=c++11 -Wall") + +# Only enable additional instruction sets if they are supported +if (CXX_SUPPORTS_SSE3 AND ARROW_SSE3) + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse3") +endif() +if (CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC) + set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -maltivec") +endif() if (APPLE) # Depending on the default OSX_DEPLOYMENT_TARGET (< 10.9), libstdc++ may be From 5d129991b3369b0e45cb79d1efe6ba2fd8dd21d0 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Fri, 1 Apr 2016 21:40:20 -0700 Subject: [PATCH 057/210] ARROW-71: [C++] Add clang-tidy and clang-format to the toolchain. I changed the ubuntu flavor for building to precise because https://github.com/travis-ci/apt-source-whitelist/issues/199 is currently blocking the use of trusty. I also expect there might be a couple of iterations on settings for clang-format and clang-tidy (or on whether we even want them as standard parts of the toolchain). @wesm I noticed the lint target explicitly turns off some checks; I don't know whether these were copied and pasted or you really don't like them. If the latter, I can do a first pass of turning the same ones off for clang-tidy. In terms of reviewing: it is likely useful to look at the PR commit by commit, since the last two commits are 99% driven by the first. The main chunk of code that wasn't machine-fixed is FatalLog in logging. The good news is that clang-tidy caught one potential corner-case segfault when a column happened to be null :) Author: Micah Kornfield Closes #55 from emkornfield/emk_add_clang_tidy_PR and squashes the following commits: 2fafb10 [Micah Kornfield] adjust line length from 88 to 90, turn on bin packing of parameters. increase penalty for before first call parameter 169352f [Micah Kornfield] add llvm tool chain as travis source e7723d1 [Micah Kornfield] upgrade to precise to verify if build works.
address self comments d3f76d8 [Micah Kornfield] clang format change 9c556ef [Micah Kornfield] cleanup from clang-tidy 26945e9 [Micah Kornfield] add more failure checks for build_thirdparty 4dd0b81 [Micah Kornfield] Add clang-format and clang-tidy targets to toolchain --- .travis.yml | 5 +- ci/travis_script_cpp.sh | 4 + cpp/CMakeLists.txt | 39 +++++- cpp/README.md | 16 +++ cpp/build-support/run-clang-format.sh | 42 ++++++ cpp/build-support/run-clang-tidy.sh | 40 ++++++ cpp/cmake_modules/FindClangTools.cmake | 60 +++++++++ cpp/src/.clang-format | 65 ++++++++++ cpp/src/.clang-tidy | 14 ++ cpp/src/arrow/api.h | 2 +- cpp/src/arrow/array-test.cc | 16 +-- cpp/src/arrow/array.cc | 17 +-- cpp/src/arrow/array.h | 28 ++-- cpp/src/arrow/builder.cc | 2 +- cpp/src/arrow/builder.h | 40 +++--- cpp/src/arrow/column-benchmark.cc | 23 ++-- cpp/src/arrow/column-test.cc | 2 +- cpp/src/arrow/column.cc | 27 ++-- cpp/src/arrow/column.h | 41 ++---- cpp/src/arrow/ipc/adapter.cc | 50 ++++---- cpp/src/arrow/ipc/adapter.h | 18 +-- cpp/src/arrow/ipc/ipc-adapter-test.cc | 20 ++- cpp/src/arrow/ipc/ipc-memory-test.cc | 15 +-- cpp/src/arrow/ipc/ipc-metadata-test.cc | 8 +- cpp/src/arrow/ipc/memory.cc | 46 +++---- cpp/src/arrow/ipc/memory.h | 22 ++-- cpp/src/arrow/ipc/metadata-internal.cc | 70 +++++----- cpp/src/arrow/ipc/metadata-internal.h | 12 +- cpp/src/arrow/ipc/metadata.cc | 72 ++++------- cpp/src/arrow/ipc/metadata.h | 20 +-- cpp/src/arrow/ipc/test-common.h | 10 +- cpp/src/arrow/parquet/parquet-schema-test.cc | 63 ++++----- cpp/src/arrow/parquet/schema.cc | 15 +-- cpp/src/arrow/parquet/schema.h | 11 +- cpp/src/arrow/schema-test.cc | 6 +- cpp/src/arrow/schema.cc | 20 +-- cpp/src/arrow/schema.h | 10 +- cpp/src/arrow/table-test.cc | 16 +-- cpp/src/arrow/table.cc | 31 ++--- cpp/src/arrow/table.h | 38 ++---- cpp/src/arrow/test-util.h | 58 ++++----- cpp/src/arrow/type.cc | 8 +- cpp/src/arrow/type.h | 94 +++++--------- cpp/src/arrow/types/binary.h | 6 +- cpp/src/arrow/types/collection.h | 12 +- cpp/src/arrow/types/construct.cc | 42 +++--- cpp/src/arrow/types/construct.h | 11 +- cpp/src/arrow/types/datetime.h | 39 ++---- cpp/src/arrow/types/decimal-test.cc | 2 +- cpp/src/arrow/types/decimal.cc | 3 +- cpp/src/arrow/types/decimal.h | 11 +- cpp/src/arrow/types/json.cc | 5 +- cpp/src/arrow/types/json.h | 8 +- cpp/src/arrow/types/list-test.cc | 11 +- cpp/src/arrow/types/list.cc | 25 ++-- cpp/src/arrow/types/list.h | 65 ++++------ cpp/src/arrow/types/primitive-test.cc | 107 +++++++--------- cpp/src/arrow/types/primitive.cc | 75 ++++------- cpp/src/arrow/types/primitive.h | 128 +++++++------------ cpp/src/arrow/types/string-test.cc | 20 +-- cpp/src/arrow/types/string.cc | 10 +- cpp/src/arrow/types/string.h | 48 +++---- cpp/src/arrow/types/struct-test.cc | 4 +- cpp/src/arrow/types/struct.cc | 4 +- cpp/src/arrow/types/struct.h | 6 +- cpp/src/arrow/types/test-common.h | 9 +- cpp/src/arrow/types/union.cc | 6 +- cpp/src/arrow/types/union.h | 17 +-- cpp/src/arrow/util/bit-util-test.cc | 2 +- cpp/src/arrow/util/bit-util.cc | 10 +- cpp/src/arrow/util/bit-util.h | 6 +- cpp/src/arrow/util/buffer-test.cc | 5 +- cpp/src/arrow/util/buffer.cc | 16 +-- cpp/src/arrow/util/buffer.h | 61 +++------ cpp/src/arrow/util/logging.h | 78 +++++++---- cpp/src/arrow/util/macros.h | 6 +- cpp/src/arrow/util/memory-pool-test.cc | 2 +- cpp/src/arrow/util/memory-pool.cc | 2 +- cpp/src/arrow/util/memory-pool.h | 4 +- cpp/src/arrow/util/random.h | 27 ++-- cpp/src/arrow/util/status.cc | 10 +- cpp/src/arrow/util/status.h | 45 ++++--- cpp/src/arrow/util/test_main.cc | 2 
+- cpp/thirdparty/build_thirdparty.sh | 4 +- 84 files changed, 1015 insertions(+), 1155 deletions(-) create mode 100755 cpp/build-support/run-clang-format.sh create mode 100755 cpp/build-support/run-clang-tidy.sh create mode 100644 cpp/cmake_modules/FindClangTools.cmake create mode 100644 cpp/src/.clang-format create mode 100644 cpp/src/.clang-tidy diff --git a/.travis.yml b/.travis.yml index d89a200b892..a0138a79598 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,14 @@ sudo: required -dist: trusty +dist: precise addons: apt: sources: - ubuntu-toolchain-r-test - kalakris-cmake + - llvm-toolchain-precise-3.7 packages: + - clang-format-3.7 + - clang-tidy-3.7 - gcc-4.9 # Needed for C++11 - g++-4.9 # Needed for C++11 - gdb diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index 997bdf35e83..c9b3b5f1442 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -7,6 +7,10 @@ set -e pushd $CPP_BUILD_DIR make lint +if [ $TRAVIS_OS_NAME == "linux" ]; then + make check-format + make check-clang-tidy +fi ctest -L unittest diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 26d12d24247..f803c0fb3e4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -30,10 +30,11 @@ set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty") # Must be declared in the top-level CMakeLists.txt. set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true) -# Generate a Clang compile_commands.json "compilation database" file for use -# with various development tools, such as Vim's YouCompleteMe plugin. -# See http://clang.llvm.org/docs/JSONCompilationDatabase.html -if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1") +find_package(ClangTools) +if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND) + # Generate a Clang compile_commands.json "compilation database" file for use + # with various development tools, such as Vim's YouCompleteMe plugin. + # See http://clang.llvm.org/docs/JSONCompilationDatabase.html set(CMAKE_EXPORT_COMPILE_COMMANDS 1) endif() @@ -540,6 +541,36 @@ if (UNIX) `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) endif (UNIX) + +############################################################ +# "make format" and "make check-format" targets +############################################################ +if (${CLANG_FORMAT_FOUND}) + # runs clang format and updates files in place. + add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1 + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) + + # runs clang format and exits with a non-zero exit code if any files need to be reformatted + add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0 + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) +endif() + + +############################################################ +# "make clang-tidy" and "make check-clang-tidy" targets +############################################################ +if (${CLANG_TIDY_FOUND}) + # runs clang-tidy and attempts to fix any warning automatically + add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 + `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) + # runs clang-tidy and exits with a non-zero exit code if any errors are found. 
+ add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json + 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) + +endif() + + + ############################################################ # Subdirectories ############################################################ diff --git a/cpp/README.md b/cpp/README.md index 9026cf963f8..3f5da21b7d4 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -61,3 +61,19 @@ variables * Googletest: `GTEST_HOME` (only required to build the unit tests) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) * Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions) + +## Continuous Integration + +Pull requests are run through travis-ci for continuous integration. You can avoid +build failures by running the following checks before submitting your pull request: + + make unittest + make lint + # The next two commands may change your code. It is recommended you commit + # before running them. + make clang-tidy # requires clang-tidy to be installed + make format # requires clang-format to be installed + +Note that the clang-tidy target may take a while to run. You might consider +running clang-tidy separately on the files you have added/changed before +invoking the make target to reduce iteration time. diff --git a/cpp/build-support/run-clang-format.sh b/cpp/build-support/run-clang-format.sh new file mode 100755 index 00000000000..ba525dfc33c --- /dev/null +++ b/cpp/build-support/run-clang-format.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Runs clang format in the given directory +# Arguments: +# $1 - Path to the source tree +# $2 - Path to the clang format binary +# $3 - Apply fixes (will raise an error if false and there were changes) +# $ARGN - Files to run clang format on +# +SOURCE_DIR=$1 +shift +CLANG_FORMAT=$1 +shift +APPLY_FIXES=$1 +shift + +# clang format will only find its configuration if we are in +# the source tree or in a path relative to the source tree +pushd $SOURCE_DIR +if [ "$APPLY_FIXES" == "1" ]; then + $CLANG_FORMAT -i $@ +else + + NUM_CORRECTIONS=`$CLANG_FORMAT -output-replacements-xml $@ | grep offset | wc -l` + if [ "$NUM_CORRECTIONS" -gt "0" ]; then + echo "clang-format suggested changes, please run 'make format'!!!!" + exit 1 + fi +fi +popd diff --git a/cpp/build-support/run-clang-tidy.sh b/cpp/build-support/run-clang-tidy.sh new file mode 100755 index 00000000000..4ba8ab8cd76 --- /dev/null +++ b/cpp/build-support/run-clang-tidy.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# +# Runs clang tidy in the given directory +# Arguments: +# $1 - Path to the clang tidy binary +# $2 - Path to the compile_commands.json to use +# $3 - Apply fixes (will raise an error if false and there were changes) +# $ARGN - Files to run clang-tidy on +# +CLANG_TIDY=$1 +shift +COMPILE_COMMANDS=$1 +shift +APPLY_FIXES=$1 +shift + +# clang format will only find its configuration if we are in +# the source tree or in a path relative to the source tree +if [ "$APPLY_FIXES" == "1" ]; then + $CLANG_TIDY -p $COMPILE_COMMANDS -fix $@ +else + NUM_CORRECTIONS=`$CLANG_TIDY -p $COMPILE_COMMANDS $@ 2>&1 | grep -v Skipping | grep "warnings* generated" | wc -l` + if [ "$NUM_CORRECTIONS" -gt "0" ]; then + echo "clang-tidy had suggested fixes. Please fix these!!!" + exit 1 + fi +fi diff --git a/cpp/cmake_modules/FindClangTools.cmake b/cpp/cmake_modules/FindClangTools.cmake new file mode 100644 index 00000000000..c07c7d24449 --- /dev/null +++ b/cpp/cmake_modules/FindClangTools.cmake @@ -0,0 +1,60 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# +# Tries to find the clang-tidy and clang-format modules +# +# Usage of this module is as follows: +# +# find_package(ClangTools) +# +# Variables used by this module; they can change the default behaviour and need +# to be set before calling find_package: +# +# ClangToolsBin_HOME - +# When set, this path is inspected instead of standard library binary locations +# to find clang-tidy and clang-format +# +# This module defines +# CLANG_TIDY_BIN, The path to the clang tidy binary +# CLANG_TIDY_FOUND, Whether clang tidy was found +# CLANG_FORMAT_BIN, The path to the clang format binary +# CLANG_FORMAT_FOUND, Whether clang format was found + +find_program(CLANG_TIDY_BIN + NAMES clang-tidy-3.8 clang-tidy-3.7 clang-tidy-3.6 clang-tidy + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + NO_DEFAULT_PATH +) + +if ( "${CLANG_TIDY_BIN}" STREQUAL "CLANG_TIDY_BIN-NOTFOUND" ) + set(CLANG_TIDY_FOUND 0) + message("clang-tidy not found") +else() + set(CLANG_TIDY_FOUND 1) + message("clang-tidy found at ${CLANG_TIDY_BIN}") +endif() + +find_program(CLANG_FORMAT_BIN + NAMES clang-format-3.8 clang-format-3.7 clang-format-3.6 clang-format + PATHS ${ClangTools_PATH} $ENV{CLANG_TOOLS_PATH} /usr/local/bin /usr/bin + NO_DEFAULT_PATH +) + +if ( "${CLANG_FORMAT_BIN}" STREQUAL "CLANG_FORMAT_BIN-NOTFOUND" ) + set(CLANG_FORMAT_FOUND 0) + message("clang-format not found") +else() + set(CLANG_FORMAT_FOUND 1) + message("clang-format found at ${CLANG_FORMAT_BIN}") +endif() + diff --git a/cpp/src/.clang-format b/cpp/src/.clang-format new file mode 100644 index 00000000000..7d5b3cf30ef --- /dev/null +++ b/cpp/src/.clang-format @@ -0,0 +1,65 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: false +AlignConsecutiveAssignments: false +AlignEscapedNewlinesLeft: true +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +ColumnLimit: 90 +CommentPragmas: '^ IWYU pragma:' +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakBeforeFirstCallParameter: 1000 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +SpaceAfterCStyleCast: false +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never diff --git a/cpp/src/.clang-tidy b/cpp/src/.clang-tidy new file mode 100644 index 00000000000..deaa9bdf97f --- /dev/null +++ b/cpp/src/.clang-tidy @@ -0,0 +1,14 @@ +--- +Checks: 'clang-diagnostic-*,clang-analyzer-*,-clang-analyzer-alpha*,google-.*,modernize-.*,readablity-.*' +HeaderFilterRegex: 'arrow/.*' +AnalyzeTemporaryDtors: true +CheckOptions: + - key: google-readability-braces-around-statements.ShortStatementLines + value: '1' + - key: google-readability-function-size.StatementThreshold + value: '800' + - key: google-readability-namespace-comments.ShortNamespaceLines + value: '10' + - key: google-readability-namespace-comments.SpacesBeforeComments + value: '2' + diff --git a/cpp/src/arrow/api.h b/cpp/src/arrow/api.h index 2ae80f642f2..2d317b49cb7 100644 --- a/cpp/src/arrow/api.h +++ b/cpp/src/arrow/api.h @@ -37,4 +37,4 @@ #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" -#endif // ARROW_API_H +#endif // ARROW_API_H diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 121b802d994..b4c727997ee 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -33,15 +33,12 @@ namespace arrow { class TestArray : public ::testing::Test { public: - void SetUp() { - pool_ = default_memory_pool(); - } + void SetUp() { pool_ = default_memory_pool(); } protected: MemoryPool* pool_; }; - TEST_F(TestArray, TestNullCount) { auto data = std::make_shared(pool_); auto null_bitmap = std::make_shared(pool_); @@ -53,7 +50,6 @@ TEST_F(TestArray, TestNullCount) { ASSERT_EQ(0, arr_no_nulls->null_count()); } - TEST_F(TestArray, TestLength) { auto data = std::make_shared(pool_); std::unique_ptr arr(new Int32Array(100, data)); @@ -61,14 +57,16 @@ TEST_F(TestArray, TestLength) { } TEST_F(TestArray, TestIsNull) { + // clang-format off std::vector null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1}; + // clang-format on int32_t null_count = 0; for (uint8_t x : null_bitmap) { - if (x == 0) ++null_count; + if (x == 0) { ++null_count; } } std::shared_ptr null_buf = test::bytes_to_null_buffer(null_bitmap); @@ -85,8 +83,6 @@ TEST_F(TestArray, TestIsNull) { } } +TEST_F(TestArray, TestCopy) {} -TEST_F(TestArray, TestCopy) { -} - -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 3736732740b..a1536861a20 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -32,30 +32,25 @@ Array::Array(const TypePtr& type, int32_t length, int32_t null_count, length_ = length; null_count_ = null_count; null_bitmap_ = null_bitmap; - if (null_bitmap_) { - null_bitmap_data_ = null_bitmap_->data(); - } + if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); } } bool Array::EqualsExact(const Array& other) const { - if (this == &other) return true; + if (this == &other) { return true; } if (length_ != other.length_ || null_count_ != other.null_count_ || type_enum() != other.type_enum()) { return false; } if (null_count_ > 0) { return null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); - } else { - return true; } + return true; } bool NullArray::Equals(const std::shared_ptr& arr) const { - if (this == arr.get()) return true; - if (Type::NA != arr->type_enum()) { - return false; - } + if (this == arr.get()) { return true; } + if (Type::NA != arr->type_enum()) { return false; } 
return arr->length() == length_; } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 097634d74f8..c6735f87d8f 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -36,8 +36,7 @@ class Buffer; // count is greater than 0 class Array { public: - Array(const std::shared_ptr& type, int32_t length, - int32_t null_count = 0, + Array(const std::shared_ptr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); virtual ~Array() {} @@ -47,19 +46,15 @@ class Array { return null_count_ > 0 && util::bit_not_set(null_bitmap_data_, i); } - int32_t length() const { return length_;} - int32_t null_count() const { return null_count_;} + int32_t length() const { return length_; } + int32_t null_count() const { return null_count_; } - const std::shared_ptr& type() const { return type_;} - Type::type type_enum() const { return type_->type;} + const std::shared_ptr& type() const { return type_; } + Type::type type_enum() const { return type_->type; } - const std::shared_ptr& null_bitmap() const { - return null_bitmap_; - } + const std::shared_ptr& null_bitmap() const { return null_bitmap_; } - const uint8_t* null_bitmap_data() const { - return null_bitmap_data_; - } + const uint8_t* null_bitmap_data() const { return null_bitmap_data_; } bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; @@ -80,17 +75,16 @@ class Array { // Degenerate null type Array class NullArray : public Array { public: - NullArray(const std::shared_ptr& type, int32_t length) : - Array(type, length, length, nullptr) {} + NullArray(const std::shared_ptr& type, int32_t length) + : Array(type, length, length, nullptr) {} - explicit NullArray(int32_t length) : - NullArray(std::make_shared(), length) {} + explicit NullArray(int32_t length) : NullArray(std::make_shared(), length) {} bool Equals(const std::shared_ptr& arr) const override; }; typedef std::shared_ptr ArrayPtr; -} // namespace arrow +} // namespace arrow #endif diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 4061f35fd5e..1447078f760 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -62,4 +62,4 @@ Status ArrayBuilder::Reserve(int32_t elements) { return Status::OK(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index d1a49dce799..21a6341ef50 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -37,30 +37,26 @@ static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; // Base class for all data array builders class ArrayBuilder { public: - explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : - pool_(pool), - type_(type), - null_bitmap_(nullptr), - null_count_(0), - null_bitmap_data_(nullptr), - length_(0), - capacity_(0) {} + explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) + : pool_(pool), + type_(type), + null_bitmap_(nullptr), + null_count_(0), + null_bitmap_data_(nullptr), + length_(0), + capacity_(0) {} virtual ~ArrayBuilder() {} // For nested types. 
Since the objects are owned by this class instance, we // skip shared pointers and just return a raw pointer - ArrayBuilder* child(int i) { - return children_[i].get(); - } + ArrayBuilder* child(int i) { return children_[i].get(); } - int num_children() const { - return children_.size(); - } + int num_children() const { return children_.size(); } - int32_t length() const { return length_;} - int32_t null_count() const { return null_count_;} - int32_t capacity() const { return capacity_;} + int32_t length() const { return length_; } + int32_t null_count() const { return null_count_; } + int32_t capacity() const { return capacity_; } // Allocates requires memory at this level, but children need to be // initialized independently @@ -76,15 +72,13 @@ class ArrayBuilder { // this function responsibly. Status Advance(int32_t elements); - const std::shared_ptr& null_bitmap() const { return null_bitmap_;} + const std::shared_ptr& null_bitmap() const { return null_bitmap_; } // Creates new array object to hold the contents of the builder and transfers // ownership of the data virtual std::shared_ptr Finish() = 0; - const std::shared_ptr& type() const { - return type_; - } + const std::shared_ptr& type() const { return type_; } protected: MemoryPool* pool_; @@ -107,6 +101,6 @@ class ArrayBuilder { DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_BUILDER_H_ +#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index 335d581782a..edea0948860 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. - #include "benchmark/benchmark.h" #include "arrow/test-util.h" @@ -24,19 +23,19 @@ namespace arrow { namespace { - template - std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { - auto pool = default_memory_pool(); - auto data = std::make_shared(pool); - auto null_bitmap = std::make_shared(pool); - data->Resize(length * sizeof(typename ArrayType::value_type)); - null_bitmap->Resize(util::bytes_for_bits(length)); - return std::make_shared(length, data, 10, null_bitmap); - } +template +std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { + auto pool = default_memory_pool(); + auto data = std::make_shared(pool); + auto null_bitmap = std::make_shared(pool); + data->Resize(length * sizeof(typename ArrayType::value_type)); + null_bitmap->Resize(util::bytes_for_bits(length)); + return std::make_shared(length, data, 10, null_bitmap); +} } // anonymous namespace - -static void BM_BuildInt32ColumnByChunk(benchmark::State& state) { //NOLINT non-const reference +static void BM_BuildInt32ColumnByChunk( + benchmark::State& state) { // NOLINT non-const reference ArrayVector arrays; for (int chunk_n = 0; chunk_n < state.range_x(); ++chunk_n) { arrays.push_back(MakePrimitive(100, 10)); diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 0630785630e..1edf313d49b 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -72,4 +72,4 @@ TEST_F(TestColumn, ChunksInhomogeneous) { ASSERT_RAISES(Invalid, column_->ValidateData()); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/column.cc b/cpp/src/arrow/column.cc index 46acf8df2ff..52e4c58e1dc 100644 --- a/cpp/src/arrow/column.cc +++ b/cpp/src/arrow/column.cc @@ -26,8 +26,7 @@ namespace arrow { -ChunkedArray::ChunkedArray(const 
ArrayVector& chunks) : - chunks_(chunks) { +ChunkedArray::ChunkedArray(const ArrayVector& chunks) : chunks_(chunks) { length_ = 0; null_count_ = 0; for (const std::shared_ptr& chunk : chunks) { @@ -36,35 +35,31 @@ ChunkedArray::ChunkedArray(const ArrayVector& chunks) : } } -Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) : - field_(field) { +Column::Column(const std::shared_ptr& field, const ArrayVector& chunks) + : field_(field) { data_ = std::make_shared(chunks); } -Column::Column(const std::shared_ptr& field, - const std::shared_ptr& data) : - field_(field) { +Column::Column(const std::shared_ptr& field, const std::shared_ptr& data) + : field_(field) { data_ = std::make_shared(ArrayVector({data})); } -Column::Column(const std::shared_ptr& field, - const std::shared_ptr& data) : - field_(field), - data_(data) {} +Column::Column( + const std::shared_ptr& field, const std::shared_ptr& data) + : field_(field), data_(data) {} Status Column::ValidateData() { for (int i = 0; i < data_->num_chunks(); ++i) { const std::shared_ptr& type = data_->chunk(i)->type(); if (!this->type()->Equals(type)) { std::stringstream ss; - ss << "In chunk " << i << " expected type " - << this->type()->ToString() - << " but saw " - << type->ToString(); + ss << "In chunk " << i << " expected type " << this->type()->ToString() + << " but saw " << type->ToString(); return Status::Invalid(ss.str()); } } return Status::OK(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index 1ad97b20863..22becc34547 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -39,21 +39,13 @@ class ChunkedArray { explicit ChunkedArray(const ArrayVector& chunks); // @returns: the total length of the chunked array; computed on construction - int64_t length() const { - return length_; - } + int64_t length() const { return length_; } - int64_t null_count() const { - return null_count_; - } + int64_t null_count() const { return null_count_; } - int num_chunks() const { - return chunks_.size(); - } + int num_chunks() const { return chunks_.size(); } - const std::shared_ptr& chunk(int i) const { - return chunks_[i]; - } + const std::shared_ptr& chunk(int i) const { return chunks_[i]; } protected: ArrayVector chunks_; @@ -67,33 +59,22 @@ class ChunkedArray { class Column { public: Column(const std::shared_ptr& field, const ArrayVector& chunks); - Column(const std::shared_ptr& field, - const std::shared_ptr& data); + Column(const std::shared_ptr& field, const std::shared_ptr& data); Column(const std::shared_ptr& field, const std::shared_ptr& data); - int64_t length() const { - return data_->length(); - } + int64_t length() const { return data_->length(); } - int64_t null_count() const { - return data_->null_count(); - } + int64_t null_count() const { return data_->null_count(); } // @returns: the column's name in the passed metadata - const std::string& name() const { - return field_->name; - } + const std::string& name() const { return field_->name; } // @returns: the column's type according to the metadata - const std::shared_ptr& type() const { - return field_->type; - } + const std::shared_ptr& type() const { return field_->type; } // @returns: the column's data as a chunked logical array - const std::shared_ptr& data() const { - return data_; - } + const std::shared_ptr& data() const { return data_; } // Verify that the column's array data is consistent with the passed field's // metadata Status ValidateData(); @@ -103,6 +84,6 @@ class Column { 
std::shared_ptr data_; }; -} // namespace arrow +} // namespace arrow #endif // ARROW_COLUMN_H diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index c79e8469530..2f72c3aa846 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -94,8 +94,7 @@ Status VisitArray(const Array* arr, std::vector* field_nodes class RowBatchWriter { public: - explicit RowBatchWriter(const RowBatch* batch) : - batch_(batch) {} + explicit RowBatchWriter(const RowBatch* batch) : batch_(batch) {} Status AssemblePayload() { // Perform depth-first traversal of the row-batch @@ -138,12 +137,12 @@ class RowBatchWriter { // determine the data header size then request a buffer such that you can // construct the flatbuffer data accessor object (see arrow::ipc::Message) std::shared_ptr data_header; - RETURN_NOT_OK(WriteDataHeader(batch_->num_rows(), offset, - field_nodes_, buffer_meta_, &data_header)); + RETURN_NOT_OK(WriteDataHeader( + batch_->num_rows(), offset, field_nodes_, buffer_meta_, &data_header)); // Write the data header at the end - RETURN_NOT_OK(dst->Write(position + offset, data_header->data(), - data_header->size())); + RETURN_NOT_OK( + dst->Write(position + offset, data_header->data(), data_header->size())); *data_header_offset = position + offset; return Status::OK(); @@ -174,8 +173,8 @@ class RowBatchWriter { std::vector> buffers_; }; -Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, - int64_t* header_offset) { +Status WriteRowBatch( + MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset) { RowBatchWriter serializer(batch); RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); @@ -187,15 +186,14 @@ static constexpr int64_t INIT_METADATA_SIZE = 4096; class RowBatchReader::Impl { public: - Impl(MemorySource* source, const std::shared_ptr& metadata) : - source_(source), - metadata_(metadata) { + Impl(MemorySource* source, const std::shared_ptr& metadata) + : source_(source), metadata_(metadata) { num_buffers_ = metadata->num_buffers(); num_flattened_fields_ = metadata->num_fields(); } - Status AssembleBatch(const std::shared_ptr& schema, - std::shared_ptr* out) { + Status AssembleBatch( + const std::shared_ptr& schema, std::shared_ptr* out) { std::vector> arrays(schema->num_fields()); // The field_index and buffer_index are incremented in NextArray based on @@ -208,8 +206,7 @@ class RowBatchReader::Impl { RETURN_NOT_OK(NextArray(field, &arrays[i])); } - *out = std::make_shared(schema, metadata_->length(), - arrays); + *out = std::make_shared(schema, metadata_->length(), arrays); return Status::OK(); } @@ -243,11 +240,10 @@ class RowBatchReader::Impl { } else { data.reset(new Buffer(nullptr, 0)); } - return MakePrimitiveArray(type, field_meta.length, data, - field_meta.null_count, null_bitmap, out); - } else { - return Status::NotImplemented("Non-primitive types not complete yet"); + return MakePrimitiveArray( + type, field_meta.length, data, field_meta.null_count, null_bitmap, out); } + return Status::NotImplemented("Non-primitive types not complete yet"); } Status GetBuffer(int buffer_index, std::shared_ptr* out) { @@ -264,8 +260,8 @@ class RowBatchReader::Impl { int num_flattened_fields_; }; -Status RowBatchReader::Open(MemorySource* source, int64_t position, - std::shared_ptr* out) { +Status RowBatchReader::Open( + MemorySource* source, int64_t position, std::shared_ptr* out) { std::shared_ptr metadata; RETURN_NOT_OK(source->ReadAt(position, 
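// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch; assumed to sit inside
// namespace arrow::ipc, with the reader's out-type inferred): the write/read
// flow of the adapter API above -- the header offset returned by
// WriteRowBatch is handed to RowBatchReader::Open, mirroring
// ipc-adapter-test.cc later in this patch.
Status RoundTripSketch(
    MemorySource* source, const RowBatch* batch, std::shared_ptr<RowBatch>* out) {
  int64_t header_offset = 0;
  RETURN_NOT_OK(WriteRowBatch(source, batch, 0, &header_offset));
  std::shared_ptr<RowBatchReader> reader;
  RETURN_NOT_OK(RowBatchReader::Open(source, header_offset, &reader));
  return reader->GetRowBatch(batch->schema(), out);
}
// ---------------------------------------------------------------------------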
INIT_METADATA_SIZE, &metadata)); @@ -274,8 +270,7 @@ Status RowBatchReader::Open(MemorySource* source, int64_t position, // We may not need to call source->ReadAt again if (metadata_size > static_cast(INIT_METADATA_SIZE - sizeof(int32_t))) { // We don't have enough data, read the indicated metadata size. - RETURN_NOT_OK(source->ReadAt(position + sizeof(int32_t), - metadata_size, &metadata)); + RETURN_NOT_OK(source->ReadAt(position + sizeof(int32_t), metadata_size, &metadata)); } // TODO(wesm): buffer slicing here would be better in case ReadAt returns @@ -297,11 +292,10 @@ Status RowBatchReader::Open(MemorySource* source, int64_t position, return Status::OK(); } -Status RowBatchReader::GetRowBatch(const std::shared_ptr& schema, - std::shared_ptr* out) { +Status RowBatchReader::GetRowBatch( + const std::shared_ptr& schema, std::shared_ptr* out) { return impl_->AssembleBatch(schema, out); } - -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index 26dea6d04b8..d453fa05f49 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -52,8 +52,8 @@ class RecordBatchMessage; // // Finally, the memory offset to the start of the metadata / data header is // returned in an out-variable -Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, - int64_t* header_offset); +Status WriteRowBatch( + MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset); // int64_t GetRowBatchMetadata(const RowBatch* batch); @@ -67,20 +67,20 @@ int64_t GetRowBatchSize(const RowBatch* batch); class RowBatchReader { public: - static Status Open(MemorySource* source, int64_t position, - std::shared_ptr* out); + static Status Open( + MemorySource* source, int64_t position, std::shared_ptr* out); // Reassemble the row batch. 
A Schema is required to be able to construct the // right array containers - Status GetRowBatch(const std::shared_ptr& schema, - std::shared_ptr* out); + Status GetRowBatch( + const std::shared_ptr& schema, std::shared_ptr* out); private: class Impl; std::unique_ptr impl_; }; -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow -#endif // ARROW_IPC_MEMORY_H +#endif // ARROW_IPC_MEMORY_H diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 79b4d710d28..fbdda77e491 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -42,12 +42,8 @@ namespace ipc { class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { public: - void SetUp() { - pool_ = default_memory_pool(); - } - void TearDown() { - MemoryMapFixture::TearDown(); - } + void SetUp() { pool_ = default_memory_pool(); } + void TearDown() { MemoryMapFixture::TearDown(); } void InitMemoryMap(int64_t size) { std::string path = "test-write-row-batch"; @@ -83,8 +79,8 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { test::random_bytes(null_bytes, 0, null_bitmap->mutable_data()); auto a0 = std::make_shared(length, data); - auto a1 = std::make_shared(length, data, - test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); + auto a1 = std::make_shared( + length, data, test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); RowBatch batch(schema, length, {a0, a1}); @@ -103,10 +99,10 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); for (int i = 0; i < batch.num_columns(); ++i) { - EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) - << i << batch.column_name(i); + EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) << i + << batch.column_name(i); } } -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-memory-test.cc b/cpp/src/arrow/ipc/ipc-memory-test.cc index 332ad2a2b80..19339212225 100644 --- a/cpp/src/arrow/ipc/ipc-memory-test.cc +++ b/cpp/src/arrow/ipc/ipc-memory-test.cc @@ -35,13 +35,10 @@ namespace ipc { class TestMemoryMappedSource : public ::testing::Test, public MemoryMapFixture { public: - void TearDown() { - MemoryMapFixture::TearDown(); - } + void TearDown() { MemoryMapFixture::TearDown(); } }; -TEST_F(TestMemoryMappedSource, InvalidUsages) { -} +TEST_F(TestMemoryMappedSource, InvalidUsages) {} TEST_F(TestMemoryMappedSource, WriteRead) { const int64_t buffer_size = 1024; @@ -74,9 +71,9 @@ TEST_F(TestMemoryMappedSource, InvalidFile) { std::string non_existent_path = "invalid-file-name-asfd"; std::shared_ptr result; - ASSERT_RAISES(IOError, MemoryMappedSource::Open(non_existent_path, - MemorySource::READ_ONLY, &result)); + ASSERT_RAISES(IOError, + MemoryMappedSource::Open(non_existent_path, MemorySource::READ_ONLY, &result)); } -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-metadata-test.cc b/cpp/src/arrow/ipc/ipc-metadata-test.cc index ceabec0fa7c..51d79cfb4c4 100644 --- a/cpp/src/arrow/ipc/ipc-metadata-test.cc +++ b/cpp/src/arrow/ipc/ipc-metadata-test.cc @@ -86,14 +86,12 @@ TEST_F(TestSchemaMessage, NestedFields) { auto type = std::make_shared(std::make_shared()); auto f0 = std::make_shared("f0", type); - std::shared_ptr type2(new StructType({ - std::make_shared("k1", INT32), - std::make_shared("k2", INT32), - std::make_shared("k3", INT32)})); + std::shared_ptr type2(new 
StructType({std::make_shared("k1", INT32), + std::make_shared("k2", INT32), std::make_shared("k3", INT32)})); auto f1 = std::make_shared("f1", type2); Schema schema({f0, f1}); CheckRoundtrip(&schema); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index e630ccd109b..2b077e97929 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -17,7 +17,7 @@ #include "arrow/ipc/memory.h" -#include // For memory-mapping +#include // For memory-mapping #include #include #include @@ -32,8 +32,7 @@ namespace arrow { namespace ipc { -MemorySource::MemorySource(AccessMode access_mode) : - access_mode_(access_mode) {} +MemorySource::MemorySource(AccessMode access_mode) : access_mode_(access_mode) {} MemorySource::~MemorySource() {} @@ -41,10 +40,7 @@ MemorySource::~MemorySource() {} class MemoryMappedSource::Impl { public: - Impl() : - file_(nullptr), - is_open_(false), - data_(nullptr) {} + Impl() : file_(nullptr), is_open_(false), data_(nullptr) {} ~Impl() { if (is_open_) { @@ -54,9 +50,7 @@ class MemoryMappedSource::Impl { } Status Open(const std::string& path, MemorySource::AccessMode mode) { - if (is_open_) { - return Status::IOError("A file is already open"); - } + if (is_open_) { return Status::IOError("A file is already open"); } path_ = path; @@ -72,18 +66,15 @@ class MemoryMappedSource::Impl { } fseek(file_, 0L, SEEK_END); - if (ferror(file_)) { - return Status::IOError("Unable to seek to end of file"); - } + if (ferror(file_)) { return Status::IOError("Unable to seek to end of file"); } size_ = ftell(file_); fseek(file_, 0L, SEEK_SET); is_open_ = true; // TODO(wesm): Add read-only version of this - data_ = reinterpret_cast(mmap(nullptr, size_, - PROT_READ | PROT_WRITE, - MAP_SHARED, fileno(file_), 0)); + data_ = reinterpret_cast( + mmap(nullptr, size_, PROT_READ | PROT_WRITE, MAP_SHARED, fileno(file_), 0)); if (data_ == nullptr) { std::stringstream ss; ss << "Memory mapping file failed, errno: " << errno; @@ -93,13 +84,9 @@ class MemoryMappedSource::Impl { return Status::OK(); } - int64_t size() const { - return size_; - } + int64_t size() const { return size_; } - uint8_t* data() { - return data_; - } + uint8_t* data() { return data_; } private: std::string path_; @@ -111,8 +98,8 @@ class MemoryMappedSource::Impl { uint8_t* data_; }; -MemoryMappedSource::MemoryMappedSource(AccessMode access_mode) : - MemorySource(access_mode) {} +MemoryMappedSource::MemoryMappedSource(AccessMode access_mode) + : MemorySource(access_mode) {} Status MemoryMappedSource::Open(const std::string& path, AccessMode access_mode, std::shared_ptr* out) { @@ -134,8 +121,8 @@ Status MemoryMappedSource::Close() { return Status::OK(); } -Status MemoryMappedSource::ReadAt(int64_t position, int64_t nbytes, - std::shared_ptr* out) { +Status MemoryMappedSource::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { if (position < 0 || position >= impl_->size()) { return Status::Invalid("position is out of bounds"); } @@ -145,8 +132,7 @@ Status MemoryMappedSource::ReadAt(int64_t position, int64_t nbytes, return Status::OK(); } -Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, - int64_t nbytes) { +Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { if (position < 0 || position >= impl_->size()) { return Status::Invalid("position is out of bounds"); } @@ -158,5 +144,5 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, return Status::OK(); } -} // 
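// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch; the path is a placeholder and
// the file is assumed to already exist with nonzero size, as the test
// fixture arranges via ftruncate): Open / Write / ReadAt against the
// MemoryMappedSource implementation above.
Status MmapRoundTripSketch() {
  std::shared_ptr<MemoryMappedSource> source;
  RETURN_NOT_OK(MemoryMappedSource::Open(
      "some-existing-file", MemorySource::READ_WRITE, &source));
  const uint8_t payload[4] = {1, 2, 3, 4};
  RETURN_NOT_OK(source->Write(0, payload, 4));  // bounds-checked against size()
  std::shared_ptr<Buffer> result;
  return source->ReadAt(0, 4, &result);  // wraps the mapped bytes in a Buffer
}
// ---------------------------------------------------------------------------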
namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index 0b4d8347c34..e529603dc6e 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -52,8 +52,8 @@ class OutputStream { // memory map class BufferOutputStream : public OutputStream { public: - explicit BufferOutputStream(const std::shared_ptr& buffer): - buffer_(buffer) {} + explicit BufferOutputStream(const std::shared_ptr& buffer) + : buffer_(buffer) {} // Implement the OutputStream interface Status Close() override; @@ -72,10 +72,7 @@ class BufferOutputStream : public OutputStream { class MemorySource { public: // Indicates the access permissions of the memory source - enum AccessMode { - READ_ONLY, - READ_WRITE - }; + enum AccessMode { READ_ONLY, READ_WRITE }; virtual ~MemorySource(); @@ -83,8 +80,8 @@ class MemorySource { // the indicated location // @returns: arrow::Status indicating success / failure. The buffer is set // into the *out argument - virtual Status ReadAt(int64_t position, int64_t nbytes, - std::shared_ptr* out) = 0; + virtual Status ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) = 0; virtual Status Close() = 0; @@ -110,8 +107,7 @@ class MemoryMappedSource : public MemorySource { Status Close() override; - Status ReadAt(int64_t position, int64_t nbytes, - std::shared_ptr* out) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; @@ -125,7 +121,7 @@ class MemoryMappedSource : public MemorySource { std::unique_ptr impl_; }; -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow -#endif // ARROW_IPC_MEMORY_H +#endif // ARROW_IPC_MEMORY_H diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 14b186906c3..ad5951d17e2 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -52,11 +52,12 @@ const std::shared_ptr UINT64 = std::make_shared(); const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); -static Status IntFromFlatbuffer(const flatbuf::Int* int_data, - std::shared_ptr* out) { +static Status IntFromFlatbuffer( + const flatbuf::Int* int_data, std::shared_ptr* out) { if (int_data->bitWidth() % 8 != 0) { return Status::NotImplemented("Integers not in cstdint are not implemented"); - } else if (int_data->bitWidth() > 64) { + } + if (int_data->bitWidth() > 64) { return Status::NotImplemented("Integers with more than 64 bits not implemented"); } @@ -80,8 +81,8 @@ static Status IntFromFlatbuffer(const flatbuf::Int* int_data, return Status::OK(); } -static Status FloatFromFlatuffer(const flatbuf::FloatingPoint* float_data, - std::shared_ptr* out) { +static Status FloatFromFlatuffer( + const flatbuf::FloatingPoint* float_data, std::shared_ptr* out) { if (float_data->precision() == flatbuf::Precision_SINGLE) { *out = FLOAT; } else { @@ -90,9 +91,8 @@ static Status FloatFromFlatuffer(const flatbuf::FloatingPoint* float_data, return Status::OK(); } -static Status TypeFromFlatbuffer(flatbuf::Type type, - const void* type_data, const std::vector>& children, - std::shared_ptr* out) { +static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, + const std::vector>& children, std::shared_ptr* out) { switch (type) { case flatbuf::Type_NONE: return Status::Invalid("Type metadata cannot be none"); @@ -101,8 +101,8 
@@ static Status TypeFromFlatbuffer(flatbuf::Type type, case flatbuf::Type_Bit: return Status::NotImplemented("Type is not implemented"); case flatbuf::Type_FloatingPoint: - return FloatFromFlatuffer(static_cast(type_data), - out); + return FloatFromFlatuffer( + static_cast(type_data), out); case flatbuf::Type_Binary: case flatbuf::Type_Utf8: return Status::NotImplemented("Type is not implemented"); @@ -128,16 +128,14 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, } // Forward declaration -static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, - FieldOffset* offset); +static Status FieldToFlatbuffer( + FBB& fbb, const std::shared_ptr& field, FieldOffset* offset); -static Offset IntToFlatbuffer(FBB& fbb, int bitWidth, - bool is_signed) { +static Offset IntToFlatbuffer(FBB& fbb, int bitWidth, bool is_signed) { return flatbuf::CreateInt(fbb, bitWidth, is_signed).Union(); } -static Offset FloatToFlatbuffer(FBB& fbb, - flatbuf::Precision precision) { +static Offset FloatToFlatbuffer(FBB& fbb, flatbuf::Precision precision) { return flatbuf::CreateFloatingPoint(fbb, precision).Union(); } @@ -166,10 +164,8 @@ static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type *offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \ break; - static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, - std::vector* children, - flatbuf::Type* out_type, Offset* offset) { + std::vector* children, flatbuf::Type* out_type, Offset* offset) { switch (type->type) { case Type::BOOL: *out_type = flatbuf::Type_Bool; @@ -206,16 +202,16 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, *out_type = flatbuf::Type_Tuple; return StructToFlatbuffer(fbb, type, children, offset); default: + *out_type = flatbuf::Type_NONE; // Make clang-tidy happy std::stringstream ss; - ss << "Unable to convert type: " << type->ToString() - << std::endl; + ss << "Unable to convert type: " << type->ToString() << std::endl; return Status::NotImplemented(ss.str()); } return Status::OK(); } -static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, - FieldOffset* offset) { +static Status FieldToFlatbuffer( + FBB& fbb, const std::shared_ptr& field, FieldOffset* offset) { auto fb_name = fbb.CreateString(field->name); flatbuf::Type type_enum; @@ -225,14 +221,13 @@ static Status FieldToFlatbuffer(FBB& fbb, const std::shared_ptr& field, RETURN_NOT_OK(TypeToFlatbuffer(fbb, field->type, &children, &type_enum, &type_data)); auto fb_children = fbb.CreateVector(children); - *offset = flatbuf::CreateField(fbb, fb_name, field->nullable, type_enum, - type_data, fb_children); + *offset = flatbuf::CreateField( + fbb, fb_name, field->nullable, type_enum, type_data, fb_children); return Status::OK(); } -Status FieldFromFlatbuffer(const flatbuf::Field* field, - std::shared_ptr* out) { +Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* out) { std::shared_ptr type; auto children = field->children(); @@ -241,8 +236,8 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, RETURN_NOT_OK(FieldFromFlatbuffer(children->Get(i), &child_fields[i])); } - RETURN_NOT_OK(TypeFromFlatbuffer(field->type_type(), - field->type(), child_fields, &type)); + RETURN_NOT_OK( + TypeFromFlatbuffer(field->type_type(), field->type(), child_fields, &type)); *out = std::make_shared(field->name()->str(), type); return Status::OK(); @@ -270,19 +265,17 @@ Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers) { 
header_type_ = flatbuf::MessageHeader_RecordBatch; - header_ = flatbuf::CreateRecordBatch(fbb_, length, - fbb_.CreateVectorOfStructs(nodes), - fbb_.CreateVectorOfStructs(buffers)).Union(); + header_ = flatbuf::CreateRecordBatch(fbb_, length, fbb_.CreateVectorOfStructs(nodes), + fbb_.CreateVectorOfStructs(buffers)) + .Union(); body_length_ = body_length; return Status::OK(); } - Status WriteDataHeader(int32_t length, int64_t body_length, const std::vector& nodes, - const std::vector& buffers, - std::shared_ptr* out) { + const std::vector& buffers, std::shared_ptr* out) { MessageBuilder message; RETURN_NOT_OK(message.SetRecordBatch(length, body_length, nodes, buffers)); RETURN_NOT_OK(message.Finish()); @@ -290,8 +283,7 @@ Status WriteDataHeader(int32_t length, int64_t body_length, } Status MessageBuilder::Finish() { - auto message = flatbuf::CreateMessage(fbb_, header_type_, header_, - body_length_); + auto message = flatbuf::CreateMessage(fbb_, header_type_, header_, body_length_); fbb_.Finish(message); return Status::OK(); } @@ -313,5 +305,5 @@ Status MessageBuilder::GetBuffer(std::shared_ptr* out) { return Status::OK(); } -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index f7365d2a49f..779c5a30a04 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -36,8 +36,7 @@ class Status; namespace ipc { -Status FieldFromFlatbuffer(const flatbuf::Field* field, - std::shared_ptr* out); +Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* out); class MessageBuilder { public: @@ -60,10 +59,9 @@ class MessageBuilder { Status WriteDataHeader(int32_t length, int64_t body_length, const std::vector& nodes, - const std::vector& buffers, - std::shared_ptr* out); + const std::vector& buffers, std::shared_ptr* out); -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow -#endif // ARROW_IPC_METADATA_INTERNAL_H +#endif // ARROW_IPC_METADATA_INTERNAL_H diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index 642f21a41e6..bcf104f0b8b 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -48,10 +48,8 @@ Status WriteSchema(const Schema* schema, std::shared_ptr* out) { class Message::Impl { public: - explicit Impl(const std::shared_ptr& buffer, - const flatbuf::Message* message) : - buffer_(buffer), - message_(message) {} + explicit Impl(const std::shared_ptr& buffer, const flatbuf::Message* message) + : buffer_(buffer), message_(message) {} Message::Type type() const { switch (message_->header_type()) { @@ -66,13 +64,9 @@ class Message::Impl { } } - const void* header() const { - return message_->header(); - } + const void* header() const { return message_->header(); } - int64_t body_length() const { - return message_->bodyLength(); - } + int64_t body_length() const { return message_->bodyLength(); } private: // Owns the memory this message accesses @@ -83,16 +77,12 @@ class Message::Impl { class SchemaMessage::Impl { public: - explicit Impl(const void* schema) : - schema_(static_cast(schema)) {} + explicit Impl(const void* schema) + : schema_(static_cast(schema)) {} - const flatbuf::Field* field(int i) const { - return schema_->fields()->Get(i); - } + const flatbuf::Field* field(int i) const { return schema_->fields()->Get(i); } - int num_fields() const { - return schema_->fields()->size(); - } + int num_fields() const { return 
schema_->fields()->size(); } private: const flatbuf::Schema* schema_; @@ -100,8 +90,8 @@ class SchemaMessage::Impl { Message::Message() {} -Status Message::Open(const std::shared_ptr& buffer, - std::shared_ptr* out) { +Status Message::Open( + const std::shared_ptr& buffer, std::shared_ptr* out) { std::shared_ptr result(new Message()); // The buffer is prefixed by its size as int32_t @@ -128,12 +118,11 @@ std::shared_ptr Message::get_shared_ptr() { } std::shared_ptr Message::GetSchema() { - return std::make_shared(this->shared_from_this(), - impl_->header()); + return std::make_shared(this->shared_from_this(), impl_->header()); } -SchemaMessage::SchemaMessage(const std::shared_ptr& message, - const void* schema) { +SchemaMessage::SchemaMessage( + const std::shared_ptr& message, const void* schema) { message_ = message; impl_.reset(new Impl(schema)); } @@ -158,31 +147,21 @@ Status SchemaMessage::GetSchema(std::shared_ptr* out) const { class RecordBatchMessage::Impl { public: - explicit Impl(const void* batch) : - batch_(static_cast(batch)) { + explicit Impl(const void* batch) + : batch_(static_cast(batch)) { nodes_ = batch_->nodes(); buffers_ = batch_->buffers(); } - const flatbuf::FieldNode* field(int i) const { - return nodes_->Get(i); - } + const flatbuf::FieldNode* field(int i) const { return nodes_->Get(i); } - const flatbuf::Buffer* buffer(int i) const { - return buffers_->Get(i); - } + const flatbuf::Buffer* buffer(int i) const { return buffers_->Get(i); } - int32_t length() const { - return batch_->length(); - } + int32_t length() const { return batch_->length(); } - int num_buffers() const { - return batch_->buffers()->size(); - } + int num_buffers() const { return batch_->buffers()->size(); } - int num_fields() const { - return batch_->nodes()->size(); - } + int num_fields() const { return batch_->nodes()->size(); } private: const flatbuf::RecordBatch* batch_; @@ -191,12 +170,11 @@ class RecordBatchMessage::Impl { }; std::shared_ptr Message::GetRecordBatch() { - return std::make_shared(this->shared_from_this(), - impl_->header()); + return std::make_shared(this->shared_from_this(), impl_->header()); } -RecordBatchMessage::RecordBatchMessage(const std::shared_ptr& message, - const void* batch) { +RecordBatchMessage::RecordBatchMessage( + const std::shared_ptr& message, const void* batch) { message_ = message; impl_.reset(new Impl(batch)); } @@ -234,5 +212,5 @@ int RecordBatchMessage::num_fields() const { return impl_->num_fields(); } -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h index c7288529b9f..838a4a676ea 100644 --- a/cpp/src/arrow/ipc/metadata.h +++ b/cpp/src/arrow/ipc/metadata.h @@ -85,8 +85,7 @@ struct BufferMetadata { class RecordBatchMessage { public: // Accepts an opaque flatbuffer pointer - RecordBatchMessage(const std::shared_ptr& message, - const void* batch_meta); + RecordBatchMessage(const std::shared_ptr& message, const void* batch_meta); FieldMetadata field(int i) const; BufferMetadata buffer(int i) const; @@ -111,15 +110,10 @@ class DictionaryBatchMessage { class Message : public std::enable_shared_from_this { public: - enum Type { - NONE, - SCHEMA, - DICTIONARY_BATCH, - RECORD_BATCH - }; + enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH }; - static Status Open(const std::shared_ptr& buffer, - std::shared_ptr* out); + static Status Open( + const std::shared_ptr& buffer, std::shared_ptr* out); std::shared_ptr get_shared_ptr(); @@ -140,7 +134,7 @@ 
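// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch; a public Message::type()
// accessor is an assumption here -- only the Impl's type() switch is visible
// above): dispatching on the message kind after Message::Open, which expects
// the buffer to begin with its int32_t size prefix.
Status DispatchMessageSketch(const std::shared_ptr<Buffer>& buffer) {
  std::shared_ptr<Message> message;
  RETURN_NOT_OK(Message::Open(buffer, &message));
  switch (message->type()) {
    case Message::SCHEMA: {
      std::shared_ptr<SchemaMessage> schema_message = message->GetSchema();
      // ... hand off to schema handling ...
      return Status::OK();
    }
    case Message::RECORD_BATCH: {
      std::shared_ptr<RecordBatchMessage> batch_message = message->GetRecordBatch();
      // ... hand off to record batch handling ...
      return Status::OK();
    }
    default:
      return Status::NotImplemented("unhandled message type");
  }
}
// ---------------------------------------------------------------------------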
class Message : public std::enable_shared_from_this { std::unique_ptr impl_; }; -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow -#endif // ARROW_IPC_METADATA_H +#endif // ARROW_IPC_METADATA_H diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 0fccce94107..65c837dc8b1 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -36,9 +36,7 @@ class MemoryMapFixture { void CreateFile(const std::string path, int64_t size) { FILE* file = fopen(path.c_str(), "w"); - if (file != nullptr) { - tmp_files_.push_back(path); - } + if (file != nullptr) { tmp_files_.push_back(path); } ftruncate(fileno(file), size); fclose(file); } @@ -47,7 +45,7 @@ class MemoryMapFixture { std::vector tmp_files_; }; -} // namespace ipc -} // namespace arrow +} // namespace ipc +} // namespace arrow -#endif // ARROW_IPC_TEST_COMMON_H +#endif // ARROW_IPC_TEST_COMMON_H diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index a289ddbfde6..e2280f41189 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -45,8 +45,7 @@ const auto INT64 = std::make_shared(); const auto FLOAT = std::make_shared(); const auto DOUBLE = std::make_shared(); const auto UTF8 = std::make_shared(); -const auto BINARY = std::make_shared( - std::make_shared("", UINT8)); +const auto BINARY = std::make_shared(std::make_shared("", UINT8)); const auto DECIMAL_8_4 = std::make_shared(8, 4); class TestConvertParquetSchema : public ::testing::Test { @@ -58,8 +57,8 @@ class TestConvertParquetSchema : public ::testing::Test { for (int i = 0; i < expected_schema->num_fields(); ++i) { auto lhs = result_schema_->field(i); auto rhs = expected_schema->field(i); - EXPECT_TRUE(lhs->Equals(rhs)) - << i << " " << lhs->ToString() << " != " << rhs->ToString(); + EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString() + << " != " << rhs->ToString(); } } @@ -99,20 +98,15 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { arrow_fields.push_back(std::make_shared("double", DOUBLE)); parquet_fields.push_back( - PrimitiveNode::Make("binary", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY)); + PrimitiveNode::Make("binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY)); arrow_fields.push_back(std::make_shared("binary", BINARY)); - parquet_fields.push_back( - PrimitiveNode::Make("string", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, - LogicalType::UTF8)); + parquet_fields.push_back(PrimitiveNode::Make( + "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); arrow_fields.push_back(std::make_shared("string", UTF8)); - parquet_fields.push_back( - PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, - ParquetType::FIXED_LEN_BYTE_ARRAY, - LogicalType::NONE, 12)); + parquet_fields.push_back(PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, + ParquetType::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, 12)); arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); auto arrow_schema = std::make_shared(arrow_fields); @@ -125,28 +119,20 @@ TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { std::vector parquet_fields; std::vector> arrow_fields; - parquet_fields.push_back( - PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL, - ParquetType::FIXED_LEN_BYTE_ARRAY, - LogicalType::DECIMAL, 4, 8, 4)); + parquet_fields.push_back(PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL, + ParquetType::FIXED_LEN_BYTE_ARRAY, 
LogicalType::DECIMAL, 4, 8, 4)); arrow_fields.push_back(std::make_shared("flba-decimal", DECIMAL_8_4)); - parquet_fields.push_back( - PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, - LogicalType::DECIMAL, -1, 8, 4)); + parquet_fields.push_back(PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, LogicalType::DECIMAL, -1, 8, 4)); arrow_fields.push_back(std::make_shared("binary-decimal", DECIMAL_8_4)); - parquet_fields.push_back( - PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL, - ParquetType::INT32, - LogicalType::DECIMAL, -1, 8, 4)); + parquet_fields.push_back(PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL, + ParquetType::INT32, LogicalType::DECIMAL, -1, 8, 4)); arrow_fields.push_back(std::make_shared("int32-decimal", DECIMAL_8_4)); - parquet_fields.push_back( - PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL, - ParquetType::INT64, - LogicalType::DECIMAL, -1, 8, 4)); + parquet_fields.push_back(PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL, + ParquetType::INT64, LogicalType::DECIMAL, -1, 8, 4)); arrow_fields.push_back(std::make_shared("int64-decimal", DECIMAL_8_4)); auto arrow_schema = std::make_shared(arrow_fields); @@ -164,22 +150,19 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) { unsupported_nodes.push_back( GroupNode::Make("repeated-group", Repetition::REPEATED, {})); - unsupported_nodes.push_back( - PrimitiveNode::Make("int32", Repetition::OPTIONAL, - ParquetType::INT32, LogicalType::DATE)); + unsupported_nodes.push_back(PrimitiveNode::Make( + "int32", Repetition::OPTIONAL, ParquetType::INT32, LogicalType::DATE)); - unsupported_nodes.push_back( - PrimitiveNode::Make("int64", Repetition::OPTIONAL, - ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); + unsupported_nodes.push_back(PrimitiveNode::Make( + "int64", Repetition::OPTIONAL, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); for (const NodePtr& node : unsupported_nodes) { ASSERT_RAISES(NotImplemented, ConvertSchema({node})); } } -TEST(TestNodeConversion, DateAndTime) { -} +TEST(TestNodeConversion, DateAndTime) {} -} // namespace parquet +} // namespace parquet -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 14f4f5be53c..066388b4d0e 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -43,8 +43,7 @@ const auto INT64 = std::make_shared(); const auto FLOAT = std::make_shared(); const auto DOUBLE = std::make_shared(); const auto UTF8 = std::make_shared(); -const auto BINARY = std::make_shared( - std::make_shared("", UINT8)); +const auto BINARY = std::make_shared(std::make_shared("", UINT8)); TypePtr MakeDecimalType(const PrimitiveNode* node) { int precision = node->decimal_metadata().precision; @@ -167,12 +166,12 @@ Status NodeToField(const NodePtr& node, std::shared_ptr* out) { return Status::OK(); } -Status FromParquetSchema(const ::parquet::SchemaDescriptor* parquet_schema, - std::shared_ptr* out) { +Status FromParquetSchema( + const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out) { // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node - const GroupNode* schema_node = static_cast( - parquet_schema->schema().get()); + const GroupNode* schema_node = + static_cast(parquet_schema->schema().get()); std::vector> fields(schema_node->field_count()); for (int i = 0; i < schema_node->field_count(); i++) { @@ -183,6 +182,6 @@ 
Status FromParquetSchema(const ::parquet::SchemaDescriptor* parquet_schema, return Status::OK(); } -} // namespace parquet +} // namespace parquet -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index a8408970ede..a44a9a4b6a8 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -31,14 +31,13 @@ class Status; namespace parquet { -Status NodeToField(const ::parquet::schema::NodePtr& node, - std::shared_ptr* out); +Status NodeToField(const ::parquet::schema::NodePtr& node, std::shared_ptr* out); -Status FromParquetSchema(const ::parquet::SchemaDescriptor* parquet_schema, - std::shared_ptr* out); +Status FromParquetSchema( + const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out); -} // namespace parquet +} // namespace parquet -} // namespace arrow +} // namespace arrow #endif diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc index a1de1dc5ac8..8cc80be120a 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -86,8 +86,8 @@ TEST_F(TestSchema, ToString) { auto f0 = std::make_shared("f0", INT32); auto f1 = std::make_shared("f1", std::make_shared(), false); auto f2 = std::make_shared("f2", std::make_shared()); - auto f3 = std::make_shared("f3", - std::make_shared(std::make_shared())); + auto f3 = std::make_shared( + "f3", std::make_shared(std::make_shared())); vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); @@ -101,4 +101,4 @@ f3: list)"; ASSERT_EQ(expected, result); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc index 18aad0e806f..a38acaa94ba 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -26,18 +26,14 @@ namespace arrow { -Schema::Schema(const std::vector>& fields) : - fields_(fields) {} +Schema::Schema(const std::vector>& fields) : fields_(fields) {} bool Schema::Equals(const Schema& other) const { - if (this == &other) return true; - if (num_fields() != other.num_fields()) { - return false; - } + if (this == &other) { return true; } + + if (num_fields() != other.num_fields()) { return false; } for (int i = 0; i < num_fields(); ++i) { - if (!field(i)->Equals(*other.field(i).get())) { - return false; - } + if (!field(i)->Equals(*other.field(i).get())) { return false; } } return true; } @@ -51,13 +47,11 @@ std::string Schema::ToString() const { int i = 0; for (auto field : fields_) { - if (i > 0) { - buffer << std::endl; - } + if (i > 0) { buffer << std::endl; } buffer << field->ToString(); ++i; } return buffer.str(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h index 52f3c1ceae4..a8b0d8444ac 100644 --- a/cpp/src/arrow/schema.h +++ b/cpp/src/arrow/schema.h @@ -35,21 +35,17 @@ class Schema { bool Equals(const std::shared_ptr& other) const; // Return the ith schema element. 
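// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch; <iostream> assumed): the
// conversion entry point declared above, paired with Schema::ToString(),
// whose "name: type" line format is pinned down by schema-test.cc below.
Status DumpConvertedSchemaSketch(const ::parquet::SchemaDescriptor* descriptor) {
  std::shared_ptr<Schema> schema;
  RETURN_NOT_OK(FromParquetSchema(descriptor, &schema));
  std::cout << schema->ToString() << std::endl;  // e.g. "f0: int32" per field
  return Status::OK();
}
// ---------------------------------------------------------------------------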
Does not boundscheck - const std::shared_ptr& field(int i) const { - return fields_[i]; - } + const std::shared_ptr& field(int i) const { return fields_[i]; } // Render a string representation of the schema suitable for debugging std::string ToString() const; - int num_fields() const { - return fields_.size(); - } + int num_fields() const { return fields_.size(); } private: std::vector> fields_; }; -} // namespace arrow +} // namespace arrow #endif // ARROW_FIELD_H diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 4c7b8f80486..385e7d83150 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -49,10 +49,9 @@ class TestTable : public TestBase { schema_ = std::make_shared(fields); columns_ = { - std::make_shared(schema_->field(0), MakePrimitive(length)), - std::make_shared(schema_->field(1), MakePrimitive(length)), - std::make_shared(schema_->field(2), MakePrimitive(length)) - }; + std::make_shared(schema_->field(0), MakePrimitive(length)), + std::make_shared(schema_->field(1), MakePrimitive(length)), + std::make_shared(schema_->field(2), MakePrimitive(length))}; } protected: @@ -116,13 +115,12 @@ TEST_F(TestTable, InvalidColumns) { ASSERT_RAISES(Invalid, table_->ValidateColumns()); columns_ = { - std::make_shared(schema_->field(0), MakePrimitive(length)), - std::make_shared(schema_->field(1), MakePrimitive(length)), - std::make_shared(schema_->field(2), MakePrimitive(length - 1)) - }; + std::make_shared(schema_->field(0), MakePrimitive(length)), + std::make_shared(schema_->field(1), MakePrimitive(length)), + std::make_shared(schema_->field(2), MakePrimitive(length - 1))}; table_.reset(new Table("data", schema_, columns_, length)); ASSERT_RAISES(Invalid, table_->ValidateColumns()); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index e405c1d508c..d9573eae74d 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -28,20 +28,16 @@ namespace arrow { RowBatch::RowBatch(const std::shared_ptr& schema, int num_rows, - const std::vector>& columns) : - schema_(schema), - num_rows_(num_rows), - columns_(columns) {} + const std::vector>& columns) + : schema_(schema), num_rows_(num_rows), columns_(columns) {} const std::string& RowBatch::column_name(int i) const { return schema_->field(i)->name; } Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector>& columns) : - name_(name), - schema_(schema), - columns_(columns) { + const std::vector>& columns) + : name_(name), schema_(schema), columns_(columns) { if (columns.size() == 0) { num_rows_ = 0; } else { @@ -50,11 +46,8 @@ Table::Table(const std::string& name, const std::shared_ptr& schema, } Table::Table(const std::string& name, const std::shared_ptr& schema, - const std::vector>& columns, int64_t num_rows) : - name_(name), - schema_(schema), - columns_(columns), - num_rows_(num_rows) {} + const std::vector>& columns, int64_t num_rows) + : name_(name), schema_(schema), columns_(columns), num_rows_(num_rows) {} Status Table::ValidateColumns() const { if (num_columns() != schema_->num_fields()) { @@ -66,21 +59,17 @@ Status Table::ValidateColumns() const { const Column* col = columns_[i].get(); if (col == nullptr) { std::stringstream ss; - ss << "Column " << i << " named " << col->name() - << " was null"; + ss << "Column " << i << " was null"; return Status::Invalid(ss.str()); } if (col->length() != num_rows_) { std::stringstream ss; - ss << "Column " << i << " named " << col->name() - << " expected 
length " - << num_rows_ - << " but got length " - << col->length(); + ss << "Column " << i << " named " << col->name() << " expected length " << num_rows_ + << " but got length " << col->length(); return Status::Invalid(ss.str()); } } return Status::OK(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index e2f73a2eedd..756b2a19593 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -42,27 +42,19 @@ class RowBatch { const std::vector>& columns); // @returns: the table's schema - const std::shared_ptr& schema() const { - return schema_; - } + const std::shared_ptr& schema() const { return schema_; } // @returns: the i-th column // Note: Does not boundscheck - const std::shared_ptr& column(int i) const { - return columns_[i]; - } + const std::shared_ptr& column(int i) const { return columns_[i]; } const std::string& column_name(int i) const; // @returns: the number of columns in the table - int num_columns() const { - return columns_.size(); - } + int num_columns() const { return columns_.size(); } // @returns: the number of rows (the corresponding length of each column) - int64_t num_rows() const { - return num_rows_; - } + int64_t num_rows() const { return num_rows_; } private: std::shared_ptr schema_; @@ -85,30 +77,20 @@ class Table { const std::vector>& columns, int64_t num_rows); // @returns: the table's name, if any (may be length 0) - const std::string& name() const { - return name_; - } + const std::string& name() const { return name_; } // @returns: the table's schema - const std::shared_ptr& schema() const { - return schema_; - } + const std::shared_ptr& schema() const { return schema_; } // Note: Does not boundscheck // @returns: the i-th column - const std::shared_ptr& column(int i) const { - return columns_[i]; - } + const std::shared_ptr& column(int i) const { return columns_[i]; } // @returns: the number of columns in the table - int num_columns() const { - return columns_.size(); - } + int num_columns() const { return columns_.size(); } // @returns: the number of rows (the corresponding length of each column) - int64_t num_rows() const { - return num_rows_; - } + int64_t num_rows() const { return num_rows_; } // After construction, perform any checks to validate the input arguments Status ValidateColumns() const; @@ -123,6 +105,6 @@ class Table { int64_t num_rows_; }; -} // namespace arrow +} // namespace arrow #endif // ARROW_TABLE_H diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index b2bce269992..538d9b233d9 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -36,38 +36,29 @@ #include "arrow/util/random.h" #include "arrow/util/status.h" -#define ASSERT_RAISES(ENUM, expr) \ - do { \ - Status s = (expr); \ - if (!s.Is##ENUM()) { \ - FAIL() << s.ToString(); \ - } \ +#define ASSERT_RAISES(ENUM, expr) \ + do { \ + Status s = (expr); \ + if (!s.Is##ENUM()) { FAIL() << s.ToString(); } \ } while (0) - -#define ASSERT_OK(expr) \ - do { \ - Status s = (expr); \ - if (!s.ok()) { \ - FAIL() << s.ToString(); \ - } \ +#define ASSERT_OK(expr) \ + do { \ + Status s = (expr); \ + if (!s.ok()) { FAIL() << s.ToString(); } \ } while (0) - -#define EXPECT_OK(expr) \ - do { \ - Status s = (expr); \ - EXPECT_TRUE(s.ok()); \ +#define EXPECT_OK(expr) \ + do { \ + Status s = (expr); \ + EXPECT_TRUE(s.ok()); \ } while (0) - namespace arrow { class TestBase : public ::testing::Test { public: - void SetUp() { - pool_ = default_memory_pool(); - } + void SetUp() { pool_ = 
default_memory_pool(); } template std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { @@ -97,10 +88,8 @@ void randint(int64_t N, T lower, T upper, std::vector* out) { } } - template -void random_real(int n, uint32_t seed, T min_value, T max_value, - std::vector* out) { +void random_real(int n, uint32_t seed, T min_value, T max_value, std::vector* out) { std::mt19937 gen(seed); std::uniform_real_distribution d(min_value, max_value); for (int i = 0; i < n; ++i) { @@ -108,11 +97,10 @@ void random_real(int n, uint32_t seed, T min_value, T max_value, } } - template std::shared_ptr to_buffer(const std::vector& values) { - return std::make_shared(reinterpret_cast(values.data()), - values.size() * sizeof(T)); + return std::make_shared( + reinterpret_cast(values.data()), values.size() * sizeof(T)); } void random_null_bitmap(int64_t n, double pct_null, uint8_t* null_bitmap) { @@ -143,8 +131,8 @@ void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { static inline int bitmap_popcount(const uint8_t* data, int length) { int count = 0; for (int i = 0; i < length; ++i) { - // TODO: accelerate this - if (util::get_bit(data, i)) ++count; + // TODO(wesm): accelerate this + if (util::get_bit(data, i)) { ++count; } } return count; } @@ -152,9 +140,7 @@ static inline int bitmap_popcount(const uint8_t* data, int length) { static inline int null_count(const std::vector& valid_bytes) { int result = 0; for (size_t i = 0; i < valid_bytes.size(); ++i) { - if (valid_bytes[i] == 0) { - ++result; - } + if (valid_bytes[i] == 0) { ++result; } } return result; } @@ -167,7 +153,7 @@ std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) return out; } -} // namespace test -} // namespace arrow +} // namespace test +} // namespace arrow -#endif // ARROW_TEST_UTIL_H_ +#endif // ARROW_TEST_UTIL_H_ diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f7f835e96a7..4e686d9cf4a 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -25,9 +25,7 @@ namespace arrow { std::string Field::ToString() const { std::stringstream ss; ss << this->name << ": " << this->type->ToString(); - if (!this->nullable) { - ss << " not null"; - } + if (!this->nullable) { ss << " not null"; } return ss.str(); } @@ -50,7 +48,7 @@ std::string StructType::ToString() const { std::stringstream s; s << "struct<"; for (int i = 0; i < this->num_children(); ++i) { - if (i > 0) s << ", "; + if (i > 0) { s << ", "; } const std::shared_ptr& field = this->child(i); s << field->name << ": " << field->type->ToString(); } @@ -58,4 +56,4 @@ std::string StructType::ToString() const { return s.str(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 86e47791b7c..051ab46b199 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -110,8 +110,7 @@ struct DataType { std::vector> children_; - explicit DataType(Type::type type) : - type(type) {} + explicit DataType(Type::type type) : type(type) {} virtual ~DataType(); @@ -120,21 +119,13 @@ struct DataType { return this == other || (this->type == other->type); } - bool Equals(const std::shared_ptr& other) { - return Equals(other.get()); - } + bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } - const std::shared_ptr& child(int i) const { - return children_[i]; - } + const std::shared_ptr& child(int i) const { return children_[i]; } - int num_children() const { - return children_.size(); - } + int num_children() const { return children_.size(); } - virtual int 
value_size() const { - return -1; - } + virtual int value_size() const { return -1; } virtual std::string ToString() const = 0; }; @@ -153,28 +144,20 @@ struct Field { // Fields can be nullable bool nullable; - Field(const std::string& name, const TypePtr& type, bool nullable = true) : - name(name), - type(type), - nullable(nullable) {} + Field(const std::string& name, const TypePtr& type, bool nullable = true) + : name(name), type(type), nullable(nullable) {} - bool operator==(const Field& other) const { - return this->Equals(other); - } + bool operator==(const Field& other) const { return this->Equals(other); } - bool operator!=(const Field& other) const { - return !this->Equals(other); - } + bool operator!=(const Field& other) const { return !this->Equals(other); } bool Equals(const Field& other) const { - return (this == &other) || (this->name == other.name && - this->nullable == other.nullable && - this->type->Equals(other.type.get())); + return (this == &other) || + (this->name == other.name && this->nullable == other.nullable && + this->type->Equals(other.type.get())); } - bool Equals(const std::shared_ptr& other) const { - return Equals(*other.get()); - } + bool Equals(const std::shared_ptr& other) const { return Equals(*other.get()); } std::string ToString() const; }; @@ -192,20 +175,15 @@ inline std::string PrimitiveType::ToString() const { return result; } -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr Type::type type_enum = Type::ENUM; \ - \ - TYPENAME() \ - : PrimitiveType() {} \ - \ - virtual int value_size() const { \ - return SIZE; \ - } \ - \ - static const char* name() { \ - return NAME; \ - } +#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ + typedef C_TYPE c_type; \ + static constexpr Type::type type_enum = Type::ENUM; \ + \ + TYPENAME() : PrimitiveType() {} \ + \ + virtual int value_size() const { return SIZE; } \ + \ + static const char* name() { return NAME; } struct NullType : public PrimitiveType { PRIMITIVE_DECL(NullType, void, NA, 0, "null"); @@ -257,27 +235,19 @@ struct DoubleType : public PrimitiveType { struct ListType : public DataType { // List can contain any other logical value type - explicit ListType(const std::shared_ptr& value_type) - : DataType(Type::LIST) { + explicit ListType(const std::shared_ptr& value_type) : DataType(Type::LIST) { children_ = {std::make_shared("item", value_type)}; } - explicit ListType(const std::shared_ptr& value_field) - : DataType(Type::LIST) { + explicit ListType(const std::shared_ptr& value_field) : DataType(Type::LIST) { children_ = {value_field}; } - const std::shared_ptr& value_field() const { - return children_[0]; - } + const std::shared_ptr& value_field() const { return children_[0]; } - const std::shared_ptr& value_type() const { - return children_[0]->type; - } + const std::shared_ptr& value_type() const { return children_[0]->type; } - static char const *name() { - return "list"; - } + static char const* name() { return "list"; } std::string ToString() const override; }; @@ -286,9 +256,7 @@ struct ListType : public DataType { struct StringType : public DataType { StringType(); - static char const *name() { - return "string"; - } + static char const* name() { return "string"; } std::string ToString() const override; }; @@ -304,10 +272,8 @@ struct StructType : public DataType { // These will be defined elsewhere template -struct type_traits { -}; - +struct type_traits {}; -} // namespace arrow +} // namespace arrow #endif // ARROW_TYPE_H diff 
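// ---------------------------------------------------------------------------
// Illustrative expansion (editor's note, not part of the patch): substituting
// the arguments NullType uses above, PRIMITIVE_DECL(NullType, void, NA, 0,
// "null") produces
//   typedef void c_type;
//   static constexpr Type::type type_enum = Type::NA;
//   NullType() : PrimitiveType() {}
//   virtual int value_size() const { return 0; }
//   static const char* name() { return "null"; }
// (any template argument on PrimitiveType was lost in this copy of the patch,
// so it is omitted here as well).
// ---------------------------------------------------------------------------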
--git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h index 1fd675e5fde..201fbb6e795 100644 --- a/cpp/src/arrow/types/binary.h +++ b/cpp/src/arrow/types/binary.h @@ -23,8 +23,6 @@ #include "arrow/type.h" -namespace arrow { +namespace arrow {} // namespace arrow -} // namespace arrow - -#endif // ARROW_TYPES_BINARY_H +#endif // ARROW_TYPES_BINARY_H diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h index 46d84f1f183..1712030203f 100644 --- a/cpp/src/arrow/types/collection.h +++ b/cpp/src/arrow/types/collection.h @@ -31,15 +31,11 @@ struct CollectionType : public DataType { CollectionType() : DataType(T) {} - const TypePtr& child(int i) const { - return child_types_[i]; - } + const TypePtr& child(int i) const { return child_types_[i]; } - int num_children() const { - return child_types_.size(); - } + int num_children() const { return child_types_.size(); } }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_COLLECTION_H +#endif // ARROW_TYPES_COLLECTION_H diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 34647a5005b..0a30929b97c 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -30,10 +30,10 @@ namespace arrow { class ArrayBuilder; -#define BUILDER_CASE(ENUM, BuilderType) \ - case Type::ENUM: \ - out->reset(new BuilderType(pool, type)); \ - return Status::OK(); +#define BUILDER_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + out->reset(new BuilderType(pool, type)); \ + return Status::OK(); // Initially looked at doing this with vtables, but shared pointers makes it // difficult @@ -58,30 +58,28 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(STRING, StringBuilder); - case Type::LIST: - { - std::shared_ptr value_builder; + case Type::LIST: { + std::shared_ptr value_builder; - const std::shared_ptr& value_type = static_cast( - type.get())->value_type(); - RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); - out->reset(new ListBuilder(pool, type, value_builder)); - return Status::OK(); - } + const std::shared_ptr& value_type = + static_cast(type.get())->value_type(); + RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); + out->reset(new ListBuilder(pool, type, value_builder)); + return Status::OK(); + } default: return Status::NotImplemented(type->ToString()); } } -#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ - case Type::ENUM: \ - out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ - return Status::OK(); +#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ + case Type::ENUM: \ + out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ + return Status::OK(); -Status MakePrimitiveArray(const std::shared_ptr& type, - int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap, - std::shared_ptr* out) { +Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& data, int32_t null_count, + const std::shared_ptr& null_bitmap, std::shared_ptr* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); @@ -99,4 +97,4 @@ Status MakePrimitiveArray(const std::shared_ptr& type, } } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 228faeccc4e..27fb7bd2149 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -33,11 
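// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch; the Int32Type name is an
// assumption): the Type::LIST branch of MakeBuilder above recurses through
// the value type, so building a list<int32> builder nests an int32 builder.
Status MakeListBuilderSketch(MemoryPool* pool, std::shared_ptr<ArrayBuilder>* out) {
  auto value_type = std::make_shared<Int32Type>();
  auto list_type = std::make_shared<ListType>(value_type);
  return MakeBuilder(pool, list_type, out);  // dispatches to the Type::LIST case
}
// ---------------------------------------------------------------------------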
+33,10 @@ class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); -Status MakePrimitiveArray(const std::shared_ptr& type, - int32_t length, const std::shared_ptr& data, - int32_t null_count, const std::shared_ptr& null_bitmap, - std::shared_ptr* out); +Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& data, int32_t null_count, + const std::shared_ptr& null_bitmap, std::shared_ptr* out); -} // namespace arrow +} // namespace arrow -#endif // ARROW_BUILDER_H_ +#endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index e57b66ab46a..b782455546c 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -23,49 +23,30 @@ namespace arrow { struct DateType : public DataType { - enum class Unit: char { - DAY = 0, - MONTH = 1, - YEAR = 2 - }; + enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; Unit unit; - explicit DateType(Unit unit = Unit::DAY) - : DataType(Type::DATE), - unit(unit) {} + explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {} - DateType(const DateType& other) - : DateType(other.unit) {} + DateType(const DateType& other) : DateType(other.unit) {} - static char const *name() { - return "date"; - } + static char const* name() { return "date"; } }; - struct TimestampType : public DataType { - enum class Unit: char { - SECOND = 0, - MILLI = 1, - MICRO = 2, - NANO = 3 - }; + enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; Unit unit; explicit TimestampType(Unit unit = Unit::MILLI) - : DataType(Type::TIMESTAMP), - unit(unit) {} + : DataType(Type::TIMESTAMP), unit(unit) {} - TimestampType(const TimestampType& other) - : TimestampType(other.unit) {} + TimestampType(const TimestampType& other) : TimestampType(other.unit) {} - static char const *name() { - return "timestamp"; - } + static char const* name() { return "timestamp"; } }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_DATETIME_H +#endif // ARROW_TYPES_DATETIME_H diff --git a/cpp/src/arrow/types/decimal-test.cc b/cpp/src/arrow/types/decimal-test.cc index 89896c8b425..7296ff81761 100644 --- a/cpp/src/arrow/types/decimal-test.cc +++ b/cpp/src/arrow/types/decimal-test.cc @@ -37,4 +37,4 @@ TEST(TypesTest, TestDecimalType) { ASSERT_EQ(t2.scale, 4); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc index f120c1a9dfd..1d9a5e50e46 100644 --- a/cpp/src/arrow/types/decimal.cc +++ b/cpp/src/arrow/types/decimal.cc @@ -28,5 +28,4 @@ std::string DecimalType::ToString() const { return s.str(); } -} // namespace arrow - +} // namespace arrow diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 26243b42b0e..1be489d4f51 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -26,18 +26,15 @@ namespace arrow { struct DecimalType : public DataType { explicit DecimalType(int precision_, int scale_) - : DataType(Type::DECIMAL), precision(precision_), - scale(scale_) { } + : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} int precision; int scale; - static char const *name() { - return "decimal"; - } + static char const* name() { return "decimal"; } std::string ToString() const override; }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_DECIMAL_H +#endif // ARROW_TYPES_DECIMAL_H diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index 
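// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the defaulted constructor
// arguments on the datetime types above make both spellings valid.
TimestampType ts_millis;                            // defaults to Unit::MILLI
TimestampType ts_nanos(TimestampType::Unit::NANO);  // explicit unit
DateType date_days;                                 // defaults to Unit::DAY
// ---------------------------------------------------------------------------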
fb731edd607..a4e0d085620 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -30,9 +30,8 @@ static const TypePtr String(new StringType()); static const TypePtr Double(new DoubleType()); static const TypePtr Bool(new BooleanType()); -static const std::vector json_types = {Null, Int32, String, - Double, Bool}; +static const std::vector json_types = {Null, Int32, String, Double, Bool}; TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(json_types)); TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(json_types)); -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h index 9c850afac0a..9de961f79a6 100644 --- a/cpp/src/arrow/types/json.h +++ b/cpp/src/arrow/types/json.h @@ -28,11 +28,9 @@ struct JSONScalar : public DataType { static TypePtr dense_type; static TypePtr sparse_type; - explicit JSONScalar(bool dense = true) - : DataType(Type::JSON_SCALAR), - dense(dense) {} + explicit JSONScalar(bool dense = true) : DataType(Type::JSON_SCALAR), dense(dense) {} }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_JSON_H +#endif // ARROW_TYPES_JSON_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 4eb560ea522..aa34f23cc02 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -76,9 +76,7 @@ class TestListBuilder : public TestBuilder { builder_ = std::dynamic_pointer_cast(tmp); } - void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); - } + void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: TypePtr value_type_; @@ -88,9 +86,7 @@ class TestListBuilder : public TestBuilder { shared_ptr result_; }; - -TEST_F(TestListBuilder, TestResize) { -} +TEST_F(TestListBuilder, TestResize) {} TEST_F(TestListBuilder, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); @@ -155,5 +151,4 @@ TEST_F(TestListBuilder, TestZeroLength) { Done(); } - -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index d64c06d90c1..23f12ddc4ec 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -20,32 +20,25 @@ namespace arrow { bool ListArray::EqualsExact(const ListArray& other) const { - if (this == &other) return true; - if (null_count_ != other.null_count_) { - return false; - } + if (this == &other) { return true; } + if (null_count_ != other.null_count_) { return false; } - bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, - length_ + 1); + bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, length_ + 1); bool equal_null_bitmap = true; if (null_count_ > 0) { - equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, - util::bytes_for_bits(length_)); + equal_null_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } - if (!(equal_offsets && equal_null_bitmap)) { - return false; - } + if (!(equal_offsets && equal_null_bitmap)) { return false; } return values()->Equals(other.values()); } bool ListArray::Equals(const std::shared_ptr& arr) const { - if (this == arr.get()) return true; - if (this->type_enum() != arr->type_enum()) { - return false; - } + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 8073b512176..6b815460ecb 100644 --- 
a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -37,13 +37,12 @@ class MemoryPool; class ListArray : public Array { public: ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, - const ArrayPtr& values, - int32_t null_count = 0, - std::shared_ptr null_bitmap = nullptr) : - Array(type, length, null_count, null_bitmap) { + const ArrayPtr& values, int32_t null_count = 0, + std::shared_ptr null_bitmap = nullptr) + : Array(type, length, null_count, null_bitmap) { offset_buf_ = offsets; - offsets_ = offsets == nullptr? nullptr : - reinterpret_cast(offset_buf_->data()); + offsets_ = offsets == nullptr ? nullptr + : reinterpret_cast(offset_buf_->data()); values_ = values; } @@ -51,19 +50,17 @@ class ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. - const std::shared_ptr& values() const {return values_;} + const std::shared_ptr& values() const { return values_; } - const std::shared_ptr& value_type() const { - return values_->type(); - } + const std::shared_ptr& value_type() const { return values_->type(); } - const int32_t* offsets() const { return offsets_;} + const int32_t* offsets() const { return offsets_; } - int32_t offset(int i) const { return offsets_[i];} + int32_t offset(int i) const { return offsets_[i]; } // Neither of these functions will perform boundschecking - int32_t value_offset(int i) { return offsets_[i];} - int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i];} + int32_t value_offset(int i) { return offsets_[i]; } + int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; } bool EqualsExact(const ListArray& other) const; bool Equals(const std::shared_ptr& arr) const override; @@ -77,7 +74,6 @@ class ListArray : public Array { // ---------------------------------------------------------------------- // Array builder - // Builder class for variable-length list array value types // // To use this class, you must append values to the child array builder and use @@ -85,10 +81,9 @@ class ListArray : public Array { // have been appended to the child array) class ListBuilder : public Int32Builder { public: - ListBuilder(MemoryPool* pool, const TypePtr& type, - std::shared_ptr value_builder) - : Int32Builder(pool, type), - value_builder_(value_builder) {} + ListBuilder( + MemoryPool* pool, const TypePtr& type, std::shared_ptr value_builder) + : Int32Builder(pool, type), value_builder_(value_builder) {} Status Init(int32_t elements) { // One more than requested. 
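// [Editor's aside] A minimal usage sketch for the ListBuilder API described in
// the class comment above; the variable names (pool, int32_type, list_type)
// are illustrative assumptions, not part of this patch:
//
//   auto value_builder = std::make_shared<Int32Builder>(pool, int32_type);
//   ListBuilder list_builder(pool, list_type, value_builder);
//   list_builder.Append();                // open list slot 0
//   value_builder->Append(1);             // children of slot 0
//   value_builder->Append(2);             // slot 0 is now [1, 2]
//   list_builder.AppendNull();            // slot 1 is null
//   auto array = list_builder.Finish();   // writes the final offset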
@@ -116,12 +111,9 @@ class ListBuilder : public Int32Builder { int32_t new_capacity = util::next_power2(length_ + length); RETURN_NOT_OK(Resize(new_capacity)); } - memcpy(raw_data_ + length_, values, - type_traits::bytes_required(length)); + memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); - if (valid_bytes != nullptr) { - AppendNulls(valid_bytes, length); - } + if (valid_bytes != nullptr) { AppendNulls(valid_bytes, length); } length_ += length; return Status::OK(); @@ -132,12 +124,10 @@ class ListBuilder : public Int32Builder { std::shared_ptr items = value_builder_->Finish(); // Add final offset if the length is non-zero - if (length_) { - raw_data_[length_] = items->length(); - } + if (length_) { raw_data_[length_] = items->length(); } - auto result = std::make_shared(type_, length_, data_, items, - null_count_, null_bitmap_); + auto result = std::make_shared( + type_, length_, data_, items, null_count_, null_bitmap_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -145,9 +135,7 @@ class ListBuilder : public Int32Builder { return result; } - std::shared_ptr Finish() override { - return Transfer(); - } + std::shared_ptr Finish() override { return Transfer(); } // Start a new variable-length list slot // @@ -167,19 +155,14 @@ class ListBuilder : public Int32Builder { return Status::OK(); } - Status AppendNull() { - return Append(true); - } + Status AppendNull() { return Append(true); } - const std::shared_ptr& value_builder() const { - return value_builder_; - } + const std::shared_ptr& value_builder() const { return value_builder_; } protected: std::shared_ptr value_builder_; }; +} // namespace arrow -} // namespace arrow - -#endif // ARROW_TYPES_LIST_H +#endif // ARROW_TYPES_LIST_H diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 761845d9381..6bd9e73eb46 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -41,15 +41,15 @@ namespace arrow { class Array; -#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ - TEST(TypesTest, TestPrimitive_##ENUM) { \ - KLASS tp; \ - \ - ASSERT_EQ(tp.type, Type::ENUM); \ - ASSERT_EQ(tp.name(), string(NAME)); \ - \ - KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, Type::ENUM); \ +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, TestPrimitive_##ENUM) { \ + KLASS tp; \ + \ + ASSERT_EQ(tp.type, Type::ENUM); \ + ASSERT_EQ(tp.name(), string(NAME)); \ + \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -108,8 +108,8 @@ class TestPrimitiveBuilder : public TestBuilder { void Check(const std::shared_ptr& builder, bool nullable) { int size = builder->length(); - auto ex_data = std::make_shared(reinterpret_cast(draws_.data()), - size * sizeof(T)); + auto ex_data = std::make_shared( + reinterpret_cast(draws_.data()), size * sizeof(T)); std::shared_ptr ex_null_bitmap; int32_t ex_null_count = 0; @@ -121,10 +121,10 @@ class TestPrimitiveBuilder : public TestBuilder { ex_null_bitmap = nullptr; } - auto expected = std::make_shared(size, ex_data, ex_null_count, - ex_null_bitmap); - std::shared_ptr result = std::dynamic_pointer_cast( - builder->Finish()); + auto expected = + std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); + std::shared_ptr result = + std::dynamic_pointer_cast(builder->Finish()); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -145,30 +145,30 @@ class TestPrimitiveBuilder : public TestBuilder { vector valid_bytes_; }; 
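// [Editor's aside] The "Builder is now reset" assertions above pin down a
// useful contract: Finish() moves the accumulated buffers into the returned
// Array and zeroes the builder's length, capacity and null count, so one
// builder can produce several independent arrays in a row. Sketch with
// assumed names, not from the patch:
//
//   UInt8Builder b(pool, type);
//   b.Append(values, 100);
//   auto a1 = b.Finish();    // b is empty again
//   b.Append(values, 100);   // fine: building a second array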
-#define PTYPE_DECL(CapType, c_type) \ - typedef CapType##Array ArrayType; \ - typedef CapType##Builder BuilderType; \ - typedef CapType##Type Type; \ - typedef c_type T; \ - \ - static std::shared_ptr type() { \ - return std::shared_ptr(new Type()); \ +#define PTYPE_DECL(CapType, c_type) \ + typedef CapType##Array ArrayType; \ + typedef CapType##Builder BuilderType; \ + typedef CapType##Type Type; \ + typedef c_type T; \ + \ + static std::shared_ptr type() { \ + return std::shared_ptr(new Type()); \ } -#define PINT_DECL(CapType, c_type, LOWER, UPPER) \ - struct P##CapType { \ - PTYPE_DECL(CapType, c_type); \ - static void draw(int N, vector* draws) { \ - test::randint(N, LOWER, UPPER, draws); \ - } \ +#define PINT_DECL(CapType, c_type, LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int N, vector* draws) { \ + test::randint(N, LOWER, UPPER, draws); \ + } \ } -#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ - struct P##CapType { \ - PTYPE_DECL(CapType, c_type); \ - static void draw(int N, vector* draws) { \ - test::random_real(N, 0, LOWER, UPPER, draws); \ - } \ +#define PFLOAT_DECL(CapType, c_type, LOWER, UPPER) \ + struct P##CapType { \ + PTYPE_DECL(CapType, c_type); \ + static void draw(int N, vector* draws) { \ + test::random_real(N, 0, LOWER, UPPER, draws); \ + } \ } PINT_DECL(UInt8, uint8_t, 0, UINT8_MAX); @@ -214,10 +214,10 @@ void TestPrimitiveBuilder::Check( ex_null_bitmap = nullptr; } - auto expected = std::make_shared(size, ex_data, ex_null_count, - ex_null_bitmap); - std::shared_ptr result = std::dynamic_pointer_cast( - builder->Finish()); + auto expected = + std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); + std::shared_ptr result = + std::dynamic_pointer_cast(builder->Finish()); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -230,31 +230,23 @@ void TestPrimitiveBuilder::Check( ASSERT_EQ(expected->length(), result->length()); for (int i = 0; i < result->length(); ++i) { - if (nullable) { - ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; - } + if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } bool actual = util::get_bit(result->raw_data(), i); ASSERT_EQ(static_cast(draws_[i]), actual) << i; } ASSERT_TRUE(result->EqualsExact(*expected.get())); } -typedef ::testing::Types Primitives; +typedef ::testing::Types Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); -#define DECL_T() \ - typedef typename TestFixture::T T; +#define DECL_T() typedef typename TestFixture::T T; -#define DECL_TYPE() \ - typedef typename TestFixture::Type Type; - -#define DECL_ARRAYTYPE() \ - typedef typename TestFixture::ArrayType ArrayType; +#define DECL_TYPE() typedef typename TestFixture::Type Type; +#define DECL_ARRAYTYPE() typedef typename TestFixture::ArrayType ArrayType; TYPED_TEST(TestPrimitiveBuilder, TestInit) { DECL_TYPE(); @@ -369,7 +361,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { this->Check(this->builder_nn_, false); } - TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { DECL_T(); @@ -424,8 +415,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_EQ(cap, this->builder_->capacity()); ASSERT_EQ(type_traits::bytes_required(cap), this->builder_->data()->size()); - ASSERT_EQ(util::bytes_for_bits(cap), - this->builder_->null_bitmap()->size()); + ASSERT_EQ(util::bytes_for_bits(cap), this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { @@ -437,8 +427,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { 
ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); - ASSERT_EQ(util::next_power2(MIN_BUILDER_CAPACITY + 100), - this->builder_->capacity()); + ASSERT_EQ(util::next_power2(MIN_BUILDER_CAPACITY + 100), this->builder_->capacity()); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index c54d0757c47..9549c47b411 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -28,26 +28,21 @@ namespace arrow { // Primitive array base PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length, - const std::shared_ptr& data, - int32_t null_count, - const std::shared_ptr& null_bitmap) : - Array(type, length, null_count, null_bitmap) { + const std::shared_ptr& data, int32_t null_count, + const std::shared_ptr& null_bitmap) + : Array(type, length, null_count, null_bitmap) { data_ = data; - raw_data_ = data == nullptr? nullptr : data_->data(); + raw_data_ = data == nullptr ? nullptr : data_->data(); } bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { - if (this == &other) return true; - if (null_count_ != other.null_count_) { - return false; - } + if (this == &other) { return true; } + if (null_count_ != other.null_count_) { return false; } if (null_count_ > 0) { - bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_, - util::ceil_byte(length_) / 8); - if (!equal_bitmap) { - return false; - } + bool equal_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, util::ceil_byte(length_) / 8); + if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; @@ -56,9 +51,7 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { DCHECK_GT(value_size, 0); for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { - return false; - } + if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; } this_data += value_size; other_data += value_size; } @@ -69,10 +62,8 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { } bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { - if (this == arr.get()) return true; - if (this->type_enum() != arr->type_enum()) { - return false; - } + if (this == arr.get()) { return true; } + if (this->type_enum() != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } @@ -92,9 +83,7 @@ Status PrimitiveBuilder::Init(int32_t capacity) { template Status PrimitiveBuilder::Resize(int32_t capacity) { // XXX: Set floor size for now - if (capacity < MIN_BUILDER_CAPACITY) { - capacity = MIN_BUILDER_CAPACITY; - } + if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; } if (capacity_ == 0) { RETURN_NOT_OK(Init(capacity)); @@ -122,8 +111,8 @@ Status PrimitiveBuilder::Reserve(int32_t elements) { } template -Status PrimitiveBuilder::Append(const value_type* values, int32_t length, - const uint8_t* valid_bytes) { +Status PrimitiveBuilder::Append( + const value_type* values, int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(PrimitiveBuilder::Reserve(length)); if (length > 0) { @@ -156,9 +145,8 @@ void PrimitiveBuilder::AppendNulls(const uint8_t* valid_bytes, int32_t length template std::shared_ptr PrimitiveBuilder::Finish() { - std::shared_ptr result = std::make_shared< - typename type_traits::ArrayType>( - type_, length_, data_, null_count_, null_bitmap_); + std::shared_ptr result = 
std::make_shared::ArrayType>( + type_, length_, data_, null_count_, null_bitmap_); data_ = null_bitmap_ = nullptr; capacity_ = length_ = null_count_ = 0; @@ -166,8 +154,8 @@ std::shared_ptr PrimitiveBuilder::Finish() { } template <> -Status PrimitiveBuilder::Append(const uint8_t* values, int32_t length, - const uint8_t* valid_bytes) { +Status PrimitiveBuilder::Append( + const uint8_t* values, int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); for (int i = 0; i < length; ++i) { @@ -202,23 +190,18 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count, - const std::shared_ptr& null_bitmap) : - PrimitiveArray(std::make_shared(), length, - data, null_count, null_bitmap) {} + int32_t null_count, const std::shared_ptr& null_bitmap) + : PrimitiveArray( + std::make_shared(), length, data, null_count, null_bitmap) {} bool BooleanArray::EqualsExact(const BooleanArray& other) const { if (this == &other) return true; - if (null_count_ != other.null_count_) { - return false; - } + if (null_count_ != other.null_count_) { return false; } if (null_count_ > 0) { - bool equal_bitmap = null_bitmap_->Equals(*other.null_bitmap_, - util::bytes_for_bits(length_)); - if (!equal_bitmap) { - return false; - } + bool equal_bitmap = + null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; @@ -236,10 +219,8 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const { bool BooleanArray::Equals(const std::shared_ptr& arr) const { if (this == arr.get()) return true; - if (Type::BOOL != arr->type_enum()) { - return false; - } + if (Type::BOOL != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index ec6fee35513..fcd3db4e96e 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -34,17 +34,14 @@ namespace arrow { class MemoryPool; - // Base class for fixed-size logical types class PrimitiveArray : public Array { public: - PrimitiveArray(const TypePtr& type, int32_t length, - const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); virtual ~PrimitiveArray() {} - const std::shared_ptr& data() const { return data_;} + const std::shared_ptr& data() const { return data_; } bool EqualsExact(const PrimitiveArray& other) const; bool Equals(const std::shared_ptr& arr) const override; @@ -54,31 +51,25 @@ class PrimitiveArray : public Array { const uint8_t* raw_data_; }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ -class NAME : public PrimitiveArray { \ - public: \ - using value_type = T; \ - using PrimitiveArray::PrimitiveArray; \ - \ - NAME(int32_t length, const std::shared_ptr& data, \ - int32_t null_count = 0, \ - const std::shared_ptr& null_bitmap = nullptr) : \ - PrimitiveArray(std::make_shared(), length, \ - data, null_count, null_bitmap) {} \ - \ - bool EqualsExact(const NAME& other) const { \ - return PrimitiveArray::EqualsExact( \ - *static_cast(&other)); \ - } \ - \ - const T* raw_data() const { \ - return reinterpret_cast(raw_data_); \ - } \ - \ - T 
Value(int i) const { \ - return raw_data()[i]; \ - } \ -}; +#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ + class NAME : public PrimitiveArray { \ + public: \ + using value_type = T; \ + using PrimitiveArray::PrimitiveArray; \ + \ + NAME(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, \ + const std::shared_ptr& null_bitmap = nullptr) \ + : PrimitiveArray( \ + std::make_shared(), length, data, null_count, null_bitmap) {} \ + \ + bool EqualsExact(const NAME& other) const { \ + return PrimitiveArray::EqualsExact(*static_cast(&other)); \ + } \ + \ + const T* raw_data() const { return reinterpret_cast(raw_data_); } \ + \ + T Value(int i) const { return raw_data()[i]; } \ + }; NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t); NUMERIC_ARRAY_DECL(Int8Array, Int8Type, int8_t); @@ -96,9 +87,8 @@ class PrimitiveBuilder : public ArrayBuilder { public: typedef typename Type::c_type value_type; - explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : - ArrayBuilder(pool, type), - data_(nullptr) {} + explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) + : ArrayBuilder(pool, type), data_(nullptr) {} virtual ~PrimitiveBuilder() {} @@ -117,16 +107,14 @@ class PrimitiveBuilder : public ArrayBuilder { return Status::OK(); } - std::shared_ptr data() const { - return data_; - } + std::shared_ptr data() const { return data_; } // Vector append // // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot - Status Append(const value_type* values, int32_t length, - const uint8_t* valid_bytes = nullptr); + Status Append( + const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr); // Ensure that builder can accommodate an additional number of // elements. 
Resizes if the current capacity is not sufficient @@ -172,89 +160,69 @@ template <> struct type_traits { typedef UInt8Array ArrayType; - static inline int bytes_required(int elements) { - return elements; - } + static inline int bytes_required(int elements) { return elements; } }; template <> struct type_traits { typedef Int8Array ArrayType; - static inline int bytes_required(int elements) { - return elements; - } + static inline int bytes_required(int elements) { return elements; } }; template <> struct type_traits { typedef UInt16Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(uint16_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } }; template <> struct type_traits { typedef Int16Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(int16_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } }; template <> struct type_traits { typedef UInt32Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(uint32_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } }; template <> struct type_traits { typedef Int32Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(int32_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } }; template <> struct type_traits { typedef UInt64Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(uint64_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } }; template <> struct type_traits { typedef Int64Array ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(int64_t); - } + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; template <> struct type_traits { typedef FloatArray ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(float); - } + static inline int bytes_required(int elements) { return elements * sizeof(float); } }; template <> struct type_traits { typedef DoubleArray ArrayType; - static inline int bytes_required(int elements) { - return elements * sizeof(double); - } + static inline int bytes_required(int elements) { return elements * sizeof(double); } }; // Builders @@ -272,25 +240,19 @@ typedef NumericBuilder Int64Builder; typedef NumericBuilder FloatBuilder; typedef NumericBuilder DoubleBuilder; - class BooleanArray : public PrimitiveArray { public: using PrimitiveArray::PrimitiveArray; BooleanArray(int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr); + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); bool EqualsExact(const BooleanArray& other) const; bool Equals(const std::shared_ptr& arr) const override; - const uint8_t* raw_data() const { - return reinterpret_cast(raw_data_); - } + const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } - bool Value(int i) const { - return util::get_bit(raw_data(), i); - } + bool Value(int i) const { return util::get_bit(raw_data(), i); } }; template <> @@ -304,8 +266,8 @@ struct type_traits { class BooleanBuilder : public PrimitiveBuilder { public: - explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) : - PrimitiveBuilder(pool, type) {} + explicit 
BooleanBuilder(MemoryPool* pool, const TypePtr& type) + : PrimitiveBuilder(pool, type) {} virtual ~BooleanBuilder() {} @@ -322,11 +284,9 @@ class BooleanBuilder : public PrimitiveBuilder { ++length_; } - void Append(uint8_t val) { - Append(static_cast(val)); - } + void Append(uint8_t val) { Append(static_cast(val)); } }; -} // namespace arrow +} // namespace arrow #endif // ARROW_TYPES_PRIMITIVE_H diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index d3a4cc37f9c..ee4307c4d16 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -48,7 +48,6 @@ TEST(TypesTest, TestCharType) { ASSERT_EQ(t2.size, 5); } - TEST(TypesTest, TestVarcharType) { VarcharType t1(5); @@ -72,7 +71,7 @@ TEST(TypesTest, TestStringType) { // ---------------------------------------------------------------------- // String container -class TestStringContainer : public ::testing::Test { +class TestStringContainer : public ::testing::Test { public: void SetUp() { chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; @@ -95,8 +94,8 @@ class TestStringContainer : public ::testing::Test { null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); - strings_ = std::make_shared(length_, offsets_buf_, values_, - null_count_, null_bitmap_); + strings_ = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); } protected: @@ -117,7 +116,6 @@ class TestStringContainer : public ::testing::Test { std::shared_ptr strings_; }; - TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); @@ -130,7 +128,6 @@ TEST_F(TestStringContainer, TestType) { ASSERT_EQ(Type::STRING, strings_->type_enum()); } - TEST_F(TestStringContainer, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { @@ -140,10 +137,9 @@ TEST_F(TestStringContainer, TestListFunctions) { } } - TEST_F(TestStringContainer, TestDestructor) { - auto arr = std::make_shared(length_, offsets_buf_, values_, - null_count_, null_bitmap_); + auto arr = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); } TEST_F(TestStringContainer, TestGetString) { @@ -167,9 +163,7 @@ class TestStringBuilder : public TestBuilder { builder_.reset(new StringBuilder(pool_, type_)); } - void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); - } + void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } protected: TypePtr type_; @@ -222,4 +216,4 @@ TEST_F(TestStringBuilder, TestZeroLength) { Done(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 80b075cdfbb..29d97d03947 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -26,11 +26,10 @@ namespace arrow { const std::shared_ptr STRING(new StringType()); -StringArray::StringArray(int32_t length, - const std::shared_ptr& offsets, +StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr& null_bitmap) : - StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + const std::shared_ptr& null_bitmap) + : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} std::string CharType::ToString() const { std::stringstream s; @@ -38,7 +37,6 @@ std::string CharType::ToString() const { return s.str(); } - std::string VarcharType::ToString() const { std::stringstream s; s << 
"varchar(" << size << ")"; @@ -47,4 +45,4 @@ std::string VarcharType::ToString() const { TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 84cd0326ec8..c5cbe1058c7 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -37,48 +37,37 @@ class MemoryPool; struct CharType : public DataType { int size; - explicit CharType(int size) - : DataType(Type::CHAR), - size(size) {} + explicit CharType(int size) : DataType(Type::CHAR), size(size) {} - CharType(const CharType& other) - : CharType(other.size) {} + CharType(const CharType& other) : CharType(other.size) {} virtual std::string ToString() const; }; - // Variable-length, null-terminated strings, up to a certain length struct VarcharType : public DataType { int size; - explicit VarcharType(int size) - : DataType(Type::VARCHAR), - size(size) {} - VarcharType(const VarcharType& other) - : VarcharType(other.size) {} + explicit VarcharType(int size) : DataType(Type::VARCHAR), size(size) {} + VarcharType(const VarcharType& other) : VarcharType(other.size) {} virtual std::string ToString() const; }; -// TODO: add a BinaryArray layer in between +// TODO(wesm): add a BinaryArray layer in between class StringArray : public ListArray { public: - StringArray(const TypePtr& type, int32_t length, - const std::shared_ptr& offsets, - const ArrayPtr& values, - int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) : - ListArray(type, length, offsets, values, null_count, null_bitmap) { + StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr) + : ListArray(type, length, offsets, values, null_count, null_bitmap) { // For convenience bytes_ = static_cast(values.get()); raw_bytes_ = bytes_->raw_data(); } - StringArray(int32_t length, - const std::shared_ptr& offsets, - const ArrayPtr& values, - int32_t null_count = 0, + StringArray(int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); // Compute the pointer t @@ -103,21 +92,18 @@ class StringArray : public ListArray { // Array builder class StringBuilder : public ListBuilder { public: - explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : - ListBuilder(pool, type, std::make_shared(pool, value_type_)) { + explicit StringBuilder(MemoryPool* pool, const TypePtr& type) + : ListBuilder(pool, type, std::make_shared(pool, value_type_)) { byte_builder_ = static_cast(value_builder_.get()); } - Status Append(const std::string& value) { - return Append(value.c_str(), value.size()); - } + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } Status Append(const char* value, int32_t length) { RETURN_NOT_OK(ListBuilder::Append()); return byte_builder_->Append(reinterpret_cast(value), length); } - Status Append(const std::vector& values, - uint8_t* null_bytes); + Status Append(const std::vector& values, uint8_t* null_bytes); std::shared_ptr Finish() override { return ListBuilder::Transfer(); @@ -130,6 +116,6 @@ class StringBuilder : public ListBuilder { static TypePtr value_type_; }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_STRING_H +#endif // ARROW_TYPES_STRING_H diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 
d94396f42c5..79d560e19bc 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -49,7 +49,7 @@ TEST(TestStructType, Basics) { ASSERT_EQ(struct_type.ToString(), "struct"); - // TODO: out of bounds for field(...) + // TODO(wesm): out of bounds for field(...) } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 02af600b017..04a277a86fa 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,6 +17,4 @@ #include "arrow/types/struct.h" -namespace arrow { - -} // namespace arrow +namespace arrow {} // namespace arrow diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 5842534d35b..17e32993bf9 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -24,8 +24,6 @@ #include "arrow/type.h" -namespace arrow { +namespace arrow {} // namespace arrow -} // namespace arrow - -#endif // ARROW_TYPES_STRUCT_H +#endif // ARROW_TYPES_STRUCT_H diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 227aca632ef..1957636b141 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -28,10 +28,10 @@ #include "arrow/type.h" #include "arrow/util/memory-pool.h" -using std::unique_ptr; - namespace arrow { +using std::unique_ptr; + class TestBuilder : public ::testing::Test { public: void SetUp() { @@ -40,6 +40,7 @@ class TestBuilder : public ::testing::Test { builder_.reset(new UInt8Builder(pool_, type_)); builder_nn_.reset(new UInt8Builder(pool_, type_)); } + protected: MemoryPool* pool_; @@ -48,6 +49,6 @@ class TestBuilder : public ::testing::Test { unique_ptr builder_nn_; }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_TEST_COMMON_H +#endif // ARROW_TYPES_TEST_COMMON_H diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc index db3f81795ea..c891b4a5357 100644 --- a/cpp/src/arrow/types/union.cc +++ b/cpp/src/arrow/types/union.cc @@ -30,7 +30,7 @@ static inline std::string format_union(const std::vector& child_types) std::stringstream s; s << "union<"; for (size_t i = 0; i < child_types.size(); ++i) { - if (i) s << ", "; + if (i) { s << ", "; } s << child_types[i]->ToString(); } s << ">"; @@ -41,10 +41,8 @@ std::string DenseUnionType::ToString() const { return format_union(child_types_); } - std::string SparseUnionType::ToString() const { return format_union(child_types_); } - -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index 29cda90b972..d2ee9bde04d 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -33,27 +33,23 @@ class Buffer; struct DenseUnionType : public CollectionType { typedef CollectionType Base; - explicit DenseUnionType(const std::vector& child_types) : - Base() { + explicit DenseUnionType(const std::vector& child_types) : Base() { child_types_ = child_types; } virtual std::string ToString() const; }; - struct SparseUnionType : public CollectionType { typedef CollectionType Base; - explicit SparseUnionType(const std::vector& child_types) : - Base() { + explicit SparseUnionType(const std::vector& child_types) : Base() { child_types_ = child_types; } virtual std::string ToString() const; }; - class UnionArray : public Array { protected: // The data are types encoded as int16 @@ -61,16 +57,13 @@ class UnionArray : public Array { std::vector> children_; }; - class DenseUnionArray : public UnionArray { protected: Buffer* 
offset_buf_; }; +class SparseUnionArray : public UnionArray {}; -class SparseUnionArray : public UnionArray { -}; - -} // namespace arrow +} // namespace arrow -#endif // ARROW_TYPES_UNION_H +#endif // ARROW_TYPES_UNION_H diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 220bff084fd..26554d2c906 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -41,4 +41,4 @@ TEST(UtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, next_power2((1LL << 62) - 1)); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 6c6d5330eab..475576e87ca 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -26,14 +26,12 @@ namespace arrow { void util::bytes_to_bits(const std::vector& bytes, uint8_t* bits) { for (size_t i = 0; i < bytes.size(); ++i) { - if (bytes[i] > 0) { - set_bit(bits, i); - } + if (bytes[i] > 0) { set_bit(bits, i); } } } -Status util::bytes_to_bits(const std::vector& bytes, - std::shared_ptr* out) { +Status util::bytes_to_bits( + const std::vector& bytes, std::shared_ptr* out) { int bit_length = util::bytes_for_bits(bytes.size()); auto buffer = std::make_shared(); @@ -45,4 +43,4 @@ Status util::bytes_to_bits(const std::vector& bytes, return Status::OK(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 8d6287130dd..1f0f08c4d88 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -74,8 +74,8 @@ static inline int64_t next_power2(int64_t n) { void bytes_to_bits(const std::vector& bytes, uint8_t* bits); Status bytes_to_bits(const std::vector&, std::shared_ptr*); -} // namespace util +} // namespace util -} // namespace arrow +} // namespace arrow -#endif // ARROW_UTIL_BIT_UTIL_H +#endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index 1d58226d84a..dad0f7461d9 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -29,8 +29,7 @@ using std::string; namespace arrow { -class TestBuffer : public ::testing::Test { -}; +class TestBuffer : public ::testing::Test {}; TEST_F(TestBuffer, Resize) { PoolBuffer buf; @@ -54,4 +53,4 @@ TEST_F(TestBuffer, ResizeOOM) { ASSERT_RAISES(OutOfMemory, buf.Resize(to_alloc)); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 04cdcd75cd4..bc9c22c10de 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -24,8 +24,7 @@ namespace arrow { -Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, - int64_t size) { +Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size) { data_ = parent->data() + offset; size_ = size; parent_ = parent; @@ -37,18 +36,13 @@ std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } -PoolBuffer::PoolBuffer(MemoryPool* pool) : - ResizableBuffer(nullptr, 0) { - if (pool == nullptr) { - pool = default_memory_pool(); - } +PoolBuffer::PoolBuffer(MemoryPool* pool) : ResizableBuffer(nullptr, 0) { + if (pool == nullptr) { pool = default_memory_pool(); } pool_ = pool; } PoolBuffer::~PoolBuffer() { - if (mutable_data_ != nullptr) { - pool_->Free(mutable_data_, capacity_); - } + if (mutable_data_ != nullptr) { pool_->Free(mutable_data_, capacity_); } } Status PoolBuffer::Reserve(int64_t 
new_capacity) { @@ -74,4 +68,4 @@ Status PoolBuffer::Resize(int64_t new_size) { return Status::OK(); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index c15f9b630cd..94e53b61f2e 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -38,9 +38,7 @@ class Status; // class instance class Buffer : public std::enable_shared_from_this { public: - Buffer(const uint8_t* data, int64_t size) : - data_(data), - size_(size) {} + Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {} virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be @@ -48,40 +46,28 @@ class Buffer : public std::enable_shared_from_this { // parent buffer have been destroyed Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size); - std::shared_ptr get_shared_ptr() { - return shared_from_this(); - } + std::shared_ptr get_shared_ptr() { return shared_from_this(); } // Return true if both buffers are the same size and contain the same bytes // up to the number of compared bytes bool Equals(const Buffer& other, int64_t nbytes) const { - return this == &other || - (size_ >= nbytes && other.size_ >= nbytes && - !memcmp(data_, other.data_, nbytes)); + return this == &other || (size_ >= nbytes && other.size_ >= nbytes && + !memcmp(data_, other.data_, nbytes)); } bool Equals(const Buffer& other) const { - return this == &other || - (size_ == other.size_ && !memcmp(data_, other.data_, size_)); + return this == &other || (size_ == other.size_ && !memcmp(data_, other.data_, size_)); } - const uint8_t* data() const { - return data_; - } + const uint8_t* data() const { return data_; } - int64_t size() const { - return size_; - } + int64_t size() const { return size_; } // Returns true if this Buffer is referencing memory (possibly) owned by some // other buffer - bool is_shared() const { - return static_cast(parent_); - } + bool is_shared() const { return static_cast(parent_); } - const std::shared_ptr parent() const { - return parent_; - } + const std::shared_ptr parent() const { return parent_; } protected: const uint8_t* data_; @@ -97,22 +83,17 @@ class Buffer : public std::enable_shared_from_this { // A Buffer whose contents can be mutated. May or may not own its data. 
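// [Editor's aside] The parent/offset constructor above is the zero-copy
// slicing primitive: the child keeps a shared_ptr to its parent, so the
// underlying memory outlives every view. Sketch, names assumed:
//
//   auto parent = std::make_shared<Buffer>(raw_data, 128);
//   Buffer slice(parent, 32, 64);   // slice.data() == raw_data + 32
//   // slice.is_shared() is true; parent stays alive while slice exists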
class MutableBuffer : public Buffer { public: - MutableBuffer(uint8_t* data, int64_t size) : - Buffer(data, size) { + MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { mutable_data_ = data; } - uint8_t* mutable_data() { - return mutable_data_; - } + uint8_t* mutable_data() { return mutable_data_; } // Get a read-only view of this buffer std::shared_ptr GetImmutableView(); protected: - MutableBuffer() : - Buffer(nullptr, 0), - mutable_data_(nullptr) {} + MutableBuffer() : Buffer(nullptr, 0), mutable_data_(nullptr) {} uint8_t* mutable_data_; }; @@ -128,9 +109,8 @@ class ResizableBuffer : public MutableBuffer { virtual Status Reserve(int64_t new_capacity) = 0; protected: - ResizableBuffer(uint8_t* data, int64_t size) : - MutableBuffer(data, size), - capacity_(size) {} + ResizableBuffer(uint8_t* data, int64_t size) + : MutableBuffer(data, size), capacity_(size) {} int64_t capacity_; }; @@ -152,16 +132,11 @@ static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; class BufferBuilder { public: - explicit BufferBuilder(MemoryPool* pool) : - pool_(pool), - capacity_(0), - size_(0) {} + explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {} Status Append(const uint8_t* data, int length) { if (capacity_ < length + size_) { - if (capacity_ == 0) { - buffer_ = std::make_shared(pool_); - } + if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_); while (capacity_ < length + size_) { capacity_ *= 2; @@ -188,6 +163,6 @@ class BufferBuilder { int64_t size_; }; -} // namespace arrow +} // namespace arrow -#endif // ARROW_UTIL_BUFFER_H +#endif // ARROW_UTIL_BUFFER_H diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 3ce4ccc1e9c..527ce423e77 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -19,6 +19,7 @@ #define ARROW_UTIL_LOGGING_H #include +#include namespace arrow { @@ -37,19 +38,34 @@ namespace arrow { #define ARROW_LOG_INTERNAL(level) arrow::internal::CerrLog(level) #define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level) -#define ARROW_CHECK(condition) \ - (condition) ? 0 : ARROW_LOG(FATAL) << "Check failed: " #condition " " +#define ARROW_CHECK(condition) \ + (condition) ? 
0 : ::arrow::internal::FatalLog(ARROW_FATAL) \ + << __FILE__ << __LINE__ << "Check failed: " #condition " " #ifdef NDEBUG #define ARROW_DFATAL ARROW_WARNING -#define DCHECK(condition) while (false) arrow::internal::NullLog() -#define DCHECK_EQ(val1, val2) while (false) arrow::internal::NullLog() -#define DCHECK_NE(val1, val2) while (false) arrow::internal::NullLog() -#define DCHECK_LE(val1, val2) while (false) arrow::internal::NullLog() -#define DCHECK_LT(val1, val2) while (false) arrow::internal::NullLog() -#define DCHECK_GE(val1, val2) while (false) arrow::internal::NullLog() -#define DCHECK_GT(val1, val2) while (false) arrow::internal::NullLog() +#define DCHECK(condition) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_EQ(val1, val2) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_NE(val1, val2) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_LE(val1, val2) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_LT(val1, val2) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_GE(val1, val2) \ + while (false) \ + arrow::internal::NullLog() +#define DCHECK_GT(val1, val2) \ + while (false) \ + arrow::internal::NullLog() #else #define ARROW_DFATAL ARROW_FATAL @@ -62,13 +78,13 @@ namespace arrow { #define DCHECK_GE(val1, val2) ARROW_CHECK((val1) >= (val2)) #define DCHECK_GT(val1, val2) ARROW_CHECK((val1) > (val2)) -#endif // NDEBUG +#endif // NDEBUG namespace internal { class NullLog { public: - template + template NullLog& operator<<(const T& t) { return *this; } @@ -76,34 +92,42 @@ class NullLog { class CerrLog { public: - CerrLog(int severity) // NOLINT(runtime/explicit) - : severity_(severity), - has_logged_(false) { - } + CerrLog(int severity) // NOLINT(runtime/explicit) + : severity_(severity), + has_logged_(false) {} - ~CerrLog() { - if (has_logged_) { - std::cerr << std::endl; - } - if (severity_ == ARROW_FATAL) { - exit(1); - } + virtual ~CerrLog() { + if (has_logged_) { std::cerr << std::endl; } + if (severity_ == ARROW_FATAL) { std::exit(1); } } - template + template CerrLog& operator<<(const T& t) { has_logged_ = true; std::cerr << t; return *this; } - private: + protected: const int severity_; bool has_logged_; }; -} // namespace internal +// Clang-tidy isn't smart enough to determine that DCHECK using CerrLog doesn't +// return so we create a new class to give it a hint. 
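+// [Editor's aside] Net effect: a failed check streams its message to stderr
+// and the [[noreturn]] destructor calls std::exit(1), which static analyzers
+// can now see. Illustrative use, not part of this patch:
+//   ARROW_CHECK(ptr != nullptr) << "unexpected null input";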
+class FatalLog : public CerrLog { + public: + FatalLog(int /* severity */) // NOLINT + : CerrLog(ARROW_FATAL) {} + + [[noreturn]] ~FatalLog() { + if (has_logged_) { std::cerr << std::endl; } + std::exit(1); + } +}; + +} // namespace internal -} // namespace arrow +} // namespace arrow -#endif // ARROW_UTIL_LOGGING_H +#endif // ARROW_UTIL_LOGGING_H diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 069e627c90e..51e605ee50a 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -19,8 +19,8 @@ #define ARROW_UTIL_MACROS_H // From Google gutil -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ +#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ + TypeName(const TypeName&) = delete; \ void operator=(const TypeName&) = delete -#endif // ARROW_UTIL_MACROS_H +#endif // ARROW_UTIL_MACROS_H diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 6ef07a07ada..e4600a9bd9b 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -45,4 +45,4 @@ TEST(DefaultMemoryPool, OOM) { ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index 0b885e9376a..fb417e74daf 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -75,4 +75,4 @@ MemoryPool* default_memory_pool() { return &default_memory_pool_; } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.h b/cpp/src/arrow/util/memory-pool.h index 0d2478686f5..824c7248e2e 100644 --- a/cpp/src/arrow/util/memory-pool.h +++ b/cpp/src/arrow/util/memory-pool.h @@ -36,6 +36,6 @@ class MemoryPool { MemoryPool* default_memory_pool(); -} // namespace arrow +} // namespace arrow -#endif // ARROW_UTIL_MEMORY_POOL_H +#endif // ARROW_UTIL_MEMORY_POOL_H diff --git a/cpp/src/arrow/util/random.h b/cpp/src/arrow/util/random.h index 64c197ef080..31f2b0680fe 100644 --- a/cpp/src/arrow/util/random.h +++ b/cpp/src/arrow/util/random.h @@ -15,10 +15,10 @@ namespace arrow { namespace random_internal { -static const uint32_t M = 2147483647L; // 2^31-1 +static const uint32_t M = 2147483647L; // 2^31-1 const double kTwoPi = 6.283185307179586476925286; -} // namespace random_internal +} // namespace random_internal // A very simple random number generator. Not especially good at // generating truly random bits, but good enough for our needs in this @@ -27,9 +27,7 @@ class Random { public: explicit Random(uint32_t s) : seed_(s & 0x7fffffffu) { // Avoid bad seeds. - if (seed_ == 0 || seed_ == random_internal::M) { - seed_ = 1; - } + if (seed_ == 0 || seed_ == random_internal::M) { seed_ = 1; } } // Next pseudo-random 32-bit unsigned integer. @@ -50,9 +48,7 @@ class Random { // The first reduction may overflow by 1 bit, so we may need to // repeat. mod == M is not possible; using > allows the faster // sign-bit-based test. - if (seed_ > random_internal::M) { - seed_ -= random_internal::M; - } + if (seed_ > random_internal::M) { seed_ -= random_internal::M; } return seed_; } @@ -91,9 +87,7 @@ class Random { // Skewed: pick "base" uniformly from range [0,max_log] and then // return "base" random bits. The effect is to pick a number in the // range [0,2^max_log-1] with exponential bias towards smaller numbers. 
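// [Editor's aside] For example, Skewed(10) first draws base uniformly from
// [0, 10] and then returns Uniform(1 << base), so small results are far more
// likely than values near 2^10 - 1.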
- uint32_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); - } + uint32_t Skewed(int max_log) { return Uniform(1 << Uniform(max_log + 1)); } // Creates a normal distribution variable using the // Box-Muller transform. See: @@ -103,8 +97,9 @@ class Random { double Normal(double mean, double std_dev) { double uniform1 = (Next() + 1.0) / (random_internal::M + 1.0); double uniform2 = (Next() + 1.0) / (random_internal::M + 1.0); - return (mean + std_dev * sqrt(-2 * ::log(uniform1)) * - cos(random_internal::kTwoPi * uniform2)); + return ( + mean + + std_dev * sqrt(-2 * ::log(uniform1)) * cos(random_internal::kTwoPi * uniform2)); } // Return a random number between 0.0 and 1.0 inclusive. @@ -116,13 +111,11 @@ class Random { uint32_t seed_; }; - uint32_t random_seed() { - // TODO: use system time to get a reasonably random seed + // TODO(wesm): use system time to get a reasonably random seed return 0; } - -} // namespace arrow +} // namespace arrow #endif // ARROW_UTIL_RANDOM_H_ diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index 43cb87e1a8c..d194ed5572f 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -36,9 +36,7 @@ const char* Status::CopyState(const char* state) { } std::string Status::CodeAsString() const { - if (state_ == NULL) { - return "OK"; - } + if (state_ == NULL) { return "OK"; } const char* type; switch (code()) { @@ -66,9 +64,7 @@ std::string Status::CodeAsString() const { std::string Status::ToString() const { std::string result(CodeAsString()); - if (state_ == NULL) { - return result; - } + if (state_ == NULL) { return result; } result.append(": "); @@ -78,4 +74,4 @@ std::string Status::ToString() const { return result; } -} // namespace arrow +} // namespace arrow diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index 4e273edcb8f..6ddc177a9a5 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -20,32 +20,36 @@ #include // Return the given status if it is not OK. -#define ARROW_RETURN_NOT_OK(s) do { \ - ::arrow::Status _s = (s); \ - if (!_s.ok()) return _s; \ +#define ARROW_RETURN_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { return _s; } \ } while (0); // Return the given status if it is not OK, but first clone it and // prepend the given message. -#define ARROW_RETURN_NOT_OK_PREPEND(s, msg) do { \ - ::arrow::Status _s = (s); \ +#define ARROW_RETURN_NOT_OK_PREPEND(s, msg) \ + do { \ + ::arrow::Status _s = (s); \ if (::gutil::PREDICT_FALSE(!_s.ok())) return _s.CloneAndPrepend(msg); \ } while (0); // Return 'to_return' if 'to_call' returns a bad status. // The substitution for 'to_return' may reference the variable // 's' for the bad status. -#define ARROW_RETURN_NOT_OK_RET(to_call, to_return) do { \ - ::arrow::Status s = (to_call); \ - if (::gutil::PREDICT_FALSE(!s.ok())) return (to_return); \ +#define ARROW_RETURN_NOT_OK_RET(to_call, to_return) \ + do { \ + ::arrow::Status s = (to_call); \ + if (::gutil::PREDICT_FALSE(!s.ok())) return (to_return); \ } while (0); // If 'to_call' returns a bad status, CHECK immediately with a logged message // of 'msg' followed by the status. 
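// [Editor's aside] Typical call sites for these two macro families,
// illustrative only:
//   RETURN_NOT_OK(MakeBuilder(pool, type, &builder));         // propagate
//   ARROW_CHECK_OK_PREPEND(buf->Resize(n), "resize failed");  // abort + context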
-#define ARROW_CHECK_OK_PREPEND(to_call, msg) do { \ -::arrow::Status _s = (to_call); \ -ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ -} while (0); +#define ARROW_CHECK_OK_PREPEND(to_call, msg) \ + do { \ + ::arrow::Status _s = (to_call); \ + ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ + } while (0); // If the status is bad, CHECK immediately, appending the status to the // logged message. @@ -53,12 +57,13 @@ ARROW_CHECK(_s.ok()) << (msg) << ": " << _s.ToString(); \ namespace arrow { -#define RETURN_NOT_OK(s) do { \ - Status _s = (s); \ - if (!_s.ok()) return _s; \ +#define RETURN_NOT_OK(s) \ + do { \ + Status _s = (s); \ + if (!_s.ok()) { return _s; } \ } while (0); -enum class StatusCode: char { +enum class StatusCode : char { OK = 0, OutOfMemory = 1, KeyError = 2, @@ -71,7 +76,7 @@ enum class StatusCode: char { class Status { public: // Create a success status. - Status() : state_(NULL) { } + Status() : state_(NULL) {} ~Status() { delete[] state_; } // Copy the specified status. @@ -132,8 +137,7 @@ class Status { const char* state_; StatusCode code() const { - return ((state_ == NULL) ? - StatusCode::OK : static_cast(state_[4])); + return ((state_ == NULL) ? StatusCode::OK : static_cast(state_[4])); } Status(StatusCode code, const std::string& msg, int16_t posix_code); @@ -155,5 +159,4 @@ inline void Status::operator=(const Status& s) { } // namespace arrow - -#endif // ARROW_STATUS_H_ +#endif // ARROW_STATUS_H_ diff --git a/cpp/src/arrow/util/test_main.cc b/cpp/src/arrow/util/test_main.cc index adc8466fb0b..f9280470239 100644 --- a/cpp/src/arrow/util/test_main.cc +++ b/cpp/src/arrow/util/test_main.cc @@ -17,7 +17,7 @@ #include "gtest/gtest.h" -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); int ret = RUN_ALL_TESTS(); diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 3d5f532b163..f1738ff7482 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -84,8 +84,8 @@ if [ -n "$F_ALL" -o -n "$F_FLATBUFFERS" ]; then cd $TP_DIR/$FLATBUFFERS_BASEDIR CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX -DFLATBUFFERS_BUILD_TESTS=OFF . || { echo "cmake $FLATBUFFERS_ERROR" ; exit 1; } - make -j$PARALLEL - make install + make VERBOSE=1 -j$PARALLEL || { echo "make $FLATBUFFERS_ERROR" ; exit 1; } + make install || { echo "install $FLATBUFFERS_ERROR" ; exit 1; } fi echo "---------------------" From 9d88a50c55d18860c5543dfa6ddc8f4f162dd5e5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 3 Apr 2016 13:10:17 -0700 Subject: [PATCH 058/210] ARROW-86: [Python] Implement zero-copy Arrow-to-Pandas conversion We can create zero-copy NumPy arrays for floats and ints if we have no nulls. Each numpy-arrow-view has a reference to the underlying column to ensure that the Arrow structure lives at least as long as the newly created NumPy array. Author: Uwe L. Korn Closes #52 from xhochy/arrow-86 and squashes the following commits: ee29e90 [Uwe L. Korn] Remove duplicate ref counting 2cb4c7d [Uwe L. Korn] Release instead of reset reference 9d35528 [Uwe L. Korn] Handle reference counting with OwnedRef 327b368 [Uwe L. 
Korn] ARROW-86: [Python] Implement zero-copy Arrow-to-Pandas conversion --- python/pyarrow/array.pyx | 1 - python/pyarrow/includes/pyarrow.pxd | 2 +- python/pyarrow/table.pyx | 6 ++- python/src/pyarrow/adapters/pandas.cc | 67 ++++++++++++++++++++------- python/src/pyarrow/adapters/pandas.h | 3 +- python/src/pyarrow/common.h | 4 ++ 6 files changed, 60 insertions(+), 23 deletions(-) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 456bf6d1da8..a80b3ce8398 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -288,4 +288,3 @@ cdef class RowBatch: def __getitem__(self, i): return self.arrays[i] - diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 1066b8034be..92c814706fd 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -46,6 +46,6 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, shared_ptr[CArray]* out) - Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out) + Status ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 4c4816f0c7e..f02d36f520b 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -96,7 +96,7 @@ cdef class Column: import pandas as pd - check_status(pyarrow.ArrowToPandas(self.sp_column, &arr)) + check_status(pyarrow.ArrowToPandas(self.sp_column, self, &arr)) return pd.Series(arr, name=self.name) cdef _check_nullptr(self): @@ -205,6 +205,7 @@ cdef class Table: cdef: PyObject* arr shared_ptr[CColumn] col + Column column import pandas as pd @@ -212,7 +213,8 @@ cdef class Table: data = [] for i in range(self.table.num_columns()): col = self.table.column(i) - check_status(pyarrow.ArrowToPandas(col, &arr)) + column = self.column(i) + check_status(pyarrow.ArrowToPandas(col, column, &arr)) names.append(frombytes(col.get().name())) data.append( arr) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 22f1d7575f8..b39fde92034 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -520,8 +520,8 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { template class ArrowDeserializer { public: - ArrowDeserializer(const std::shared_ptr& col) : - col_(col) {} + ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) : + col_(col), py_ref_(py_ref) {} Status Convert(PyObject** out) { const std::shared_ptr data = col_->data(); @@ -548,6 +548,33 @@ class ArrowDeserializer { return Status::OK(); } + Status OutputFromData(int type, void* data) { + // Zero-Copy. We can pass the data pointer directly to NumPy. + Py_INCREF(py_ref_); + OwnedRef py_ref(py_ref); + npy_intp dims[1] = {col_->length()}; + out_ = reinterpret_cast(PyArray_SimpleNewFromData(1, dims, + type, data)); + + if (out_ == NULL) { + // Error occurred, trust that SimpleNew set the error state + return Status::OK(); + } + + if (PyArray_SetBaseObject(out_, py_ref_) == -1) { + // Error occurred, trust that SetBaseObject set the error state + return Status::OK(); + } else { + // PyArray_SetBaseObject steals our reference to py_ref_ + py_ref.release(); + } + + // Arrow data is immutable. 
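+  // [Editor's note] With the flag cleared, NumPy raises if user code assigns
+  // into this view, protecting the shared, immutable Arrow buffer.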
+    PyArray_CLEARFLAGS(out_, NPY_ARRAY_WRITEABLE);
+
+    return Status::OK();
+  }
+
   template <int T2>
   inline typename std::enable_if<
     arrow_traits<T2>::is_floating, Status>::type
@@ -556,18 +583,20 @@ class ArrowDeserializer {
     arrow::PrimitiveArray* prim_arr = static_cast<arrow::PrimitiveArray*>(
         arr.get());
-
-    RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+    const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());

     if (arr->null_count() > 0) {
+      RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
+
       T* out_values = reinterpret_cast<T*>(PyArray_DATA(out_));
-      const T* in_values = reinterpret_cast<const T*>(prim_arr->data()->data());
       for (int64_t i = 0; i < arr->length(); ++i) {
         out_values[i] = arr->IsNull(i) ? NAN : in_values[i];
       }
     } else {
-      memcpy(PyArray_DATA(out_), prim_arr->data()->data(),
-          arr->length() * arr->type()->value_size());
+      // Zero-Copy. We can pass the data pointer directly to NumPy.
+      void* data = const_cast<T*>(in_values);
+      int type = arrow_traits<T2>::npy_type;
+      RETURN_NOT_OK(OutputFromData(type, data));
     }

     return Status::OK();
@@ -594,10 +623,10 @@ class ArrowDeserializer {
       out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i];
     }
   } else {
-      RETURN_NOT_OK(AllocateOutput(arrow_traits<T2>::npy_type));
-
-      memcpy(PyArray_DATA(out_), in_values,
-          arr->length() * arr->type()->value_size());
+      // Zero-Copy. We can pass the data pointer directly to NumPy.
+      void* data = const_cast<T*>(in_values);
+      int type = arrow_traits<T2>::npy_type;
+      RETURN_NOT_OK(OutputFromData(type, data));
     }

     return Status::OK();
@@ -680,18 +709,20 @@ class ArrowDeserializer {
   }

  private:
   std::shared_ptr<arrow::Column> col_;
+  PyObject* py_ref_;
   PyArrayObject* out_;
 };

-#define FROM_ARROW_CASE(TYPE)                                      \
-  case arrow::Type::TYPE:                                          \
-    {                                                              \
-      ArrowDeserializer<arrow::Type::TYPE> converter(col);         \
-      return converter.Convert(out);                               \
-    }                                                              \
+#define FROM_ARROW_CASE(TYPE)                                      \
+  case arrow::Type::TYPE:                                          \
+    {                                                              \
+      ArrowDeserializer<arrow::Type::TYPE> converter(col, py_ref); \
+      return converter.Convert(out);                               \
+    }                                                              \
     break;

-Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject** out) {
+Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject* py_ref,
+    PyObject** out) {
   switch(col->type()->type) {
     FROM_ARROW_CASE(BOOL);
     FROM_ARROW_CASE(INT8);
diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h
index 58eb3ca61cd..17922349de6 100644
--- a/python/src/pyarrow/adapters/pandas.h
+++ b/python/src/pyarrow/adapters/pandas.h
@@ -36,7 +36,8 @@ namespace pyarrow {

 class Status;

-Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject** out);
+Status ArrowToPandas(const std::shared_ptr<arrow::Column>& col, PyObject* py_ref,
+    PyObject** out);

 Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
     std::shared_ptr<arrow::Array>* out);
diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h
index cc9ad9ec5bb..0211e8948f2 100644
--- a/python/src/pyarrow/common.h
+++ b/python/src/pyarrow/common.h
@@ -53,6 +53,10 @@ class OwnedRef {
     obj_ = obj;
   }

+  void release() {
+    obj_ = nullptr;
+  }
+
   PyObject* obj() const { return obj_; }

From 7b2153b0430b825730a6e986993bb290ef29d22a Mon Sep 17 00:00:00 2001
From: Kai Zheng
Date: Mon, 11 Apr 2016 10:35:50 +0200
Subject: [PATCH 059/210] =?UTF-8?q?ARROW-85:=20memcmp=20can=20be=20avoided?=
 =?UTF-8?q?=20in=20Equal=20when=20comparing=20with=20the=20same=20?=
 =?UTF-8?q?=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid memcmp when possible, if the two underlying buffers are the same.
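The idea behind this patch is easy to state outside Arrow's classes: equality over two byte ranges can skip the `memcmp` entirely whenever both ranges alias the same memory. A minimal standalone sketch (a plain struct, not the Arrow `Buffer` API):

```
#include <cstdint>
#include <cstring>
#include <iostream>

// Standalone sketch of the aliasing shortcut from ARROW-85: skip memcmp
// when both views point at the same bytes.
struct ByteView {
  const uint8_t* data;
  int64_t size;

  bool Equals(const ByteView& other) const {
    if (size != other.size) return false;
    // Same pointer means the contents must match; memcmp is only
    // needed for distinct allocations.
    return data == other.data || memcmp(data, other.data, size) == 0;
  }
};

int main() {
  uint8_t block[4] = {1, 2, 3, 4};
  ByteView a{block, 4};
  ByteView b{block, 4};  // aliases the same memory: no memcmp needed
  uint8_t copy[4] = {1, 2, 3, 4};
  ByteView c{copy, 4};   // distinct memory: falls back to memcmp
  std::cout << a.Equals(b) << " " << a.Equals(c) << std::endl;  // prints "1 1"
  return 0;
}
```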
Author: Kai Zheng

Closes #57 from drankye/upstream and squashes the following commits:

2a70944 [Kai Zheng] Free test buffers afterwards
6a8bef5 [Kai Zheng] Fixed some comments
b83f989 [Kai Zheng] ARROW-85. Corrected another format issue by clang-format
0ddcd01 [Kai Zheng] ARROW-85. Fixed another format issue
1b48663 [Kai Zheng] ARROW-85. Fixed checking styles
9f239a3 [Kai Zheng] ARROW-85. Added tests for Buffer and the new behavior
4d04c27 [Kai Zheng] ARROW-85 memcmp can be avoided in Equal when comparing with the same Buffer
---
 cpp/src/arrow/util/buffer-test.cc | 43 +++++++++++++++++++++++++++
 cpp/src/arrow/util/buffer.h       |  9 ++++---
 2 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc
index dad0f7461d9..cc4ec98e4fb 100644
--- a/cpp/src/arrow/util/buffer-test.cc
+++ b/cpp/src/arrow/util/buffer-test.cc
@@ -53,4 +53,47 @@ TEST_F(TestBuffer, ResizeOOM) {
   ASSERT_RAISES(OutOfMemory, buf.Resize(to_alloc));
 }

+TEST_F(TestBuffer, EqualsWithSameContent) {
+  MemoryPool* pool = default_memory_pool();
+  const int32_t bufferSize = 128 * 1024;
+  uint8_t* rawBuffer1;
+  ASSERT_OK(pool->Allocate(bufferSize, &rawBuffer1));
+  memset(rawBuffer1, 12, bufferSize);
+  uint8_t* rawBuffer2;
+  ASSERT_OK(pool->Allocate(bufferSize, &rawBuffer2));
+  memset(rawBuffer2, 12, bufferSize);
+  uint8_t* rawBuffer3;
+  ASSERT_OK(pool->Allocate(bufferSize, &rawBuffer3));
+  memset(rawBuffer3, 3, bufferSize);
+
+  Buffer buffer1(rawBuffer1, bufferSize);
+  Buffer buffer2(rawBuffer2, bufferSize);
+  Buffer buffer3(rawBuffer3, bufferSize);
+  ASSERT_TRUE(buffer1.Equals(buffer2));
+  ASSERT_FALSE(buffer1.Equals(buffer3));
+
+  pool->Free(rawBuffer1, bufferSize);
+  pool->Free(rawBuffer2, bufferSize);
+  pool->Free(rawBuffer3, bufferSize);
+}
+
+TEST_F(TestBuffer, EqualsWithSameBuffer) {
+  MemoryPool* pool = default_memory_pool();
+  const int32_t bufferSize = 128 * 1024;
+  uint8_t* rawBuffer;
+  ASSERT_OK(pool->Allocate(bufferSize, &rawBuffer));
+  memset(rawBuffer, 111, bufferSize);
+
+  Buffer buffer1(rawBuffer, bufferSize);
+  Buffer buffer2(rawBuffer, bufferSize);
+  ASSERT_TRUE(buffer1.Equals(buffer2));
+
+  const int64_t nbytes = bufferSize / 2;
+  Buffer buffer3(rawBuffer, nbytes);
+  ASSERT_TRUE(buffer1.Equals(buffer3, nbytes));
+  ASSERT_FALSE(buffer1.Equals(buffer3, nbytes + 1));
+
+  pool->Free(rawBuffer, bufferSize);
+}
+
 } // namespace arrow
diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h
index 94e53b61f2e..56532be8070 100644
--- a/cpp/src/arrow/util/buffer.h
+++ b/cpp/src/arrow/util/buffer.h
@@ -51,12 +51,15 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
   // Return true if both buffers are the same size and contain the same bytes
   // up to the number of compared bytes
   bool Equals(const Buffer& other, int64_t nbytes) const {
-    return this == &other || (size_ >= nbytes && other.size_ >= nbytes &&
-        !memcmp(data_, other.data_, nbytes));
+    return this == &other ||
+        (size_ >= nbytes && other.size_ >= nbytes &&
+            (data_ == other.data_ || !memcmp(data_, other.data_, nbytes)));
   }

   bool Equals(const Buffer& other) const {
-    return this == &other || (size_ == other.size_ && !memcmp(data_, other.data_, size_));
+    return this == &other ||
+        (size_ == other.size_ &&
+            (data_ == other.data_ || !memcmp(data_, other.data_, size_)));
   }

   const uint8_t* data() const { return data_; }

From 37f72716822f5b7ec3055b2dd0fabbc992e46c08 Mon Sep 17 00:00:00 2001
From: Micah Kornfield
Date: Thu, 14 Apr 2016 19:24:19 +0200
Subject: [PATCH 060/210] ARROW-94: [Format] Expand list example to clarify
 null vs empty list
WIP to make sure what I've done so far looks good. Per discussion on the JIRA
item I started conversion of example images to "text diagrams", but I wanted
to get feedback to see if this is actually desirable (and if the way I'm
approaching it is desirable). The remaining diagrams are for unions, which I
can convert if the existing changes look OK to others (although I think the
Union diagrams are pretty reasonable/compact).

This change also includes some other minor cleanup, as well as including a
statement about endianness per the discussion on the mailing list.

Rendered markdown can be seen at:
https://github.com/emkornfield/arrow/blob/emk_doc_fixes_PR3/format/Layout.md

Author: Micah Kornfield

Closes #58 from emkornfield/emk_doc_fixes_PR3 and squashes the following commits:

00b99ef [Micah Kornfield] remove png diagrams that are no longer used
cab6f87 [Micah Kornfield] a few more consistency fixes
5550a78 [Micah Kornfield] fix some off by one bugs
69e1a78 [Micah Kornfield] fix some alignment, and one last offset array to buffer conversion
b7aa7ea [Micah Kornfield] change list offset array to offset buffer
7dda5d5 [Micah Kornfield] clarify requirements of child types, finish replacing diagrams, fix some typos
0f23052 [Micah Kornfield] replace diagrams with physical layouts, clarify memory requirements for struct
590e4a7 [Micah Kornfield] cleanup magic quotes and clarify/fix some minor points
---
 format/Layout.md                           | 343 +++++++++++++++++++--
 format/diagrams/layout-dense-union.png     | Bin 47999 -> 0 bytes
 format/diagrams/layout-list-of-list.png    | Bin 40105 -> 0 bytes
 format/diagrams/layout-list-of-struct.png  | Bin 54122 -> 0 bytes
 format/diagrams/layout-list.png            | Bin 15906 -> 0 bytes
 format/diagrams/layout-primitive-array.png | Bin 10907 -> 0 bytes
 format/diagrams/layout-sparse-union.png    | Bin 43020 -> 0 bytes
 7 files changed, 311 insertions(+), 32 deletions(-)
 delete mode 100644 format/diagrams/layout-dense-union.png
 delete mode 100644 format/diagrams/layout-list-of-list.png
 delete mode 100644 format/diagrams/layout-list-of-struct.png
 delete mode 100644 format/diagrams/layout-list.png
 delete mode 100644 format/diagrams/layout-primitive-array.png
 delete mode 100644 format/diagrams/layout-sparse-union.png

diff --git a/format/Layout.md b/format/Layout.md
index 1b532c6b381..92553d944c2 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -9,7 +9,7 @@ concepts, here is a small glossary to help disambiguate.
 * Slot or array slot: a single logical value in an array of some particular
   data type
 * Contiguous memory region: a sequential virtual address space with a given
   length. Any byte can be reached via a single pointer offset less than the
-  region’s length.
+  region's length.
 * Primitive type: a data type that occupies a fixed-size memory slot specified
   in bit width or byte width
 * Nested or parametric type: a data type whose full structure depends on one or
@@ -42,7 +42,7 @@ Base requirements
 * Capable of representing fully-materialized and decoded / decompressed Parquet
   data
 * All leaf nodes (primitive value arrays) use contiguous memory regions
-* Any relative type can be have null slots
+* Any relative type can have null slots
 * Arrays are immutable once created. Implementations can provide APIs to mutate
   an array, but applying mutations will require a new array data structure to
   be built.
@@ -69,11 +69,15 @@ Base requirements
 * To define a selection or masking vector construct
 * Implementation-specific details
 * Details of a user or developer C/C++/Java API.
-* Any “table” structure composed of named arrays each having their own type or
+* Any "table" structure composed of named arrays each having their own type or
   any other structure that composes arrays.
 * Any memory management or reference counting subsystem
 * To enumerate or specify types of encodings or compression support

+## Byte Order (Endianness)
+
+The Arrow format is little endian.
+
 ## Array lengths

 Any array has a known and fixed length, stored as a 32-bit signed integer, so a
@@ -142,9 +146,59 @@ the size is rounded up to the nearest byte. The associated null bitmap is
 contiguously allocated (as described above) but does not need to be adjacent
 in memory to the values buffer.

-(diagram not to scale)
-
+### Example Layout: Int32 Array
+For example a primitive array of int32s:
+
+[1, 2, null, 4, 8]
+
+Would look like:
+
+```
+* Length: 5, Null count: 1
+* Null bitmap buffer:
+
+  | Byte 0 (validity bitmap) | Bytes 1-7   |
+  |--------------------------|-------------|
+  | 00011011                 | 0 (padding) |
+
+* Value Buffer:
+
+  | Bytes 0-3 | Bytes 4-7 | Bytes 8-11  | Bytes 12-15 | Bytes 16-19 |
+  |-----------|-----------|-------------|-------------|-------------|
+  | 1         | 2         | unspecified | 4           | 8           |
+```
+
+### Example Layout: Non-null int32 Array
+
+[1, 2, 3, 4, 8] has two possible layouts:
+
+```
+* Length: 5, Null count: 0
+* Null bitmap buffer:
+
+  | Byte 0 (validity bitmap) | Bytes 1-7   |
+  |--------------------------|-------------|
+  | 00011111                 | 0 (padding) |
+
+* Value Buffer:
+
+  | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 |
+  |-----------|-----------|------------|-------------|-------------|
+  | 1         | 2         | 3          | 4           | 8           |
+```
+
+or with the bitmap elided:
+
+```
+* Length: 5, Null count: 0
+* Null bitmap buffer: Not required
+* Value Buffer:
+
+  | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 |
+  |-----------|-----------|------------|-------------|-------------|
+  | 1         | 2         | 3          | 4           | 8           |
+```
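The bitmap tables above follow least-significant-bit numbering, so slot j lives at bit `j % 8` of byte `j / 8`. A small standalone sketch (plain C++, not Arrow library code) that reproduces the `00011011` bitmap for [1, 2, null, 4, 8]:

```
#include <cstdint>
#include <cstdio>

// LSB-ordered validity-bitmap addressing: slot j's validity lives at
// bit (j % 8) of byte (j / 8).
static bool BitIsSet(const uint8_t* bitmap, int64_t j) {
  return (bitmap[j / 8] >> (j % 8)) & 1;
}

static void SetBit(uint8_t* bitmap, int64_t j) {
  bitmap[j / 8] |= static_cast<uint8_t>(1u << (j % 8));
}

int main() {
  uint8_t bitmap[1] = {0};
  const bool valid[5] = {true, true, false, true, true};  // [1, 2, null, 4, 8]
  for (int64_t j = 0; j < 5; ++j) {
    if (valid[j]) SetBit(bitmap, j);
  }
  printf("%#04x\n", bitmap[0]);  // 0x1b == 0b00011011, as in the table above
  for (int64_t j = 0; j < 5; ++j) {
    printf("slot %lld: %s\n", (long long)j, BitIsSet(bitmap, j) ? "valid" : "null");
  }
  return 0;
}
```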
 ## List type

 A list is a nested type in which each array slot contains a variable-size
 sequence of values all having the same relative type

 A list type is specified like `List<T>`, where `T` is any relative type

 A list-array is represented by the combination of the following:

 * A values array, a child array of type T. T may also be a nested type.
-* An offsets array containing 32-bit signed integers with length equal to the
+* An offsets buffer containing 32-bit signed integers with length equal to the
   length of the top-level array plus one. Note that this limits the size of
   the values array to 2^31 - 1.
@@ -175,20 +229,76 @@ slot_length = offsets[j + 1] - offsets[j]  // (for 0 <= j < length)

 The first value in the offsets array is 0, and the last element is the length
 of the values array.

-Let’s consider an example, the type `List<Char>`, where Char is a 1-byte
+### Example Layout: `List<Char>` Array
+Let's consider an example, the type `List<Char>`, where Char is a 1-byte
 logical type.

-For an array of length 3 with respective values:
+For an array of length 4 with respective values:

-[[‘j’, ‘o’, ‘e’], null, [‘m’, ‘a’, ‘r’, ‘k’]]
+[['j', 'o', 'e'], null, ['m', 'a', 'r', 'k'], []]

-We have the following offsets and values arrays
+will have the following representation:

-
+```
+* Length: 4, Null count: 1
+* Null bitmap buffer:
+
+  | Byte 0 (validity bitmap) | Bytes 1-7   |
+  |--------------------------|-------------|
+  | 00001101                 | 0 (padding) |
+
+* Offsets buffer (int32)
+
+  | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 |
+  |-----------|-----------|------------|-------------|-------------|
+  | 0         | 3         | 3          | 7           | 7           |
+
+* Values array (char array):
+  * Length: 7, Null count: 0
+  * Null bitmap buffer: Not required
+
+    | Bytes 0-7 |
+    |-----------|
+    | joemark   |
+```
+
+### Example Layout: `List<List<byte>>`
+[[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], [[9, 10]]]
+
+will be represented as follows:
+
+```
+* Length: 3, Null count: 0
+* Null bitmap buffer: Not required
+* Offsets buffer (int32)
+
+  | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 |
+  |-----------|-----------|------------|-------------|
+  | 0         | 2         | 5          | 6           |

-Let’s consider an array of a nested type, `List<List<byte>>`
-
+* Values array (`List<byte>`)
+  * Length: 6, Null count: 1
+  * Null bitmap buffer:
+
+    | Byte 0 (validity bitmap) | Bytes 1-7   |
+    |--------------------------|-------------|
+    | 00110111                 | 0 (padding) |
+
+  * Offsets buffer (int32)
+
+    | Bytes 0-27           |
+    |----------------------|
+    | 0, 2, 4, 7, 7, 8, 10 |
+
+  * Values array (bytes):
+    * Length: 10, Null count: 0
+    * Null bitmap buffer: Not required
+
+      | Bytes 0-9                     |
+      |-------------------------------|
+      | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 |
+```
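The offsets in both list examples follow directly from the rule `slot_length = offsets[j + 1] - offsets[j]`. A standalone sketch (not Arrow code; the slot encoding here is a simplification) that rebuilds the `List<Char>` example's offsets buffer and shows that a null list and an empty list differ only in the validity bitmap:

```
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // The List<Char> example: [['j','o','e'], null, ['m','a','r','k'], []].
  // A null slot contributes zero length, exactly like an empty list.
  struct Slot { bool valid; std::string values; };
  std::vector<Slot> slots = {{true, "joe"}, {false, ""}, {true, "mark"}, {true, ""}};

  std::vector<int32_t> offsets = {0};  // length + 1 entries in total
  std::string values;                  // the child char array
  for (const Slot& s : slots) {
    values += s.values;
    offsets.push_back(static_cast<int32_t>(values.size()));
  }

  for (int32_t o : offsets) std::cout << o << " ";  // prints: 0 3 3 7 7
  std::cout << "| " << values << std::endl;         // prints: | joemark

  // Slot length is offsets[j + 1] - offsets[j]; null vs [] is decided
  // solely by the validity bitmap (modeled here by the `valid` flag).
  for (size_t j = 0; j < slots.size(); ++j) {
    int32_t len = offsets[j + 1] - offsets[j];
    std::cout << "slot " << j << ": len=" << len
              << (slots[j].valid ? "" : " (null)") << std::endl;
  }
  return 0;
}
```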
 ## Struct type

 A struct is a nested type parameterized by an ordered sequence of relative
 types (which can all be distinct), called its fields.

 Typically the fields have names, but the names and their types are part of the
 type metadata, not the physical memory layout.

-A struct does not have any additional allocated physical storage.
+A struct array does not have any additional allocated physical storage for its values.
+A struct array must still have an allocated null bitmap, if it has one or more null values.

 Physically, a struct type has one child array for each field.
@@ -213,15 +324,67 @@ Struct <
 ```

 has two child arrays, one List array (layout as above) and one 4-byte
-physical value array having Int32 logical type. Here is a diagram showing the
-full physical layout of this struct:
+primitive value array having Int32 logical type.
+
+### Example Layout: `Struct<List<char>, Int32>`:
+The layout for [{'joe', 1}, {null, 2}, null, {'mark', 4}] would be:
+
+```
+* Length: 4, Null count: 1
+* Null bitmap buffer:
+
+  | Byte 0 (validity bitmap) | Bytes 1-7   |
+  |--------------------------|-------------|
+  | 00001011                 | 0 (padding) |
+
+* Children arrays:
+  * field-0 array (`List<char>`):
+    * Length: 4, Null count: 1
+    * Null bitmap buffer:
+
+      | Byte 0 (validity bitmap) | Bytes 1-7   |
+      |--------------------------|-------------|
+      | 00001101                 | 0 (padding) |
-
+    * Offsets buffer:
+
+      | Bytes 0-19     |
+      |----------------|
+      | 0, 3, 3, 6, 10 |
+
+    * Values array:
+      * Length: 10, Null count: 0
+      * Null bitmap buffer: Not required
+
+      * Value buffer:
+
+        | Bytes 0-9  |
+        |------------|
+        | joebobmark |
+
+  * field-1 array (int32 array):
+    * Length: 4, Null count: 0
+    * Null bitmap buffer: Not required
+    * Value Buffer:
+
+      | Bytes 0-15 |
+      |------------|
+      | 1, 2, 3, 4 |
+
+```

 While a struct does not have physical storage for each of its semantic slots
 (i.e. each scalar C-like struct), an entire struct slot can be set to null via
 the null bitmap. Any of the child field arrays can have null values according
 to their respective independent null bitmaps.
+This implies that for a particular struct slot the null bitmap for the struct
+array might indicate a null slot when one or more of its child arrays has a
+non-null value in their corresponding slot. When reading the struct array the
+parent null bitmap is authoritative.
+This is illustrated in the example above: the child arrays have valid entries
+for the null struct but are 'hidden' from the consumer by the parent array's
+null bitmap. However, when treated independently, the corresponding
+values of the child arrays will be non-null.

 ## Dense union type
@@ -237,23 +400,64 @@ cases.
 This first, the dense union, represents a mixed-type array with 6 bytes of
 overhead for each value. Its physical layout is as follows:

 * One child array for each relative type
-* Types array: An array of unsigned integers, enumerated from 0 corresponding
+* Types buffer: A buffer of unsigned integers, enumerated from 0 corresponding
   to each type, with the smallest byte width capable of representing the number
   of types in the union.
-* Offsets array: An array of signed int32 values indicating the relative offset
+* Offsets buffer: A buffer of signed int32 values indicating the relative offset
   into the respective child array for the type in a given slot. The respective
   offsets for each child value array must be in order / increasing.

-Alternate proposal (TBD): the types and offset values may be packed into an
-int48 with 2 bytes for the type and 4 bytes for the offset.
-
 Critically, the dense union allows for minimal overhead in the ubiquitous
-union-of-structs with non-overlapping-fields use case (Union)
+union-of-structs with non-overlapping-fields use case (`Union`)

-Here is a diagram of an example dense union:
+### Example Layout: Dense union
+
+An example layout for logical union of:
+`Union<f: float, i: int32>` having the values:
+[{f=1.2}, null, {f=3.4}, {i=5}]
+
+```
+* Length: 4, Null count: 1
+* Null bitmap buffer:
+
+  | Byte 0 (validity bitmap) | Bytes 1-7   |
+  |--------------------------|-------------|
+  | 00001101                 | 0 (padding) |

+* Types buffer:
+
+  | Byte 0-1 | Byte 2-3    | Byte 4-5 | Byte 6-7 |
+  |----------|-------------|----------|----------|
+  | 0        | unspecified | 0        | 1        |
+
+* Offset buffer:
+
+  | Byte 0-3 | Byte 4-7    | Byte 8-11 | Byte 12-15 |
+  |----------|-------------|-----------|------------|
+  | 0        | unspecified | 1         | 0          |
+
+* Children arrays:
+  * Field-0 array (f: float):
+    * Length: 2, Null count: 0
+    * Null bitmap buffer: Not required
+
+    * Value Buffer:
+
+      | Bytes 0-7 |
+      |-----------|
+      | 1.2, 3.4  |
+
+  * Field-1 array (i: int32):
+    * Length: 1, Null count: 0
+    * Null bitmap buffer: Not required
+
+    * Value Buffer:
+
+      | Bytes 0-3 |
+      |-----------|
+      | 5         |
+```
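Resolving a dense-union slot takes one lookup in the types buffer plus one in the offsets buffer. A standalone sketch of the example above (simplified: real readers consult the validity bitmap for nullness; the -1 sentinels here are only for readability):

```
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Dense union example from above: [{f=1.2}, null, {f=3.4}, {i=5}].
  // The spec leaves null slots unspecified and takes nullness from the
  // validity bitmap; -1 stands in for "unspecified" here.
  std::vector<int16_t> types   = {0, -1, 0, 1};
  std::vector<int32_t> offsets = {0, -1, 1, 0};
  std::vector<float>   child_f = {1.2f, 3.4f};  // type id 0
  std::vector<int32_t> child_i = {5};           // type id 1

  for (size_t slot = 0; slot < types.size(); ++slot) {
    if (types[slot] < 0) {
      std::cout << "slot " << slot << ": null" << std::endl;
    } else if (types[slot] == 0) {
      // The offset indexes into the compact child array for that type.
      std::cout << "slot " << slot << ": f=" << child_f[offsets[slot]] << std::endl;
    } else {
      std::cout << "slot " << slot << ": i=" << child_i[offsets[slot]] << std::endl;
    }
  }
  return 0;
}
```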
 ## Sparse union type

 A sparse union has the same structure as a dense union, with the omission of
 the offsets array. In this case, the child arrays are each equal in length to
 the length of the union.

 While a sparse union may use significantly more space compared with a dense
 union, it has some advantages that may be desirable in certain use cases:
-
+* A sparse union is more amenable to vectorized expression evaluation in some use cases.
+* Equal-length arrays can be interpreted as a union by only defining the types array.

-More amenable to vectorized expression evaluation in some use cases.
-Equal-length arrays can be interpreted as a union by only defining the types array
+### Example Layout: `SparseUnion<u0: Int32, u1: Float, u2: List<char>>`
+
+For the union array:
+
+[{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}]
+
+will have the following layout:
+```
+* Length: 6, Null count: 0
+* Null bitmap buffer: Not required
+
+* Types buffer:
+
+  | Bytes 0-1 | Bytes 2-3 | Bytes 4-5 | Bytes 6-7 | Bytes 8-9 | Bytes 10-11 |
+  |-----------|-----------|-----------|-----------|-----------|-------------|
+  | 0         | 1         | 2         | 1         | 0         | 2           |
+
+* Children arrays:
+
+  * u0 (Int32):
+    * Length: 6, Null count: 4
+    * Null bitmap buffer:
+
+      | Byte 0 (validity bitmap) | Bytes 1-7   |
+      |--------------------------|-------------|
+      | 00010001                 | 0 (padding) |
+
+    * Value buffer:
+
+      | Bytes 0-3 | Bytes 4-7   | Bytes 8-11  | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 |
+      |-----------|-------------|-------------|-------------|-------------|-------------|
+      | 5         | unspecified | unspecified | unspecified | 4           | unspecified |
+
+  * u1 (float):
+    * Length: 6, Null count: 4
+    * Null bitmap buffer:
+
+      | Byte 0 (validity bitmap) | Bytes 1-7   |
+      |--------------------------|-------------|
+      | 00001010                 | 0 (padding) |
+
+    * Value buffer:
+
+      | Bytes 0-3   | Bytes 4-7 | Bytes 8-11  | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 |
+      |-------------|-----------|-------------|-------------|-------------|-------------|
+      | unspecified | 1.2       | unspecified | 3.4         | unspecified | unspecified |
+
+  * u2 (`List<char>`):
+    * Length: 6, Null count: 4
+    * Null bitmap buffer:
+
+      | Byte 0 (validity bitmap) | Bytes 1-7   |
+      |--------------------------|-------------|
+      | 00100100                 | 0 (padding) |
+
+    * Offsets buffer (int32)
+
+      | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 |
+      |-----------|-----------|------------|-------------|-------------|-------------|-------------|
+      | 0         | 0         | 0          | 3           | 3           | 3           | 7           |
+
+    * Values array (char array):
+      * Length: 7, Null count: 0
+      * Null bitmap buffer: Not required
+
+        | Bytes 0-7 |
+        |-----------|
+        | joemark   |
+```

 Note that nested types in a sparse union must be internally consistent
-(e.g. see the List in the diagram), i.e. random access at any index j yields
-the correct value.
+(e.g. see the List in the diagram), i.e. random access at any index j
+on any child array will not cause an error.
+In other words, the array for the nested type must be valid if it is
+reinterpreted as a non-nested array.
+
+Similar to structs, a particular child array may have a non-null slot
+even if the null bitmap of the parent union array indicates the slot is
+null. Additionally, a child array may have a non-null slot even if
+the types array indicates that a slot contains a different type at the index.
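In the sparse case every child spans the full length of the union, so slot resolution needs only the types buffer. A standalone sketch of the example above (again simplified; validity bitmaps are replaced by the convention that only the slot named in the types buffer is meaningful):

```
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Sparse union example from above: [{u0=5}, {u1=1.2}, {u2='joe'},
  // {u1=3.4}, {u0=4}, {u2='mark'}]. Each child array has the full
  // length of the union; unspecified slots hold placeholder values.
  std::vector<int16_t>     types = {0, 1, 2, 1, 0, 2};
  std::vector<int32_t>     u0    = {5, 0, 0, 0, 4, 0};
  std::vector<float>       u1    = {0, 1.2f, 0, 3.4f, 0, 0};
  std::vector<std::string> u2    = {"", "", "joe", "", "", "mark"};

  for (size_t slot = 0; slot < types.size(); ++slot) {
    // One types-buffer lookup selects which full-length child to read.
    switch (types[slot]) {
      case 0: std::cout << "slot " << slot << ": u0=" << u0[slot] << std::endl; break;
      case 1: std::cout << "slot " << slot << ": u1=" << u1[slot] << std::endl; break;
      case 2: std::cout << "slot " << slot << ": u2=" << u2[slot] << std::endl; break;
    }
  }
  return 0;
}
```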
 ## References

 Drill docs https://drill.apache.org/docs/value-vectors/

-[1]: https://en.wikipedia.org/wiki/Bit_numbering
\ No newline at end of file
+[1]: https://en.wikipedia.org/wiki/Bit_numbering
diff --git a/format/diagrams/layout-dense-union.png b/format/diagrams/layout-dense-union.png
deleted file mode 100644
index 5f1f3811bf0056defe19abf494afcaba1bedbb77..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data for the deleted layout-dense-union.png diagram elided]
diff --git a/format/diagrams/layout-list-of-list.png b/format/diagrams/layout-list-of-list.png
deleted file mode 100644
index 5bc00784641ab3cb3c3275ce1e52e7ccd3b2ec37..0000000000000000000000000000000000000000
GIT binary patch
[binary patch data for the deleted layout-list-of-list.png diagram elided]
z?oXNPop3FD@yQ^Vh4JF68Q8t?s|Bw8EXd|@j`b(NCs_o5TyHq%-=Ja$aSh|rph%SJ zM@PzR0R_)5FW@?`xjhPnvXIMswEOzZjxnF2Wjm~+T&t$%Oi{*A_8w?E5Px2hhV7Y*&}V~L+nZGcct4gc33ag6(@E>@NeXM zvTzKi8y6gFomO#pXr8y7^XKkpUx@&`3OkwDPnzm><`)b-j|>*%&j6<|Dl?=$#DIb# zMJptdB)E7jn|IMMhC|>QLFY=XIe1Q9j%}ESUcx>hgSwvCtTzb%xG!i_$CNK%s9{67HoFvVj<)PuDS z9%#cVF~Pe5w9bnjgfAhQT_4}CS8k-LuTu3TnbJm5oq?um+fj!@PlKwcth=ZLZkbpJ zWgi)~)_*a5Xuau%{EesY`^ukB@SH#C^?m&%yoM2$^()a9ZtMv_rq#=8@tsl=g;(fi zjt<1xj~oz0(ePNL!ArVOwrOB0GgxZE2N52&W8aGrl*XQQKHAcxLrQ87b3qA}O|kB; z{^GxZCcO-hDDmV9jDz|Vb>OF7w>S~Tltzx8!uDxl7an27qyelrKS&W};tehC{h6ah z7Y<>Gr5OxiIfql+(*T~Gh;ndutRDc?-%#W{mry}QV3aC9NcLJHe!1VMOZah6Y^o3i zOBU__dZS2E1@5X(NY~Pt95MNmO_a%}5Y(-17@{IY=dCiOQcbb>oA7RG(nfrM)ELdU z=Q(fRM$Nm**wMw9nKh3z`|b_E{q%gASeKvQwx#_8lA~Ei(7%?lv2<}=a9*U+TvhtO zXA)9)(}oK-mc`}t{C!~`^N3&S>#F5cK?wQ871n0Q?hf*re$Z>G_m$sQwUz)zSDbm4 z(~b;)kbOaY*Hw&Zs^Yx2qED>t8W%IfzWo?#eQjO+nU&eMtqz9GX+9d8?yxO=29Wu) z=60KB?&-9a4X;VoWyZ85EIEnoQ&FH-S2Wc@o7p_{Yk05-yvla15rGmh!A(hhbqS*q zVEfSkrI)z20juxYF^rlpKJat*^rGT$k-(|#BRB1xl2J{QP%Z+Fu@FTpT|}J zRj$%O|MoqqUqS(lYWT?zFjwjs-}dE?Ab#_bMwxw=jbeRFE4yS)_YL`Y*GVFpA=7Th zLx&@BrmtI6MD){m8*J8}+pis@;C@6%cb#{XEm-&9u~{blM#J&QL82ew^{)ri(rDeS zicI^Y#OM09@$}t1A_;A3-|T6TrW-wBU7r8Sv}w|?RCoyM{?PRV50c&=F{?7v!iY|b zbPGWHz&jN}=l>WD4brjzSGF`|4jy0aCp~2l4(ad=_E{$Cu!z7jZ6O6qaX)DrYXf!Q zrt;Pl$Ktrq?$xZOq>II^77O~nHL&f#DYCe?i;F7z>}G0(W5WbPkU==9j6TV)%STIo zd0^9eSYm5IrR3TFJF=n$>@gUq5Yi{whC;SiyjW7L1?MiiF)&ii!?Ss@XtJ6ftHwx40v@w1VBMhGOqsCm>%_RxxLk7L zIRk@eWHE4<*ynCrWSUQ6>M?Fy`e9sg2)kVz#3CMT-vtROr--t9g{jeoAwuZpi(d`b za5F(W{?zxQ|8XH|Jr6Co^3o?l`$2Fka5sNQ&7^%MNwXre^%2^Dvr~#{jwV; zj5PeEL3slmL(Q&4%A{jBooh|5fzg=O=XUMnTYfJfdx-@hvb39Alz*rcW%}*vGXRRB z%PVCICS=A~q$JZDxp|LOn0lIGrz-)>LS@7`uZXUi(9v#5%53A%kXqd+B6cF4?D52l zbz;k^FBgW;tcmQVF6G?rI=3zw-YtVXb@nZZk2s*Rb<>V028*` zrs9jr!+zjAN}B|{h%D-3z~kqZxYPp8Gyj-zAjd3xJ6r{G5QdlcQ8H^fAeHkT`wric%?(#%~wRoH&tv4TV=Oe-1 zw>RqA(|$kmU1t;iCGr!ivUel7PXzBZQu-3OZYwkt{%x}05rn@btF$yjoJgBzMJOaJT5gT1xq#0;B_P7QC zfo}8iEnLtZ(iQC16C-J%O6z_&O&mcIwjJqOYIS-|(y8>;J0=vvoRY_VDmGrp|7e?t z0r?w@`AC8|#BYTA+oa-q^M=`Ix|x$|k0hgT@=pN5J$4P63rXo+{=QPo;9pv2lWD}S zzt)w7nc!-b$6$0Tz05Z(O;w})L(wsbnMu8EQDe6)c|lsqOkp9=I(Az|Pb=OGz7L!l z_G9t-aZrwqGIvkLFbcl;bV{0v?{(FtO4yuSM_p=eT-yl`tCY2oECw~Y{j6S=81^Sb z2_)}60O7W^OiwqIZ##ufgduQ7w#Dc)Ht=Ez+nK=rb-x|^SU&1Ie2ltS(WMj~$%}MZ zCgpc~@{g_~9#+39?5ei4Fe$Hnm{Pt*cK{BAsP+_G+8=1t0OvF#%kc&36Re2m`tzP` z@heXDQK=2srrEYNY-J85V*B9@@eHlL_M2(xH6LI6n9XrR1H>v<_CT9y?wfDIN;ARH z1|h4#-ENqX*(QTHvmL2Wk0wjZi>#7f8i#UT98zRFpeVALS3mkAgtChtEFv2hLzn~u zGc-CUyw99E-C11LEi0-Z4tG4|gqwb-KMKg|ovnjx^IW2oz~`DqQ^hF{tf4 zaaWdmE(cF`MjrD}sT$k7%Jnk|os`ciF?BCQa}O6vNa6-n;v%s?I~%HNm*gJO&VUb`P@Cxm4c`VR5w&`LQAtyku8r zAG}845i>+>+`_RMYgRcW@~C`8C+;K-7pN(hnr{MZC}?+%d?*oD_Q5RIr>T(u8fj8` z*(!wXW-F4R>At~|6{n+=r`~8R9GAR{ha!sv^DD}H^>{|)$V87%Ny2rh*!Im9-&J~{ za+*FP!d63LLl5W@W#Ba(oegg3Ts-1jZtsRWQ;SJ=Wk}=WtTz6;lg~?_Qit5#AjxU< z;!a-P#$XvHxoCE9Z@)#XgYhu_(A5Qg^*}nNfL~jd1srWRRG(A8+z3QW%rDuc@O1i& zO5LbC0&2Q+baN=YcF9U?&mY4jHlQbnVK^r+2GG*((_4hR1Bg2g>!6bGIZ*dMLF!KU z>`#f2WqN);nj6KC0OP|GN3s{hE^NC?=^>TZggRNyYcQsNV_`(Xz}Cw9O4wHmWmF!4 zw4fsoFUTsT572ILUtLH~=De3qdC!)S>&I&9#t(L@Pdk%#lZBfk8STom%vfV$Aq~>% z{rI|<-fG_w47^(&$0s7bdhDreN;x<>jzbT`V{09kH)E!OX%FLWHa(f8y6-FnFEYgo zQ2HJu?bQ}%s93IYWDPTAvWPzm^0&OAyQj$!@L-F@w*5}ZBJZm;sJfo6)74EesN0hYZ~MD9zU))c!{PIX!>#IjeugmRxx`b1ecwYQljY zKdQH4LpGTsAqrfiZpN*=QI2UPoT;V+v)r*HYjm>kXCeM`-@BjMdk z*WQfqC(m^1>?luTa6qMZ5ZX{-zDF=%9=so6Mlggyr-ilGQ&0|t=0K1!SE5BAfZZO= z^u#IRP;x>+lXr9-zdFMI$SuR!_|4*>lR18Iyv4g2h*XP(@_PIXB0;jXajCdEeT1~> zqi@2 z3wqYeWLBAm3yeYCAmFil>+0_p(tUps;6alQ3q|aYVe=cWHXZ&61j+G8He>cy{QFr> 
zu#l9ANCf^>31or(o-8P<#3X==h$@h03Sk@vODcZ_1;9|$Na)*9ysinCskbZc+tKYh z?K^mw`QTvPBxx=lO@-+*EV>i7uTo%(-&r}pe&UGi)oBEHDY8reoiVnA^|eRi|$xRRMV2u-p;dWXUUAjLcql95zoD8P3{ZVbSvOk$4Yr0vGh* zllu$H85+TE*qv{g(224bFPc?>T3GW)gW91~#g5vR%~Zu8@```8*yzFjz6J<^T(acA z+Jo84YK-xZUgS5*33y7205m-@A^w{}_`Pra_Mu+`b+gE$lKKlMtwW2$&CZTJdtv4N zdkNqIfZ~jRLed}i{9%nLfz6MH=wH76Uo1U<74Sq$0}81AfX2a5hU5Sz#+CnHwcvlM zWdS$=a*(15|Ddt|zQ$z(Hb3!pR{N`3(}2=F`&C52OU3_t5%T~oSbR9B`m0(+FOu}v z=%W9~eE)+py8bz|V9+jeJ=D#$X)~8dpJ& zc~vLIlk?*6B+&(2Js#36S1I<25JQ5?hO_u9I5WWwakQr0s5JjkeSvV)I9#0q0?OKd zIOTsD8zBjpkLj?&_K(^Jppp}_wTke_8NLJn|GCg#@I=FDwbeguN-hOF9`2gehxrGI zK}7)=ufM3e_0NK&l7PoiPdUTs{s7A5Wgtv`{z%^J4SjJJ0HDtbphY#CtQP@@3y5b~ z(9Y$VRE@~LZh*EkL#4Db;8moyeTZTOgh;jIej>6>8Ohr1K7d zjCFx1Lc1Mem<-wSlvH`kxJgYhTp$XnL?-l!PEfdr8*i9}Geo?+>^SDi{N#>)PF}xem zY1jv}_zFV60h1)hB#w0`p-P=#sVueO#&7Xz?eBPr10JsT0o~~-0IpM)ytYdLpm9R) zi|(~F{GMIzq4y6viK}kw1jGWPcg98liV6%mXQYc{QK)x`0v_d4fHawZ^$Q?&SE&$A zq*dM;!6&zDeEYjmhH9cG;B?M?fj2;+HS@w~Oj>mT3voh^9YAl1C0(1WIt1u!J%1dQ z&RAl?yy8+Uo1(J%Y)+6rY&p|k*0LJbLN3{b4kG0ai`n3Kis|yVP!2muem>MadF4^vZ5hFxA zyCnLlPoYS04FcI6Filx8ptJw_e2cdZ1_*AG&&z225sP)9*EeTMKa^s;fT&U2&SggN zPrw{1mL@*5ya8f2uUQ50+N>kwdK&2ZZ*OCHd#+}VDyq-6t_cJv*8seBAeQL-T?R0c z0J^3Vm?BXzWtPAK(lN>ZXpIjpIr-+*;={%-H$d?J} zq9()R9EBveXk=3rV1qT=V3Ro`lt(h{Ab5j-vjwjjFSLoAzX<{}!4o(YQY04IZ~H-{ zQnPWX*^Y?4dipm600Bp)8NKIb1?-XjOB9*bfdp8CeYHk|{J);A)tMRPj2ca<-J)^X zB)az60m0PbCEG92s6I#-Bi-=n2nx>m{KHq9(T6V(TZjyu!tqH!$fW7FryJIWlx2j6Y4*lMg>7j3W8#!g``fP-M5MabH} z214MPlz!R{*-Z8%?;oA%3rD&s9+9@}YOKa{ir33b{L7}fT@qNL9f2Kqx^-yAQ* z&R-JT1Etq&GyxnNfCW2fz&8}XpTFdsW3kpLI~1&WVsL;RrAR1~QT~k|0xEfNcRJ z*sZmVl1T3}Fzvf$VhY;XXH+VI%>oR3!awq=7E_FxL`nBn`x;v0p~dpD&ohzl`>c{G z!@3owA0YdMF?gY^68M!3X?|t9t<_#VEM7|`u#ch00fzx-4DDp)2sybF(mp!)W zo`4ub6NBow%uPpzN>aNj3>=NtkVJAcX3KEZxYEgQLCcO0ABsiN>isY4#F3;fJR(M> zP9lDrVYuD_YP$eTvAf2_x5F!MxXOop?pF6v>&>^}hfxKx9W`Ksllw9`Y7MM>MJ`)i zj>WhDbf00jro`BUnM01w6&Ue5^ppId^QA~Nz>%Y#%w1rGB4zU7@VY$#sJWR47eB67 zNWB3Zdu5{U4%*#w(tkbLLAd5OUP z@DepaytHVhYzeL-*YgIJ|NNmr`o_!&0(?by0C!ny*BAIr+&qJmd)#Wf>=TKR*qi$($Wz(`l>sh{Xe{aM!!xd zE4BJu`^t$BEz@`NZrTSC`XdJ*!-XgD`P8S%AH(+~r$j4^Tb3rmWbp)ui)-0y_4r-n z`jx+ZL`EREM`wz=;8-=yvcj<*LrQZY^QbJ-NnS&Px>QsbqrzBqwXe!g(%Qkh^{ddJvxZ6(+yc%1|kexX6`P@Ta`&iIbbV^F+ zpNvaGJ(uk;fYYhf%>d{joun9x(_4{W@wDRbhnYR*(ZIL0c&S^P!OA)@VS-Fjb+mtM zfICGX@DKGeGwmk(_U(KqRF~Ws3}b+wQ3}7T6|LAqwBDL30*=oa<;dpfeP$QS+@0Bw zy)I^z+-T~7r%|Sn{A9Y7bkKNAwD>?)KMWTW?N3&o-Zj!-u~8YH-@07`l~-&~*DOS$%`)-bc4J*gpd zdjdc~$npC4T~f`GWrfFEI3t7_+!@UopYR#o*6#vIzIG5dO91Pb+w;)mEFDIt*-&#- z_tdX7pe2X2>A#+vpO$tN0Rov^%v?zI8-H}yjf_MCWZzmmr!L!lwoH$8O)MO);lj{8 zQmEzWatW{5L10`TI5(lN64~RvnwC^f;8|dIFzuqyFSwF_bu%Q%)n;3Y*;;p=k)62> zey#}sdlNAWCCF-&N~3Q`%V)~QdiJ8WP03fbNi#eR4JOjjF7P4prK9Dk_onoRhWLpX z$Jz4RP5}3_tL=4y1NcjrWh{o!8jg}$LdaS-#X4`YUyG>7VLhnew(l6;rGV)(h^%dV_GJ)LvWX;J~Lb5Szc+E7?wsvY+A!9{D% zwyZ&p5r3>P83!A#`2MWw###SLCONem0^M|9uPQLrx)QL$FX05?u2&sX@x8+%VTrsG zsbJo#xkZAxUA9sXQbJ)(Qixt__bhh3@Em^qlL3Yu0@`q&G*mk$y{+K3zDK(zqr%=l z<3xlj=_>h|tiWx#cV%MRY7x#TRO@5z#i@qX%BOqMNC0~X6{9jsHkjb-0kCaJCQyS3 z^QWJs*m{3yqIuv$1t_d0=}i3EU;Qi6=`Ab8bT|xVTZ^RIJhP-WTP>syvfM!64)DM{ zC*Ak7TZ%ml_-*48e0SP*7@uz(!iPio$kX|6>-Wbo-r>t9$eXCj8OF%w)$SE1E?Khg zp@l2GeJ@((W)y2c#QA*G@SpDjK>Zp|>%3tZc+C6N_&&CP*MZr=Qipnuffd*AQb@7rE_rWk2+(w&!}YQ?9+$HrHL zV74PfjWPM;xOUNJ)U>1TwNVF_LN7XFwVrF9W$2|p3D`>tVQ5WMU-{n~wvg`$4X!F@MPVGzO z98OgbV|Z7Xxvt4o=I67oSumS05HnDL8R>K+;Z>K}kY+u_y)4r8wwc2}ix1uC0rhorHQx+d%9tM&3_@4Azq-AS~H-phtK zNz0Ev0kv#J&{*HbqCvZwHx1A5she2i6$A_&>mhL&q9Hzh>u9WL?{C8h>sH*=NfaHe z?X7fQMxZC6o%eEbkkDA#meU|<+~u-fKG}66guZ7slyM{vPUzcJ1XOki1iHBCJx{O9 
zG#rPMD#o@h57KztY;VQ5gzavPr_Ps3_^dmCSpU5*1ULd~1(}_7x20`~`54Gm$gm65 zdzjH^nQ7nM?$OpkvWP1V-M#GFxNG3&JyEkB4_8*qQV*V<*cv(^RM<8o$rLK0i8eAG ziqcsu;g2>YOk+qj>f=#?5+0(&)ldl)Vl&n$OvC!pTwP#Iz@d^t;6AoelN4cF9+cEjZd0Fc0>4n9w}MStnOHccRNha=CP=H+&G zoT5{FpVcOF3{o`JV!K*Zye2fLJH0N^@2e9?EpwGw*xL_k{dG_nn{WcevX>89W+R!@ zYZH$~ktAZ^dblfcVFo3DlDt#PqXUs`=q>YRMQ_G%esI%sQpsk$LA9Onf<>cJb&LGQ zjfq|14)pg<1wjmhObbM=zHc}Vio3K@EK2?IfDj{0K%}Crk<5@pqwdX?^XF?PF#qV0hWO&^BDPmUl;#R|dJNR=Riuk=Zi%x{L< zB`%?w`fLe%X$L*Ya(~H)I$M{Al?N8pIR5L9{dSnWy!TwdxBoyP8N96|sQ{)R{LV!%O9)>^%Y)N&_v zh%%gpOM*qmuv1aS*O94-j&RrqlEUw|e~{+>*LL{HY_N}11bH5xtIv^SeO3^4pZfLB zFmJAc*{z~5@z6)^QC))2nakwhi>F)Ui5$>bofQ;=*pbkbWF|j{Ok`WxgO$h4GY0nS%L0@3@lV~*Dj+aw9YL=ENT>1 zE%e~~F_%4}H|$G>Qi{OWO=wHm`4{6Ii&!xroTaQiGGEjrxQ~8t1mWfc1)s5z^PZ&F zEq|e3+dc(Ir)!%HB#NH>=m1Muqj|IQ-H9SGo;!`{LU$8O=}+z{8(AbN`JS#0Atp6)M zerg#A?&JoD#Bh2Lswj=^PgkS##SF+r>GTL+d7^0Wl%GrvF$yvCV#^pHCzV*T$)iggT994|eXvaip}O>Op- zf;s2?Si&xFzQNmcTc+W&=#%Ia7XXdm46{Rz6p1Cq<7DiP6gZk#m@a8q3-!|<%Qbbo zKqXH-ZVk+8d#DtaA-S+TTIQ$~S*COD&JhH5lYwWa$ue}(<*XTUVc}Q5Up71Rj4*rc z=j_ecP8}2qKxF=9X#@jR>MPS}tu^HXl_$H(%QAMZCkOz>(B45g6kIxNaW}eGn64{1 z{ZY24p;_wxYVSRuqFTN^VQEppL>2`J5=8_=lH^til9PaB5tSr4v}9UDP?CsA59ZLA-nKmb^;c4gZS=bOCODGy=zYdv5Gz!z2TBZk09|v-~N8AN6z|(F=1O{HnZ}HYkUT5 zu5yn}ZTBO>HNGvod`JxkmMc5zV-hII>z2PC!sJgt9)0X|LP=T4qIwb4*5~n2&f_OH zgYDAT@Uk)_%uj_rmW_#I3YS%L6(e}qR~5|oygm=@NG-%WB`DAHJk66WeBm?vl)Uc8 z$DQYGs|c!JVzgHuiv1jCJwt7@iQhtcgZZnCF-&@PMNxlf9DxqlaG&2b&yV`AB0P)rE#WPM8-16?ya(|z3Kvsb+L{a&i!SKIR2K7*G zXUmO=@bByZsWiwVj1)@1{!Xnh#ej8>Z3HtNp2$5+1Kqm}k_)eANC}z#$|JmW1v!`y zjPyI@zjHV?AYa6qCdc&lpCpDad>JIH6y51${*NtmJY8x7)**-FIAP8N3O7h^o~KVEvziUlOyX$#cXTrP`` z%i|7b#>&V%EqkAKe|-CTn0e7V@wP`-40yKY{3R={kiDu^RTMitHIQv2_C`J2iTTFk zs;c4-bP0>iG!YW@X~hi0jL6JNpPC2S#gcpr`L~NteB%XdBFm_}uSgTx6>s);o2!F! zHk`*EtN-1vO4$++I!q_%o9N&Bd;%1(6oMqy-(8*H1tdxDIbLP1_RofAy&~Fqx;bgU zXWC`Iw$^2q_e(aDJmt={59PC3JU$+L&o(t<1hmq6r_EElEyvZbUGS#j{JrJO5Vy2) zx@;r}aEcUmf9kcmaF9EK`Ll>FE-UNM$oN> z(Mr7Eu&Fa{H9m?uYAh_W%2vXOT;laFbKba5)Fmbo7CG~qnMSQ2^RRt;7!p3SvD-z@ z`0Tc`rB9Z&8?N!i^Y%6eC@%ZHX)DZd;&ZMf^$Ywm8glebP+t_(B!{%X_h?){SLv(sCF@c% zf;15#seL(vt?F&-lf6vW8W ztcyQcV{XSvcc%taf+BIt;+az8QzaGS-IzV)Ve1apLpHFoU(>r`xoW^<);)}&;Q+mm z$~GeUcLoP)KOCWE3H`{n5>bB0wsD5|8BGF+45kt!z9vRpTRh zt$rC))GoNz-4#Grj?cA-KFD(~5zKm;h054geSZgkpYg_gx;P=fX`g>tQb#Ra>&oXM z%V$4ySNiE_mGj)a1hb44!bPUavbPQJuSJyS6Ox&Fv_GLL+xz)8713{nr8L0Uj#uXy z*s%qV)^HUj1VLJoJQ%3l>0zZPkx+?qRD8}0<9X)?Z1cFp_lsBfBSn|j?V|Y6)>GPY zcIMmfO*_Nl$Qc&tkjZKnnu@4+&2|T=x#;p&J{Y*K+e|j)85$cV1_NCMb@B{)n-@_+uR^qPt%? 
zK16a`Y;#cR?et}Fbtb8dZk|yR$Br>R@G$0Z|Ap~ey)sy%>c$3k0Y_re_r2PGs@<|+ zyP{j))e+@8?SQ&19kaXO8?h>o{G7VM!*s0-@E7t!Q)K(Tbj&5|a@rz&C--!r9CJHC z$6fsWzSK8g<7*D24_$}`jSw>CE zshlR+41#A};PBXaf)56c>*%?!xPgHP7M!Rd%GazT-!ZBXm(Yo0P)6ZvnDs}jFXZhs z3+THekN@nYdD*S)rVgnri2UBrQ!@pua!e2nR8d`}OZ7$69PK?%Hl49yFEt-z2KcGK zfvJGIu=G;ReD@f2Q{p!(_HS5T%TOXrZ*^pb=DB|hnaV#mV77JPmXuNye~F>jcI!p) zr3LqvlDbTU^Pa4C&^bN>{C9A8DXMd$;)7`6JCo`6Y+$=rztVPC9SajB>OoT{XAs<% zd)P4N{PrZ#)X8(yPGm&RxpZNo8zSWgzuiPe@R>*33~92%W9-`1Irlu>-7qdj)_W(<70ngqL4zcQJVg;DGu!OXFKyOO z>e|MoZQM<0`nbd+lA#?xDi5;s~ z?yu!m`H0XJ$u!6}b)W75@ZZSt8Zm1>d6nyUqB>pK#zZgc<(n>Djh)@vRylm#mzK~V zx4=yvhBcqZHYC%KM5cxzUGTI$ro3D9S-c6Ux3FBt5pA}cdT61XneLRyM+_%dZ!Y>C zJM*bTH)h<{LKEpDeZCVE9GTfGFEf_z4M((d?(e4zuS_wyGDiq4&B_U-q1l`x6Qo=&)htU+ zaVQUa;`_V$o7tB;d$ER4(R@-TRsArNr3)OHijVc2cdw7-RNoLD!%4xxJA(G&Rqz_y>K_e+K7ZZ+xa<%>1XM29Ezu9+>TS zF48~#1>`|>Auk>Mt4RN~EAp2ufXq{%00?2-Skq|5We5bDR_^%t9zcZ!ceB z(W5Yu_?sy@sFyXAi6>qhr`)?{w8ZR*#Ua9 zUyX^EBEs#aU_7=`)%&|HNCMG3vpE`B^Pt7*|A>Os(TWVW^+x!TR6a15CEIXx#MCsmIYp z>9X)=V7MFx9kZT9@;l)=^9<(1u@h&~O#2P1Ir}a3f7|b@r_HyX15dea5lQ35Nb$1C{UV*|xeYAnp&0N}>746q;Hd;DO(kWv3#`hC zmL+5@h0h0zoq%E;!jN>@fsz`t^R$a&MjL2OK4%;4!fz%po~O;zZ5A7*?THJ6Mzx@w z<~NKT*L~jee4tkky&({QQ`2uPcVD78Q_`xFJC^iG?cnFXwxC)FWT$A|p0TPfA5U1@qQndW-F za9$Xd3Tl%L(Mx)3s_t$>u7%l0@S?BDT9F7r$Du_-gAlv%z?89%2E7swZD5sUR8m#H zQl^_0{p?VozuAW=6zsXK^V7cf$x7&*i-H?Mk8rxn>_G(A6(9C>uqlb!a@n}h*pH1M z?2qaT-$L?!7tsj78K>}4@sd%EW#Qfj9sBOjr$6Ie^9m}DBXQ^=9K~(hwWsD`;>-~k zJ-wsaLMEtM&3f;L^>9UsVhOh7r&X?g!5lL}PY1Lu=5OurRu??Ia;@pyr0&wlXiM(1q6(T8(Vke9O6%}YZG2UN0qdz=F0qvd z4P6!s&o>KziA)&KeECOQ;CBeIUjk!kygC|Z?hf~!=^{@XSW*V+B${sn@|2rP20 z@4GU}qytpbAs+!fpyx&uyQJGc(}w33Gt!Ach-@ggUuaN|k!F*i=YbX%$8&OZrr$B) zJhoLK()0-<&vkV2a~N0Yc8#%t+vp9~#d_s^n|haX2~xt>p;E-(=!M7)K>&g%HU3m3 zh)K)Rvva~}eUk~2KA0&+7#`u-J*=L+32D%lQO97v)oeqjduO0q^yOKbLXRsLVno~|=r6#itxB8Tl8_m($$I}bXkfv9p683%FF17` zwwgRF9#yV@yjkl>8T-GD$0HER(7V3;?eE&r*q6Y}a1J+zVE?u#)eOMzeK4l}FK#m- zMDL0yR@C1?k2lcyo6Nct{l6bX3VM7Oz7$r9gDS87`~eFEwU@z~C4WDN88WAYh-eAm zI{Yq?1(Bu;;N`0s(w+Z)knBfrJL$a0>pu=OG>!6gnEDw?PJdOzpGLm_mM{aCx&5-R zHtCVvfh98q!kemO-qRsp$zP0)d0})l$F4n)`+Xe4X}O1WzOTJgZ%9}~3G)&LYs*N| z$KcnTr;BJAVGKnVkDd%(X3qkurJC$5U*U+4=dmLKfWN$i^pXhA_|dC^za%{eLdxz= zK=6l9&<$|tVkM8-`G2TFjC{kY%%*de(Qy?~S$b(_(jeLUI;L>^o4L8y!)q6-@82CN zO7{t%;Su6PEd$_FSksrh^yaAZ^gSv-RHXA4j3`mra-(0e3_{SP#m=hmI-fqsr$)ic zeCFMs4DlqJI`;0w+NBo@Lf4h+@uTT>`(nE>yCabcX5HF_4JV_Tg{FE~8hFzl)0-86 zmEM8cM4(6j;Er$lWl?geFFo_yhN>Y6fn@BY{;6EkydMH`JDpvN`qw~v3G3+-VWLa3 zPW1|J9N-13*YB45?T0)P+-@3t&@ zR=mZp6@vXra9;@7Jd>X1Slf9X<6(8&rYk{n!|jJh+2qN>kFuw}90VVN0LP4EYBn7* zTqj8nA}@N72^3bnqS~9=5O-#zfGU2Gam1}MG?G8lWDnWR_R0;9eb|rA6di%E#$N}^ z$s~OXmeU+lMy-;lkFIIpZ2#ia?tE_Ar!E;REU>(Vu{2#R-F@`Q2aC3Bt&)@H+L={8 z5%_{u<2~keYZvtRwXbU&^cz=(TBF=mE?bM6(VccEdX~{0ykk|=5C|ax0=c2C6fS$M zE_6nN{wbOqi#$DD03+yOVlX<7_3FJK(VV628Ar4&4~FBwM!<)G=$@}~ZJ6)RH7kP2 zOg8TskMt~6ie28WmiF5ZpyRQ`7!jSn|C02zr%(9@y`nrbbf7P2*(`rq+_Hz2&UI-@ zdY9T`kYb0{@e}Lk7vNDi(oM@MrBgA_w`~vIdc?XX*8?NwH8B6;1jlrer zP8@-ooO$bP$aTZTQorbB z5Vd*qXf;lk5X6?|x-p1;2k;l;G70-d?7(M2=9IFt>1$yo`9C!>JcH?kT}`g@?QgE6 z7tuMNl@#sHQ}PMzt5$uw5=gBaSR-})#u?>%1!4^#R1Q1)+E*PMn$|gMH{}f)8Z6gk zDVk(IGb;5c+kX6jQ}SvM!E=J^R4S}7Xp`_KlmSVGG0yW>ufiBIs))q*e%$j(`!x>a($x zO5!??Y!HbS{QVMU_JV6Upixl*1T16qg)L%?sCX?Olo@($txN>$&k-q3(`P77u5snB z{eo|W-owAZYTZZTB4E;y@9y;qY}G>9xt-NYNk%iTW=dVV{t-e4W*CJ601_u{EL6Xn z1e$@vm~eMU6I8A3aFs>z4qM4-+a)Jx=$BL?S!puSmgr}Z#<2{b?~Dk3DQ==c#zQH? 
z7t9yhNhC5oHs?J1m1G?GCH4>0g1-I$N4I-m*~w}ZK5|<~?Y&wHvuc0S#EJaD7EJJ% zz7^Nh!lHOlz-T*rdySg2><)E6(%Dbse+u*KAqAq0=}0vQ@<6`377WhEDCgcA=UW?+Z-Krmb+nmpufpy}b~AnF*Fupo#TdS0 zzW#W_RD$B2<7(kM91TTDK>hXGO0qY166_PM{Z5VqUpTY1oF@g^uZV8B^~OsFjh$ip z7hmr(68v2B_1aq=lrm(E_3Z_B$ttu(Z_rbfSUx1b^>D2AJe~HC zvzKN!FqceSA*7!k*r*+P`q$=72vuFKk2PSZXz&yM^~(+LtB~>>R(BbZUa76F?3+fo9y{s)W(8aHpbULsyUn|@rmC=q-F8XB_EoO z?^AK3CaVYG_z~#jithfmk~@UDHq9~%#r!6nf#wVO4f-kRXw$J4@zRxPKf02g8Yfn| zL90cdChhF+%A97o1Yd|Zt+A89ed=>=Z$5K@b9b@5W!`xarHOG~=uD(bGf3B$)s{Q} z4Yvug1K1ZMoTy)q1a&ec`lGMqqI_*FS3&UH*)o@9Tx7rP6nm~g=eLid<`bMcX*^Hb zFku7M0)-A{B|nYqEwjTyrP9Nsu2f2it=q+K2u@e!iC=6>+~_Q_@8^7EXK!gQ;d!E2 zKdr^KAUCDFlF?`S(Uo+LSAuUfFOc|3JZ!5X9JBhNYnOP}POC@}9V+}?@u6vfv~*lb z;}gY)dGo@p6;nJ<-#ZC+iU6~#3guSFon>x*pafBy^w-p@AZUD!cJYh(rT%l`xG^v7 z95Y1kkCcOeS-JF(pVcd}c87*0*QEoPj09l-EBoQ_lp0yq@;hzOr!L!QE;Mk7u54PB z?e1ofHE%AaYFd>}!}2e?KS~#J7+=zwVw9NMAh{@DII>75B_Aoa7eyrQKH>dSK+oN9 zWjx|fx6#u|VCTWaQX~&X-Wn;5KPz&vESwIPUT|4oIt5OYl!qfge7$F)fBvcv&>~{6 zyRXg*Pp{ftx9wbK=X<7%34@10I!2isG#9#h+EM))Ozv6F6ThxG4H&)gcCPNVTC0pN4b@=t2P$ddB^vyL255l zqNEJ)=gE)>AHMbGI;4PZ-upaO`d@k9KmZ?E0mV@-Hh+`E-;XJ;T`1!-pYV4OnEg@N z>{8gHkT>WaRzftgyeDAb7z{aMn*{!s*J>vXJcx666M+1^4wE*6OS< zIafh@0`d$&1Vpwz&UVx#KBG9en6( z(E^>YJBr$G^Ll%dN?w{yl;bRkg?3K%#XU%IAQ?zoFH|yAaDn5UHkDlAT^g)ODBmB| z#2Bp6`rHzDuMSi-J3m^&u?otIacNiSji{+tI8iJIz}&gxG$dyD!}3q6yIF z3b^wsz;%`56_9)?aT^BVGdjHHZ?uGJSimVqe*l5*|0@JCv#-)m@}12*oAcw!6W+Ka zRP4u_Qwu&j12z<&n%!r35(I$gqa>2Ft&6nu}! z{@>rEhc;L+<&O=%YK2FT6jt&!;FSPNxS?oh7Z+7YHC2qW>Kn;Y-5HMGme&T9C1`=V z0Mvh37huMs$N&lm65xk|;%iWe(TG+QD_6mnF5k-(CnSb&gg`msdU#rJ<^wN*t5`C%d~2arlC^`KgokfuQd&@LFdth=wArY^@$*y!g&fjEaPoKuAYlwT z$NQ=DEmP>7hX4|^bzI=kT;#h*NA#%22I~hN0rFzCJL0(g+Pmo@Wtt%Km~2gMRtmN_ z4%(vohV2)Z#rfgTzcu1$jZ*$*VPq~y*`r-%AJktyNN(YS zt%)jE488}9bV&gJXYV~amxmSkWx8s{@-gcTz6%APcx{Rg^H!hO({%E!U(axgX>G2|GgQaG3FMSWw zW5Esl7F{{DmVD+NLrkXn)eOpjABaElAm#8%jX>LpK#SFV7F)Zt{vy&kuUW$0&@{;E=>`9 z(jI(F?P&TXAa?MZqHMPxFkewP)PYR?;j=e^liG!yee0l!zbqFEd@7Bfv%Og$p^t`8 zj;?vhu10d3WUK@MuVv3rD&C%J#e6nym_vtm#%`2PNVHEHL=5R+-XMHbxY(#3Pw?f6 z^K1>L{IfH(ii2>XO>7(hBPni7p?1KSmKeCXM;@HLvn2 zml)1U{<+|TAIKk+=Uu#zUyv*1c{q^1^%@>`q56*rX87+bS2H{>>J3fwQ5~$89_|TG z!LPz<)JKHFe}Tl>hS!2{p^mri9dK=q|p$+fA`M?Fp0;bkAD1YKj>jxC5r=UL1Mg*n+#+2cWKejruMwg zEEJ4R6xcmoSS>*Nq~$mY7(2f?q<$TsW3%jF;W+8@l{HPSkBenirTgEi{U|*i?I@*Q zm6C256kF{!v)5L;!RojvdoLiGwI^H zJ!COk9Z9*lL>`7YYqMI{@FSVw^zuj|oF-f2_I*kE1y~}j$7(#$V1GK!(_?!AG0_<9 zu;oOiu~m!k;LJ*Ar43)5Ga`CTdpT#+xHe|lt8$~4S*W>Tqnsm)Qk?Xj?oeEohmbWK zSCz;IZBhgYaeO3-=R^6g6YJt*=8+be(=O0lw)Zz=2A!^zQL!{OS1^|jl&u0Tw_31jS{*-xI!3m5R% ze{Y=Vs0fZ~rl$=)_aZ4PnJqRnKOZ0-P69;Yo+{j5QMFwcl8(3E%ZE?V5bvo)uH*;| z7LR>1XZhq&k5CjIjmv7e=XfOxhzxz2rxmw9`-CebPjylc(GFevgM_0#XK8t-f@^;y zmwyA-u?$0&_LOIRph<~~;X$AUti#sXsi7HldOVmn1#osVQggV$4CG1=M=TJ)4IEvG z8O(gHlf?+bw`B!4&uf2u2!4NmZ>U>fH|>XP0%!@$W84(MyR|f9S?G|Zz`y;ozBbIa zT!WC9KpiNQ9G+L7LM_{nqgo#vg`hqY0NosAgzB+;BUoVauQ6_lWU zDBo4h1zE0-7owCy7Czl9sNbS2-WbfDvT8BNco=19&Jr2z6L6WLb0W0uv6m{fen+>h z%0_6PH)`?D>$)OpbVeBxf11=c&W?kOOK=AWUzo`9ew?HnZ4j+4X}md6b2ML;{x8CXZM1l|HsXFW8()kB)GqC*?;MrKXie zccP}O#LdEwQZcarj(zn0 zMD*2Q4-u9EJAdB06aeDxKW*I%sf1eo^?+l}EDa$lRC@4RYg z-WhZ3C%5x)l=OsKp4feDXsMHE4PNFwe5mhrp8ue`&ZR>9)Lco}7MZnxkcX%mH<&<{ zVKGnBVCECQTNtmsTaGy5sGxzBuVIXo&*WQ<+ZW&Jwg3kyPGDuAc*Be|Lqv}Yt!wRQA2JGi?lNY2A zQh4YDTbbbUrKLrbi;3_3M7XSO4wbfyRQrKXF;;LO2KN??l%6NnE;ouudBUB6q$L_# zFhkqn%Y4H=;3sNpZ+kuU2^M=xsK=v)OY)SG!j|0k-IL3>qCaMr;1 z0k6txD7#)_%6+lTWDwugqqB#`t%7l@O{oXt90E9g66(1m@ zS;%PWlB2bORG&P+Lh?Nvve2{+grBTuRnCBB5@nS^1^W~I`_>D`S)-l%tgH(5e7$Nd zOQ%>I0uZEts@g0EVj-`;b0?XF)%5EzBQ@gIaq_nuC3rn2>xz@~>!K23N~U>@l9Bn!qoiWc 
zAc@u#(v;*$VPaS~z7S!G%!F*o+jr$}e|o8E*WU#HN>VC!iJ!5vy#U?;+Au%?P9O!p z&Ca19Op_7+M;`*A$nn}9pHNw5s`fFaOiey|Jv=?d?tU|w)A`OJ9sWF;3S!N^&zjeE z*r_)*Qx1V-^@4RDBWW-~Kxi&kE9j4&h&=|gIML(90Z|M75f-J!;-mH8erHB-{EJit zASh#Z%F$TeY?0u(O|*g|j6 znqNML!0dFqFS~5lb0uYMs(tgi6f}qsk6VaP((ZsR*I>rcc7vY#dx5t$J@0E|5r8KY z0zN8gU%CuE>?NahE!~34a!bo)r#j@`&f1y8YoWyS%G$tL$xh9)0Yk>*hZ^l#4BT?W zq=9TSPit`E!A?*R$DiqY&<|8JJQIJucyhlsW`E_g(FXO+=gOImPX)kj0LV0(Pj)DP z*Ha~w$cxjQ>zv6?r_uhV*MA)kb_Klg6@;_}^d^hQa=i@&=hQ^50|A*HyTAkq%PklJgBu!SjTy{y3p&?r zz}0+f63`+x2aGXyYf^;Nh{EaJ)lVyNm-g4Nj1iw|Aq~yMKGaYivc^jd(Sqr^b-Rbb z`$hk+0Mx=sffi)_Y;qp_a43)O0QWGP=g<8E7OnuW@PEySLpY)KuW*8mUMEfxmNt0d zXPSLq#ICGT!!CYV*-fk4Q73@mUwNh(ee4*Ox{So_`@rAp?CmIWjrPRN?l1$@BlhqEYr5r{!l!WId%Y3=L z)qsoLN`;4c)I-(%s41dOV2|9d%F>n*LQCp=TNB~}&M7VKHv2D51fjQz@b-2jUBX3; z=W~nfB|??bgesm*#3z z=adz#g(tVD|S#Q3IFJrVx+K5X_+K&Rf2d zO2u=5SMSC+k8E1AYh1#EC6Fu*V%xF{Uk9)(?|)|L5_k_E1)Nbc7eTERVzpm_QlYSF ziQs$MlG%=lzC=Srb!%vP#Ur!n17l>$yccSa8eMD{!E*=~6Pzq~b2S*2_v0TJZAwA0 z3()aUE!w4YbSNEkRoPrsQ`*`N#_@XWzgYJQ%AfzL)Bd2J3hWV91MHEdlgJhkQerNA_Lr#b?1SV+@!WI+|K&`5-Z?Ulh#w2XwGyX7Jj8A4hHf{_Hlwjm zSKH`x4tX(g1=ERL^hzY5d{7;GkS+xrDt=f&){ zzkmNG<1E+|4SIU1^kX%ZkZO3FQ!bn}DZyO4|J1nq4#1)md$*pY@)e0;AsaAtX7fwn zKd|RnR%e}pz~JJ9@cj06nVUyYRILP#WWCvCAxP}I(Q1-KUXOi9;aBTJC^J-o;)v&H zt$JPTkp!pBea8wQj0XB!y#Q_ytEulY>Jh~^{1tQUEpxuYszz8 zGo#L(XzgokT9EU%9K6wV>CLt_=X7aXF+Y_KJBgFkw8>e_RYC=*FOvPf z1t@bNfc^emPXqbgdfUOz#nrEcn!rOss&v|Vlh0snfi;qs2Qz(3eiK6iA16Hx4b80rj#2vtn`m>ulXC8RX1$MS$5CvGK64Sa03 z1l^_MzOk8}&SzhDb8-_3YEGTWr;aKupnxKavHD1$Th<#!1D%(~H8KktDSU#|V_l`jG83d8b@;Ye#09oNTxKBI4aXsXnSIsJ zVr9NFBpxG$__`r7R#$m`0Oya}iekyr*cmVZlQ0>XRr(z;qH56%D*daF8%w<|@D}1U zZFXy1We60E_-xRd{^4_cwPVjZS-(}uxO%q|xrdINl_Nl40PD@bZBIiqdwgY4YBJlT z7kPXeq37z?B(^??!3_#pICr0-0d|Vs4In5tJ9fy@!}VZv{U%=6VwoV3uC>|Q$4^O# zmo9HpA9*#ds?ysN8V9-6c3N2|#r>qVDq^H6)&?IN zQW$0eE6A2FkGYuZp9?2rUFP3~IrYd!15Kvokivo)i@7LOElu}psRXwmhj3$5-qv~ra! 
zPu~p^LuJ;cN%n@j0VQVyC{o*`CIqMlLX(XhlR=v`G5~X~psi>%zXd>u&hYeeHR+FT zDijcfUFjDisjb|n#x>)VN_S>3`q8{wnItpk#TUNEoTa+uc^(Y934#MWR!G6mCscG~^D9j2a9)nS^{7q85>z zVAkkD>?{w{RFJmQ%VWq75JSs3DI7Qt-1yDg<0pIqa9zT76Y`r&L?48&%QmXK8r+d9 z!tD!$;Wk|`rCY!JYUev`R%k%8Nz9k==!M&msDl)dWmIbTflNljYv+qxaXL7msSh*f z2aH?B1B(~mq{P>j;Zm*Eu}eP}(y`}PWol_?Xg+yVt2c;Zynt0OiSiV1$Zqaxgsdj9 z($FYf|E=xBNbU_$5vNEF=`;b?xjsQP+;$s^byvs!a*0x5qzV*1gR|og1Xy~16Fhjf z6Vm$(CWhaw=AD4Nfx}*vXiZ=wzfnzHlh_4OTG7becTa&9D}MvxG!*^MUqOoX4yiAx z-TmfPzLRA;^Tp{gw$lP-86dewx=^F-7k2>#o!k11IC&zD!=H7+q ze)r*!Cmu>F+GEM8h~~z8f~Nl7`lgYh$a)=Cp$!$oBo)*h%~7K`_u?2*7eoXLOhOo- zH9ib2O-bjE^(Z?<>tx>(aW}nWGfa3c!|hyxUEeCu*6F3%bW(r__teTGh1xoD55gt9 z%o$538RM1&_BN{JZm!M>+QmF~{-vSgTAb{3js;M`6)3OLg9HKn+u-v2%pECqrQ*G2 z<{1O)YIP7+S^zO5PNyn7@WyQ2fz1bDQ%&qP(h#) zZ&+pY{G0PN^PTR^l3{zbEgnlp)<_a1-RQ>f!<&uJaC-9tU z+#YB$EDuueT}cAT43H!7gQx3TG(E-^KU0eW(F2fM^ckSZIf63y*MQI7ck#*5h5yha z4wUJ8jsFps_`jx1$u(k2N@Qm3&>s`T&MsBv~nRB%=n)pAa(WsJUvX#q%oWT zvPW-;plQ|X9F;fF*8)Qw7@25)H0gKX#(W=3mHXpA{tY`1o(tCiw`eSP z9Tid3WqMwW8l7N^g7zO8LeCDAJ-gG)QIu}%0f)=6;K`DYUv(c9euXCMQ!aA zN%yZ!ahmY`5u$t$h4$%>w7zBzlC_B!L1-QlJ_bU2dwX(I;eVIUx(SQ zBc>*(BE84-vyhsmVW9qqkMYHT6RZb;lHZK{2uN{e{n?ZxnLr zk6h5h;r@>2{@e?agBFpSg&2;QoOMPya|J7{=Yl_s?APxu6^* zvhmm->BoaT$~OOJF8DJb$o%1W|17r3r2C(_kUc;XJiB@PpT+v3(*EckFf0>jpPt7j z|5+?}@#Fu@^`9^PhtmJg?s)45g4+qIGLLCP{$Y|jZ+I3WzQ4cUE>r2B!ZYFr46UF> zVk|$?3o5vI_C@6%Mv`T3oke5YDtd4IVLo^`V7;3N%;1qMQjrtKQ@`Z#|HHkpjevK< zi>wxora5=o7j>@U(owOX@>7t1YeeTE{s;&1B$)K%@Y&zxR-g(X3)CV^%=Zf)J=P0M z>Sg`v2ztNQg8OsYZOk2+#@O3n(%ZKxk3{AdFnnPJJL^ZLu{GB7BAWgaRAq1|#!r9( zXc%yFIWmnV%D|Z=ZoYA}uA4xEJee;oIWmpBSLvk$2#i78@^F@a8##8xL(>Y>xfkcp R9s~c$NGeLC-Ff)p{{zlJ;^+VX diff --git a/format/diagrams/layout-list-of-struct.png b/format/diagrams/layout-list-of-struct.png deleted file mode 100644 index fb6f2a27e07a766729d12ea33454db011ce6ae00..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 54122 zcmeGEWmJ{h_XZ3Df(j_zCDPs9N_RKX-Q9wKba#g|(kZ2Mce4p;Bo$B)Bz)FBN6-Iw zp7DHrzq~)kaJaqqU2Cqn)|~UYuDQdN6eJ%b5g@_9z&w_g5>tVJfuDqdfjf8z2afm{ z35kM#U|m!spTks*6YYclAUaBEyTHKUP(uI0N~=(ufD`7d)U;f+fmff z%E`pR#6l*7L`q63;B0Enry?ft>vHfvK{88MS4Tc(W)BY!CJ%Nd2WJarR$g9SW)?PP zHa13Z2BV9Yy{nNYqrD6H-#7VvA2Bl*6K5+&S1Sj5Qs{k+j2+xu1R{*SZ06zu?k~hC@Yk7t9Q(iL{C&QX zrGu*ju!6IdiL|||nKQW9)d>1_LTtaT{{KGXzxPsfwlV{w{_ASiUswP8*st>in4tsz zmm&VP@?S@RnT3!9nE#nFA*7luwpADy5g2K)XKJ3XN7-(#h}7rbt%*2#cxW>;^LGW+ zH}gl1Z04*Y;28KiAs=m$7&>`)JSJFq@l|l->f^P>TQl$0atrU`+&dHJ!}+{pWNHOC(qX z3)PyX(%(1ueJ>iG_YeMi`#f5BtStF1>w@1W{eE`d`ZtpQdxyWj4Jbxaa^ylNkCyr4 z1;LkrGd}-41fTI-qzi&o!A1Aq0fCqP@1g&Hcxhhv5Y7vH7DEx2ezWrbntL)EKF%k4 zzt8Ap>Du6uD1Hcfnr<`=6Emwd((W;Tj@q`2=;@^G{Fua1Ek3rlm}x zSS1c0m9URlowyY_UzTaH>b6ic z{~Vc7Z4j^ZeZ0$rAUCPNN$0&)p|mQZ&N)p3V$8Ld-&flz4E#QRY59}`?rgSf_ioku zu<~kL-)kpX`1YIXIqb>thvTMY@HS~kar)wBXzc7h5e4zhf{3(qlVqp1Tk4L(LP*%Zabj# zfk2o$ZrG@V7W7=O*&0!p_mG*}I~PudsB=f%xaZVViCrh30Smlf5=MFOn4n=sf&Y|p zEc=B?G-nLo0S$)>)0*h-Al8~Hf{K=9k^7P>N&?U5^_t)HF&LcS&zrBPz{a18<^$h9 zdHHSX4Oi{%n?|ZE$8PYD_8uo=oXZh|yXUMVtUl4}z9(G|z(u!8$Y5^1?`Ap(E!)<^ zZU>%bG9T=AQUn!eYI=zStogSW7kV!>7C&=l^2caeD!*zQxpblz`B;O!i?M z_l2vn>E=Ke>VnUg_XG5(3NL()8ykj}r8zBdGOYd{r^q`@OSe>=90B{59UK4nGY`F=^R(D|&Y35C|km-esYOnB;FJ_lYOpY))9=T!RgVYqJ` zMR6(H<0J3vuD;K4@Zk2&0(q>!*+%FSy9*AdeYhnQ3XgBTz5IFm9imfA`72x5(}$~M;)z$3dWdIN+lSNqGQo2z zn)`F}R?J^h7K%!tAz$In$kWb5h!L9uW)OY#4a?8F?;WWZkyHb&Ez9<8P7&O=6}t3{ zCJH4FZCiFy9uPTp=JN~4WgC@qL-oe;V{c^jr0 zHtl$cs>E=#%|#wfWCr`my1X((oI_!Zb%3ZnHOnkTo2S?RsV+UzrRJ|3D!`s%ayk%G z@RcAYANx4Z7Fi_2^OaOmY=x#(sYG+D9@hmvIU;ja3xGu&BLrD2K2T 
zd2PoRBKmqS8ik6R?;CvFk)(+M9_$c^NJYb_lo2#dBH(#a5GxdsiO+nrPvEmE@O88& zp(LC-he!ABY=n6LHJV6o>QZ(w_Wk#d{76LJdv7Y7hH(_Eh>{ig_9;+$&{KO6aP`op zBVTn+mYe7c5a>G(QF8PlsYr~hJGAao^(*3{6=>(64fdL08gn$LOy)JZl+D^ z6rTFR9l~*jf@$lIVfOYxYnIn;x^m&-bJ~j$<_<$T-cMOB-P;&lQXl<)-ijq;MiAzu zWQIynGYDk5FIgd4tGJg6=+6){D{1AY(2IyOunW+$zqEHI8nm*w+DWyTTB0e4NSV!nu5I`=CE01xX&P9G+5aD*fBO##JbYPT#fuA z$J?ZJAM`a8GCC)5EmS#2g3sG(q^YGlCFE~mK7DTcLNx|wtB<`wI$X|yL)MMtL;She zRdb&{>#(6L^d{o0uMpY(i&H=TC$QtUfywyt{=5E%)m{gB1eFnlXt&v$97T>Pv7#XA zW>?dY?kJA$S`^cn1A5|qrevW3MXFIv;w;5a(|^U@24dI-dGg%#IVoiI1B@S(OP>Nb zBg9`5vi&@b^-t1}bJF%==|<;Xp(;Sg$oCCJpy?sM1i?{fNNrG*9W}4aMz1CRrP%Y~ zQK}DM>f^L5OKkm;Icn{mGfIv?IxK549X{~tz8e-|m7262vi$+p|2S8s{1%Dy5Z|u& zqa4f7$!dm1WAaJc`uH4|abB;$(HM#%C&Ye7Wu0K63)yw+;)FTOqcXJUyyN(oIb@58 zs`j&CGaqX9=LxgUUtbHXA4@dt*m`frOBQ#5pB^uu@kNf0A4l5bwHNsQqF4{-;krB8Rdx!JJo52Er@mrhlL><~ee zya^TWF!M*69BqFkr5Ywy;>TCoP0;sBw9m6TNvpOG{Y%MZ;|B9HCud^L{S&YYM#9!S zVp|r6>t23mQ`JJnTUR`lFbKDQ5_m=xcr51`Ej0p(Q6V^+;yJ8UHTNzXn9u}vOBgp@ zD@a;9$5UOP`c*T%eq#3vDnxeD@N$>jA)BiiJY@+b-d_(wjlo%bPaY+%b>RuO zs~xv+5Gc{AbpjY}Nj^hXw-7XL8{x#y2za+{7CEqct={ z71?+*af;g(10h!E>{PWzLt;hiLeUMyEXCM+Mr0C>XYY;*{F{iUdr{utts#AWV#7# zu4~=1(@PQ@F8#T!NBu#}82Kr1RzqkL9K1bnu_FQ=5(_i|dj!nQN;#LWjEV=NLwgz* z^fT#baE?-nEL?a8>9UjyB8yUV==fp!W8JtpC}M4**fqIHx*he$b~J^z9Skf4^ z2ig;Y6YryYdLo(=ZcpLM>U`@{+Hx+00|AhuWYTkSrLJF&!yP_%o54v*_38l8K5dWY zj_gf1PoL~-Q|fB{>;W^nv(XIt`HfZaD?Wy-6|NzSu$pZv{&QqXhGQ6gf zYWOuDC75A9bqTZj{O`GOoza@2nLR+O{_4fmcb3TLI80k2R+5HWcU723kq4t_)h_nM zHlWa1TQs?R#Hd2K&oB*C>0 zc6Dep9H%`kJ|$yk@iyX%Qtp~QlzpYd6a91T=bZR?5k80kF#J>yOx0S)79JWIt^i5yH+oz1`4mj3?Siy!dt(}9%r7F>D~KPOl->rO zaXr50ypnsO-&v642{0>R>a?n!LqUDo~0Qt)s? z*>paGd=oSiyK!d7h72+v`^0vJQfT!Y-QD%29gRzkp0pn=s_@x)54j)HV!Dl0#0Mtl zyqR%f=|4~_?}7i&+@9G<^J2a+4Hq_yOrS`1x$Ilg%Ep+es@ucyW=FKn!)8~O?HW=X zR0$yv!2)EqMpIyExt}&{edv}Z+{chrsqa&1rRhp>j9Me)9ux!%ivYEbJjgS$Rj1+|AMX{Xfm$A}6Jw+C+ir_;F#^S1PZM$M6q}UB<E@ycsDbvVRk2T_FfruO_Ch_6tAqbijp4T}VM#=N? 
zyr7EaBS|LGc;dxJs9RZhFeWw2UCq&5C%VbdPc%!#hGD~4zq=8efF~>(nNo?J&R|{X z+%!!aMIu(YEeb1F%_Cc%`Y1lSxp&R?>tkkS63_L3hq!8G!Fqn7+=^Ac$)fJJ;gKIV zR?0aN_=6!hMj}x~N`nj%;)#Y!D(75-mKIIPZCGlsbyx%#SF-nFL`@Z2tOiQRN zs)`+jSW!=QU^^_0NI|(}x+ak0nXul=6R~AiuRc^+PxI=OI#l`8J&3=u;;Zjl6|N~pypfn*xnDIqGu327 zAZD#5zY|gcs>{yE2lR=QGPPGVrR}tmua(kD*|khg7M^dROS&nGF`L+RsSvtt@*J|`m6nVozE0$2XYz3U^Qcg@3K}dCbagrnu8!$3)h0)#L=PHB zmZ@8m2PQ<-tz^c8o6Aw)k(1JBh)G!+gpbbo>ly$fe92h`&=>LfEqGc@EX)mX;I``& zc$zlKK6(Q8yZ=!}wDL;{xlZ|)7~=q@4k<<9TUU+*8g&Ar!|kt(Va#v}^~n;!nNGHf z$y8}FIlgB@4B{vYFc&f#*!M@!rXk~0-c0@DgX#s6&}7MZAre=M9&c1$oY7+&QW}-Yue4fn3D#&>(J5@aj93@rGR!gK7m~9O zFmeIKeA7XVJ(kF%M1E_vEuZeHOJlzRKZC|V`*gxcUGx?S{*}>Nd8dRlp{9&Fb?b@V zdGy1>#eurk^+Ne7w}%R9`1)Rr&4L3`bgKfyF~Iv=y4hZQqnq^Ph^^GaH)5os(zrT{ zxu+;QI|%!L2nKCs-04bzu)&>Y+!*=fF8J8xYLdZkU(A^-gk{Q}WeQW9Lrnaf z{X~PndR3U1V-dn1@s4ZpcJc*$QxV^N6&!^Uw=c;Oi+oo+_Xhlg8Ti(V^dSM1sLPkn z6wI1)0g@$x;uRgZPiS!J`nEniC#JUkYh(Mbgs1JHns3+2i+f1O*zfBVtPX~A)iqym z>M7Rd0^o-;I0`41!<9iEogwtoxk_Vtg-g+DEfReLW?f>vnDhpM0PGR&s~B@H6Pyl;rQFI02{-!n*+I0~aY9?}S?P2tRz zL5zB>6?8@9zGy^SFB5p}o+!`N1xY~gtMC`CFot{nu^u1sy;_;pIxUXy7RFF^mQ5lv z$D63;B!-DytZ==6x=^=?Qhd*mn40+9n7YRr!=~zVUr;&+%i(9KmM`iwG5gh1_Rq54 zs+G^3CC)sk4Wx@^WW*n=N;C9dP|CcyG`XtKwYxxxN}fz-{+si#VC=s>wi4l+7EURvf^Pi}isLcXk4#NlA=Y1SEePyfjYX4hEf<55C*J;g}x7b<0k z-Cq+8?g>8>yHB_5w-jUz5UMD+8`s`cf=H%2^3uD`23r{{!g>+e>jf|dH3=^RraUhv6ZV(GoPWNN`-)ub@QQIpO@wVPZFLC`Ua#U*7 zhkj>RbZlbtc;X{K?S&rkJS?0xR(JjgK*}k}C)v_QH1w-333!zZ@=>&@pB0tXnrLim z(H`$!c#fjf^?L+Z}r8qI)19)cMSm zU>Ax_ZUjHCQq76&v5t5yDsC~ML157&Cxbb$n-_!EvcxQ4GNVGzz<61YeoetWs8x!iYIv*l*_JWXDpQ`| zY9{yY^%UwdF1gXp@@q_mG##cn<#R-ZcBTUPUJ_XvZM=~EJlCwoQF_F2!>|E{$7F0% zmDFjlU_Q zW@1f|zz57+F1x{@56F<#!Wk0mQYsQVtLTVr%?_{p_;3C2m>k&Jbz$|CaUKveT#~$& zzzyLn>@2lj3=-4H&M2H}D>&w-LaeS2I1Y@jxk#SZV#LZ@uE*X6=u*{;D)oTclxw`<5`)`?$Rks_M^d~My2*5 zu20iW#YDE5`>W5aY-k@y8!R1l2HXR#>}WCV9LkMqX8BliTJM}^JA;g%M<)|AOUtP~ zK}a*3+E!n*u&hX|$f+1E8DlYZvEBlJ_>nP-+?2_3f$ZENb)iQYnAoxi)K^xl?vPf~ zd~?-~c&Hs){<7mYl>tRbx2_?&F_8i034o1_XKWKy9fVer@{}7W2_V0%;8|aQd9ZXu zWHRSbv`m|s`!aHtwEqZ3PWErrz*3yZ4eIiGudd}|>d)B%)$H`6;f1kny#TOR~O<;uJxM_K&ggcuOYwiW)c#4Wd5T@;Fi0K|>bPx|VZK z7+7ES(z_1u-&OC&U70&bIv zTPuNz$noHE`I5KP5#3tvaVLK336z3Gp1|8A&k*NCl+4{H+^%)&;h7I!$9ud1n%= zV|_O)?)hQrN9kB>re9=`#gwUBe>#6Vqb!qmdI;zE!By9`7QueB==hxi%GUP9Iz6i3 zXRpt3D>kVTAxk{{65j8s{sER~#13hBBHJH_EyWIn;jj`Ki_z8X+uMQRBqNR66W3+S z=B*+c+76_Rx@Xs}*|u#t-u=VlV@L?+pXX^^#JLtLe~@dk>b}J?!_@-&Ej`#wC~;PNOa1ELi3X`{rYOj%p>{5e*86 zT~%*C@>Sb+TfM{BQ-QNxk~THxa*|*E>1y@eo$4P+$6cz(+E4jQhiv$VIM17PnobRW zqGph;qw%^m&B@GpAGYg+*P6tO)IUTk!xoNgxF=uVTKnSf*bd6=NpzqPl6=j_r`U^Z z%qT%iwxZ{EwWDr)>oKc^oc*QptM$`f6md9tauLJh;zqlk-|sDzhY8d|*{{@_Xr7Tm zh7;aDhM)!Ss6gJ!4X1peP0!9=n3iRJ3FvWF{>|?fNxXat|2z`P6Xun{N2x5UwR183 zGmgJ7lsric78$$Pc3TvkKQ&GwnP}jPOg+mI+W%Zpj!47PmeG7G{io{45)&Aft7TF_ z^*=Y##8h&0=XAPG{6|&w*O(@$z3c|rZ3=gL2%;aT2( z&M*JRz(n9)$OEm@T<`IJL|R|~N{&rK!R{IVnDFmMv}OR!O?mio?th-k^)zqYHWB%l z<2SMTix6_<1T&_I`A+yh&$WYUtf+98tbQ-;-+Zh!FhmuVz5jonyYw17SDvY1Ch0$h zm;;7*A-{6>AAMK@V!#JGwh5-p-PXgZ>rwWJD4-WW6~1vybz7Q^85jJ%QV_!z!32o# zcA!OQD0?||scG=i?fa^S<`=WhSFq$UJjUm2Imv%Ktu-6G#JcRsyspFhvtfFuuAu9f ztEFWth99Mfdx#?O2=K-&RYE^+T+GFG0ri~9KNr>0_PNz2nsdJNhkWc@4qcO=?CB7=)TYC|0&y}3W!Vw{C?hCZpEs~bI!@o zS2h5s3rZHYI1vHi1eTWNn_wVEA{?X?PP<}{S=0eOf?Z$aOt(e3L_7gg^ z>s7j50(vON|L#gr;I#L_6JM59*BQpzUJMX~x>aNuIYFfZq!-b~fc?87av6037q@?# zmL-~I@CQ`(EtKvh-~~dK@J7P@BLkfe>w(CZ*Faw($A8@LC2j2Pr3oVHeA-Sn^A5H%l?P+2IPntYMJDwS)15>d%Fb0IWTfzD!P2- zfXr~sa_Xmj&r6kN7)YJ#%cki!$~sWE&9!VRDyH$;*s7AYI1LUlqZ~UF493yQ%UMD2 zeAef43$+4JW2j?6r8FZc6p4;)Q%g|x$xKTv-1TUtM9~Vn{|XcVW)Hps3XC4x2hfmp 
zC2tEMft1!SFCZ3wv-tl~IazWAc(OUv;4)@O<8X4>_v5hoCs15K5f^#x zRjdcIA71|aLIzbPS#|(nOq{y}ffWUTp65m|Yn(AbUqTC*6gxnCK$xgKwxB zC-FIgs+hpCdilBK)1DI;YwW{TrlX3?O6-46$>d{r0;~$wO%VCz=SFmRpi&W-U^*aE zS(x_qzdq(Rode>EVeO-=%f@0=qYWaX*FTx+R)NcFucRr|vhD^VGvPhXXH*pmhssvY z&`T8s&($tw!48l#7pJN3Wp=?ZoEQ<-YHMDY7@P*iUG|0R?8y{_@$HN_&^N-aT>#K!vn3hx)!6+&OgcOs#wQ5FJ9hhAAL}P#gzq3cRmDp(>$m4eVC9 zV%Ul6TN$zov95#2V^^S|T6XMttUZDhz+wx6XDa_uO%Et=UlbuI{uz`iMZp&R&4$(U z#PH_%{_d-LD4kK-OKbLR$01&EzP=i=6C{3@8%60y1Q<57P*IobMTSjP7ki4RD?vb@ zik}Qr$W}BJJ`F*^1S`&7p;sdLYB!S^1JV^2BoeI{$|woDNw{GyM_{oIutuA`$v3Hb z&O-)2e`rS4dZHAJ7A%4yzyK^3g5`#^KR2~#u&K$1J~_nLHwVs3yNv(VcE2cf7i(AM z=rx3mC%(>}JrmRFQBEn_H2i|Jg_C^^s#-ljG|1lk@}BpXZ1m} zx^>=E#fC@ey_95p%J#f8DE*ftnA)j0S_`c}$ffQuT>nQn5}`-C?C@l-WXaIyz2EdB z%xi-{hpU&~LOTWD>>&cy{5Vfkm{9!ZlP_Sy!Z#bv;N};YGrG5f;(8P8YD)g+%RdFS3pT8n ze^?v1$?+!0(YjLOoBy4~+p^-f`gy~U?IoF;H+6)5D&K+YS5H4b%>B{x38V0^J~DQZrc0h7 zICpFnc)vC(@xAU_+op7Q7fSKf<9bJ!KVOi{a5C?8zBVGf1;;JO%hV5o#(2Nn-F^l3 z(x96$5(;D=*7I0(GefTbaSHYW%@A|XS**6-|j2A_B2u2zU2j4 zP`y-KJ;yFpML}_@<#aV9lgx6J3pD>##-_zURiMp#x}3WmRm8pOib?2~QYmHraMhvX zwn-&me3!jI_~#c8X+M?~d4oNtU=fHO)8I&pB5X&7$QF47&IZX zn|zL32_kzWV-wzT5y$}qHT9hO0H*%B1w?lTM!P2l8WFYKn6;7=T1TNg&2`siRzn0X zCr`|A6KUGJ?i0>pJhB2RIFsu!z}sDeIs3d&y#~!HHY4bvJ02L^1+m9U*X)OzGg*Qa zAh%28sB~h%`7^_94F)+5eGN35NaF^Y$d@UkWC&N=-r`FelEV^7EjQyN`G+8GuyWn_ z{_q!u;6U|+fJ767x8PYTt`8GTVhM%Dd($ewy+l78pLH(5fQIT3ZN!(~z;b$Wa!svC zta}RLZ|2CiKO7pvQXZ~#)_uyY%Qg&a1;|nnBNC4jqEi~1;FQK0_Htt4FW8l)^F z067L5y6tOcvuhc-Y#Qly>lvY3UZ=J%Kwk<*A7uwtVs+bLKfUq!`>*5tAOM_co2OGQ z6BHN+Q$Nqvj$-Y{gQ)%^M*J(Z2c%F=mbgkAmGl$Xpb**?stR5Nb~lF@zv>MO?Vus3 z?iu4f%wbyyrs3x~EG~NEf4#hhB6Mj;U|Oy4+H#fA<5?ew_i)59K8%}~O5{=NBk@dm z_PGCE370?Nk*2(n#y`3UqpyKf&%jK`^F(85*cs$iz99D2`jUO}o*2yz|6|Wp;s|&S zU!qZv&n)}BF;ozIWtQKn$+O1{I`Ay0Gka$_t9>#1fd;A;3QiZ z;K5=Yn}H!@-9>u4UpTx!A2fP^U%3+0-0Kv=C{pA=6;oBQ50U&_0E7*E;0U;KN5CtxPa$+*4SBs8!8^=QKBJ_ zAVA9u(Jtgjm;qv5*(6OhON*uYe`L#l8EG;Gyz;TMqz|B3Gjtg^Mm zkK6ja{K(c4sc0C+|0`-KNCUX2LH+ZUuyg+xF!VcVw#0#2?O9NUo0six5$CUAie!S? zuIcLW`nP}QIG_&Vhy=c`ZEE+w^Z#cbs2}~$AO7m?`rmr_Rr>wUAO7bL|1K8(zx`ov zdH1Ww4-88)xblj+VGyWwy6q5{acS+ol70189JSApW zgUAKhs^f!}kose$7R*1xGQ}h(h_NqvAo{<<4F5?5Y(Q?^7E=HI zca!HYw9t|tpoP2oXCUL*Tz2TNvwza`Ux=0xT1Po+>-(PeHO}kU09_X`D3ALAj0E7P z6np#k=kvlhcP&SCBLw^BKrJ8Re*ZDSY?*si^JYY;Cy;)pR$g`^Gec1ikh*-K%CvO` z4GJ$nSq73kdq{%$?=k-dFyme==evfL&LE&!MKZP0G>xf!J*@7a8e0SSLa^9{8l9>S zT%uJ2G^bh6k4!M|yE0RYIyF7?1DP1Kmlz2;SoQl09ao+yG0ZWJ899K&jDlld z0dac-kVt4-5>WXrtM}AC`Dc2egUCw-brO#M(|Bu+Ve@8a6#@Eztdb{8$zWaYZq2#Y8 z6aiqvgrM(jx_ahzjo%pn7-&9p^y~N@R=xv(nc5N_=&1YL@zamPq9GLZ7~r`47mFs< zAfMj`4JSgtZ&~SDYd==}nKm|{!v@{wJT(4}W#Lr=5J^k`U{=(Rx6WzkG6J}*h_ZaU#y82G;Z70Y!0iX}`2Ng*8>;u&5{l$_^ z2j~xkxTEC`Jn6vaFr)-Dc?V`@tH=!iJiDS{;i;YE}7Iui;e-3fx z0j*#KysCa&=!s;Zb2$*-ncwsCYyH0knX|^Hneckao4Yq~QaFc2+9? 
z?W3|JaUqBPfdmI<>2y`BAFi)>G!~T7$lEv z2S#++dir4s)N0SEeBNy<6!m~D1NT&}7FFny)GXs)@GJD=K6 zsK(Zzm1vo~GiYt$OOcl#%>~vXwlPh9lLd-MBXY`yBY;V!&QhscVlw)I?Ic;_C+Jrs zPef}hvLFvV08LJORvdOkY`g!dCTJ-0))~yEi+Hs-_felL0nF`PgNZxk6|{v=Xv5#W zb-#ETj`kD47o)y`2E{=0np70a^G5!sg|;pcHwuT9YY2BglUO?22CZ)hGbv zLxKw^cGFB^z^LBY)&d^5oCC!!p?E_}w}~v>si~EwQf=WTh4SR@GT z62%2b4iZ+K2fv6lO9Z&8jid4M695Da6f6U*M0#)OKs0<$$ZTX%VO!I|6u=(i>QBs) zl*vkdV47xCVyV+C6o(21!zjl?f~X}%7{4ua+_^>27VpspeW`QW)DyX3w9PD&gi|VU zE$RjMg&*Jrol@Qj9610j80xSE?InG6JI1}3Y#$qZf%_Y!=g^&8<%3H4{Kn9*ul|WV zz$}tOtpA~5ctMOJienP=1z>A|Fo{LIVuRU(XN8cuE7Ri3+BpPKh6iI@7?JEl;g+Hh zRoZBL996u~GDkkMd`V26>!W&n;Sif|_cX5n4o}^z77OHi_x%B`FyN6$Iw2N7)sEw{ z?7lb!wjT8H6CkwH{DPK6tk6=1unqY?#?q1wL=4qa_emaE?t)(5uVe2<*+$fu1wjj; z8eN?6s2?D{l857d0x)1_T!Ezvz95>5z?!|y&7w+*=zGpQNJU7 zDHPbjkr|^)lzYO{M&dHc>gJIsV3o+ju&h?`bu>Ez;D(D2zk#MYv4EuHB(Q_%sl73+ zstOfMg*l@oB7e7NjBLB`^KwAgbKh}**rSHhb|PDiU1EIx8AqhLYNPgpkND;$;pQ4auC(DndscfvRUb|)bET+ zUjl>=Zi7j64ecEe2fWXQs2$pln|4xkt6Vv3`vA*ilc30YGb4j6HvZ)2r6dLX6a$nS zs-r)JV(FnI-lYvcz|QUm@cTr?(KKir!Hu{8JG@coM5*}D^ug{oF*p#suK}_=G{_Ax z#k?CA{&};a(>~Pgc%$Mb?R^S!>)tY$%Ftr3yQWnP;fitOTTOVC_13iV) zjZBgN_r=YJB_1K(LzJm;6G$o{(2&9AcOTlY2?62QaZX*cvM!m)4H3-^bZugiM|kLZ zo^9zq)7xaKF`rol-I+ZwGS%c2;~)omK;rpIH(q@c8Zt<5KyD^Psp?yIAkboA*`k{J zR5ZkJBG)uQ+PM)hl3soREjyQJq5H`s!T89)^xc9`^kwH50Lf<@Teh`H<{9vWpLF}%AHqpuf3VS;aRY2oFt*0v-9_C4^LYTLova<~O3v!-ymI7TOP=C%1rTVO{ra6m6G z#xcdQPJl>~3WH;5V~Q*J(;LQ|X-_QBBZuL2DHFRtV=BfAMMiiWyvk;5W4Z zn*WpdcZ_h+9$p4}^qRbExLLBP7T-eO2$A!(8FByqN%$x4TSI4mfCAPBxGZu~AlM7; z7L`*QH;WL4cav^w^C`N0pud7Z zK13ClZN)@<#;|M!k;~8l>nlk3VnHUBjR)BnS&#gH&!I*MApN}k;y)1c`o$?oeY-!z<5-Y@j()3*ApZ0)L(N1r>-(M7zDLb<=P7%h9IRGcN zT6*@e&{w~jw37wfIhN7|^Si%y^FTM|6%DNY7BD7XiiipM<`uu0hO{XsMG8I2_az(V zN0}=ZtSlew+cUy(E;TkCMyWq;onT+y?sfRXXqBZx>CK2DPeaS7!Hq+?${PP^;Tj4Y_X8gV#1w-n`41!eb1+o=X>`*3i&WyE&BDcm$-ai`o9Im6N^{@2b6+hte=|SD+qf*_6 zSk9o8uFs%t97#dn_G^1G3IArR`OmX6{%a*IX~#3Q*DuQw5W*j{eX9W?L{h1gQQMCA z-8X2cRhz63nBp}?67|jP=H*2^`OW$kU&*uKEwm)3vIg(lCN%e{kP1Gj z2BoMG!*leF{%{0Nl6fy9^z5_a&UBV#p{_*m3l|$aBR(mvNW>WFu1RdFWLFiRL8=8L z>YISN44-$T>9pX~#pqZAzx&Xq-79*5wJ&4vNw0S0DFG<~XlJIcFPT){nCj%us}QG8 zHiA(TTf^YyVM|=6$aV0*$aliI1Hsi7bY|91hyCizRKPo}>v)=m1{!)#OU1Duvp&gU z0fFq6?RcCEV#dutv3Q4!h}4^+c}=$I+`evcQwmwVpirPru-i3umHV7^=ZcM|yk z8oQ=+x7$H)RVXvBT{O_w_nx;umY0t|hjU(w{0u7ZSaJKbz@y+)tCEb;p`_J)rL7Q9 z;n@|?)iGV3K3nk@nbn68mRoJS+|Z~&t~98w0cufgdcij-Ze{YTUz>yaXthZ;4~*@* z-?3a5N?pkWvS%0vpncXqRU(W&DGVkK3YMm&kSkk{GOrCVJ*QKA&k-E|^zpP9E+);k zY!0D|ryJRGt*UfE?8B$~@$XO}H&WDH(y+c)TCM4fi8-;nBX&LAeMj2U(mTeU6WN-!|f+?_*VR+OEqUZP^4cJeI)O+hmM=SRdUFqm~T&&rL4dsxHq%*a%LrPA++}?P2xmz<*?$PI~ zY~AE0ij>K5DT9_atUj|osWdy`X`s`+bAEx;1qhqcaBdt z%$|T2Eel?`wTHId`?eTaMPI0Bf;#M32FeB0VP8~4?q(~1p770GQzxt8qZ4Fo$`(fk zF7x-IuEP^|DEvoCtsLZ>HqSx&B2o*#IZ(35Wmh@(_adlt(`Hp#&kIAvLuDH3RUSa| zO4)ptCBkeQJY9D1x{H(!ahG&{|BW)g`a*Z`U#2Bi*TD*)Wwo*gs2c$-na`35j%0sV-*4w_rXi_htSS-_m@USaSzxl=0* zrV1dlG8Qt}?1|cl$Kv(WHv-Q~pEor}HqDJ?RWVa^OY7(5>RTQ~9H&5Z=4uUlGNNST zdQl8X(=6NPe2l7&SjE*1NgI99LcA;Js*9c$7u08H1dN$D0qs z5}${faz7)9S#foE^TAUyH77^(Z7R;+*QARZv$$gYIPQdfsjJ~Qe;1s;?Y?Z6I!|Gu z)K|dYn0~wfZFmlN{;+pK^Qk;UY6Jre`sYI4h9vmGkq2;|*ddveirb5U37Iw-K2lgH z=vkK>#W%KOeFj73#T*wL`Dukc)9@W7i;YTAcwv{MTrAzGIFUpY@Q?jKk&_mpL{+Ax z#ei?O^o%2$E>p^9Q3Ib)<#9Q!)?ytTzj|FlvRzG=oLUzYxsw@a$RxYJwE+c?rrD@A z<365Yk=!63_B2013G+e4pDqa{!e{#W)?ofCWDW@fj_#oY*6}u`etS?lIGA zwj85}ieYT_jA;~Ugc_9EzA%@6==yPd{n4ntnCt5|$(gXEuLotGbLjT{ZVpF=;+ZgY zFQoD)9~!beQl)sy9K21-&qlHaNv}DgdR==`ZOh13p4BH#WP+lCW0qVg1hB6H4#%J4 z<`M+xQnN}pOlPh!v6R<#2@~t0j>c`}4I4&PWfY8_m*d&PB+fWp($}6RDcL?vj^tKi 
z=r^_YM(^bUF*DhnC)go=s~pskLW(gc~M&wXV^EL`HgL3+$f5Sd5kC?r_# ze_Gs4@(J4i`o=~e05QM6Ekq|Ipj4twC86yhGK;dWl4YY>paWka$;F@ zWW@piCNe{y4nBny0^ozE{=gu9U}B%uk^V^85f?RtHK~}~6#1$bee8HYfpHYVQ=vRy ze&|joe?3SsOZ7BK2>d?H;wD+tL@8Wal3iu1yBEMGe_`& z(J;zKaD9G-BAvI%9z&q2}YnEY`kY%y$ zPhJtHI26JVo)G!poHlg6Y9WL7+H4xA8}@m8KZ@RsvP*j(d2Yjy#q}P=H-TUi(QFhO z?Oz(+$wvS$Fjs1nM{FR1z0s_#VqwT}dVorzXXVRtjZPC>4^6JUNqHf~XIB2*Ns01m ze37poHimmV87sE9wOy^V5~KAFr(xS;O`@I{dpZgluL+*FBvqtY9n?o@R&-LH7&u&VO&XF!LdUC6^vYMws*R7FH=9?i*> z)5qzh%qd^`k@QmtFz67~aRT%q(rHTXiM^|wJB&tl#s&crGQp25)DY6zliVO0j*z~G zTth426oZH}8&|-<9gUFR0@p}{!7DJk3uHF~-cz0qDzf6kYCoQF=fvetP$O%wz{tH_1f^5 zBJ`9`(A>JnnqB+T|HIx}MpfB$QJ{b{DAFz69nuKW-QC?F0@96y(%m5`QWAnR(w&Nw z64D~l(%k*%8~ToMf87}Oj`96r;PJ4}d3LP5V$Qh+-JkMMU~5sX&s-fNxDuu??ADzZ6yX^|NdlOnC{TiXvkP z2EjKqSZQw}6BRg79@d-o+atiqRogVmp>2H-?;ow90q}GDyE^@4yqfh+g9`X}3Y>J` z=w4kbf74KQzS%nnjuz#}rEF2{=Ob{{cC{~JaW89F`kPUe2;q$2r>$tl#o&?O5$QYa z>9h~fW@?b>GLAFgFEBbj*zd-E><&^0Z2D0fUuw?cpFQE0z%hE02FrnHMkwyARSKBi z4#>$qK((#LfZc=V&Q!h@TviH;il-H_?eHkTNubn1`SwirPxRmJz=mZoj?jho#FZn{ zU_N##Q0OMP?iF2dyqrcgRvn$YvEm*sycm{oS>e1FKjx)YU2!oS3Vz-i!Va;MgS z;*ihnxLq(Bb08!($7J~UMDSS*a&?Fv6Dj(41F^^hcXEQ~p&K_zf+>a_F_ zIeaH&=_p!M=N2d=Oaw0Xx>T2Bc4vfN);;I{l?x84qy|WLxw#XXpk0a%smSGt!6{#w z@9(&4sQEy8+Wp>wy8kDX9VHMhX6RJ9d|Q%{l^E%d>Uh-%-AOIfFVSI@_~+XLZh$%! zk9PSWMLl!9bs^0B@Jek)eXxRN#5Y1r``4%|ap?zNm8g|vyvyGszX<6xC+yNj&Sbo< zn?Tl-t3rN)A-Kbiy2nF-rJD9q2-6;4UK=$Nr{r`fSQa(T1Rc|ZF5Av77)EvCfnaub z{)mCcOf?l<^6G2692|3232oXVCcTVI_yKPa#X6V0zD>)z))-?GP?W4i2`W#E)Hcqn zDHvxU-7fsdPG)bAO1nWleCt~Ja)Z%q{gX{WbUwaPa_fz&W=*(_EzW&_&6gcE1Hybu zXu;bLmI7~0Hmx~${gpe1X6ewFDwn{GibV$?OI}Lkz5K2%q%%d7QYYtUd6wRfa;PB_-25^DXAjwazBgX|9Rlwoc!t z4A8dN%lcGa)7Nd2g-ILk^}?{fD=JR7U2EL#k4z0GXIc;$L_^+!_wEa?W@)@NBN+;F zI5&aMunBJ8Dq6Jzp-^k0InU_o+rk=g^otb;ls<4R8*G_Xs+E>EpupcKu`oRA$G3lc z8@U^4#V#L9wG&>p6OQmvz~mJME8n;Yh#wJZ@HE5U*T+zjrIz7o-QMZoRIA_vQ!^YctjnS}DyiDJ;^fWNx)^vFMBN+9Eka5RW2hcvzwEOlD>12^x=R>HZ}H1;c8gY`h69>oN{H0X?0fvo z>?EETI0b~ePm2)xWrGvfs$^-3^%&kw<0D@fpcx3(6Ub&T;wqsW5y)|Bm+hEpITvqh zV)e_+F);C=G(^5HSHmw+ao-xgQy`Ty%g7KN*lm|hxnC2*n@#x+IT4}8O@{eNFYkT* zwma_ze3>NgNg+~Qvo)?Jyube{ZTA(L68Z~`Le!WQNYj$W3sa)RlTcT28dYiqL^4CziYVZEWpcf&C! 
zZHsXj=%)=zvrk}^d*(2-6N?r8mVBBo(&5P@ym;pYP9}@;%`7Vc;#VHb)`6CkGMzpO z;@qWzNSWe&2jj-#z3r$#DbKDI)=a@n!9s>%GiJr%O1f1}%y2~2L9y;h8wGAb=OxiB zjaf$;La_qf1vV5qapsjc*2cCB?}!s|+b=tMO||qq9YCY+mSS4|POpsJs;>LYQLrMx z>@zcI(e)&*GNWo&r`PWgmqxUZI5&|SHTsm&X5D4DjN2qUUxioPA9bk)*SevTgS$Hxcv8@Hnbh0Rr zX&7oV7W#_KgB{7#P5|n197 zwA7Uu+s2!%P|G5n(OUeNvXD$}M*R6l;qa&DOHU1G+B9Zo)1jhFo-!IKD@=NTap!5PxzzB=AX8NoZef}@lVmXiZdR++xst@s z`SIq4%~kV0S}S|J7iwT>XlGyO$N@&4M}AF(Y?ru8HRI3_U9cH5E1E#dQEa-*+~hd_ zFMO2cApj&}rM`OziiP}eo+1Yn*s!zF~wsDaoO$PdCR(u}<7 znOsX{;nWL?IvID5>doq@jq#>lKguB>hTre(z5smk>tS7rNUgu?masz7_0}Gd!vrv2sT@N zRmvX_cTFl_3nBFb!{4|Dm1ltuLm1;ZjNv{h=f#E2eG-XolG+1T0U)g3nDC-$=z_`{ zVAyXpW!N^#0XhdUtS!p;U>Lnbwx#~Wwz!2gk=dkL*lBCgMTADxKP2o!g-~TWEKy)!j(Cz0V0-gT=H{?fSl z=fng}Q-EpwY7n^|T^LtZ9$M>#-=6}3JXcZNMne24ZiH9Ejh1KH7z4K?+&^;dbT2Xg zr8RC76==^#{KeYwKa-C2QdIPvE^O57e+uE$;{>3vXMS1E62wCV2@(5(q^l*J;hM(m z!S6_dBX1B?7zI4b&VxrK-vg-GN|1pBw6$IEoV*}-0)>m}(g{dhcjW#@g%XH`<~I13 zvK(|G0QQdEMkR9*M#&hC6x$Pg2)?dGJS3j`9P4Bq)KFrM0Fw^F#-<2=c-(wD)$?+{ zuL=M_GGrb(rZ%>#a}u*^N?d4!f8!T0Bc>>T2 zM2Z9Q&oh9j)*Yvl)%X`2r4(ia`V*3{DmD}nZ-8iihy_VbgJMIf+%>45gPLI%O{&2G zC<(RRTwgq$v_uEfwFGKgT%hz>Z1y`?CPN}mnSg%VNtl`VHjkRecdaDR@3bnXKfVHV z(E>=-K{XyVCz)oz8*}c#)mA8}|Lpx0kO_E6r_)XWWAGWHgy{+jq^~T~Hu=1N6ch)1 z0D564yYnHwxK=E@ld;4P**;(DQjFB9%gBK#^1R;pG2Xpj5&1XP56}mg4c(TRNA?TedLZ-za zT~NEzD#X^B^_^VpE(P}GE5I5J=d^+vX6Fn14;dqm+y#24c8v1u(TS~X(7?C* zPCK65p4ggc!8BGGSLQID*Rg4ga^wOBWjP{w(y4oULHI-Z#VCWF61!|2z&Ku}Tj03nYKz~|LQo9OnO67qv1W-agp@WDO<@&*ZK0tCjn zW%QOL$NzlG($B5H;^a}C)F@!1rs%UhhlhA~{GnIa4f75xzXOhXumA>YXlj~zJRe?1 zm5@ph(DAy`(up?#qmr`=oh$AcNzWONN4OBOErWv<4{0+O1wtU}nxdn?vMM|<4I(m60BuAzMJk_ZXCV+s& z^)@T;+L>P~u(lDf7<(=w%vtN&Vm)u%O`zJfOf^gaa*}cP^W&8x=4euM;4tPPrMcv8 zqd}O{C@@hBQ@`8)Wq5QV4L&vEB+(&E$i#GE<9;TDYnByLG^e9t`k*%B ztm%VHYltBUpbyHZ#6wm&5?@XIXelCII~oBrOldq&Jm;U>*^vF1{6LkqbF#Qe6U%cM zYdIxQd|6C^+i+zi2iz$@s*9+t89+7kci8m)WI}R7kiZa_3f7=|q9)*0i&gc%KHJa? 
zyO_1k?U@xLQJiggnw(xvYAH!C7yuA>tw3@0uBzQuu;^QfU4D?Yy#ZAq*NDqV{|zAM zzFayOO*PShV6qBg>#UER0aqtt4?Io4U0qE6gQ}tOb0mH0Ids^OsSvAmu3Fe z5@VbkT=;)~9e{0$QPuj9tZ1G1$nlFS)#n|!_5d5y@6*77r@WLz7P$jq0zvKVRxRWD z_-4do2yYYZkzsENM73v5e_)*LFn9xS$Ln0w`5-en8fvzEAJ?_t?=sLxf%T1Kd>tuD z9c$1t+6?-rl)I9dKA^fOyLiO$?L34tMZTDKTZ!mff z4@(*uQ}j2m1QhiwxvK&Cr539qfVGDHf?JAJKMcswx|?m_VtttZw{-w82~@yyjvlWC z!83Xg03>Bs11MXA#2$m-Ca6Q718GyA7=gHUIp|tiPANQo|{^cAl6TdzX+f= zp$Tv2K7%5KW!Y<7I-sVzM5$jd%IXSk4+|KEddJ-z;5y*u)b)R#R8)5f{rzcQW&+>V z$qig)%9MelB;YI(a@gZPdb=F9FkUSSVTE_h#I7mANjQN*{5q=JcAQF`>bNtd*N(b%`S59T)0k;?|r!=7@PB37lIjmuAfwiW!lrjTlY08H>0N94O zrt~;sjf)bIqN3})u3j(6zn&kcB+>KzG?SaK@Woj!a?=W&DkrPd3SU9#&Kb-Jwjxs3 zI-puV?}^Q;$+E;Jslw*FL~%u!+wW*Muv;lDidoH z0si@w|GqGd$rzBU3tVSLvj&N-CP3C#8=ldy0!G{u9IW_qP&-x{jDp|y*YO<1kj zn0!#RKL8W80#4kBmDQBP(6el=P_z{siqM&UczFi`z1mPJ#RW3BON-kHo~t#F=$g23 zdlOSwM8CwrTg|(@;tuK-25zO?VP=y0&WG{`G8eB#OHyh%%R>_^fiLR*^9uSl+~~7m z)UP?+Iopaq@tJ1)3H<4R32_2M;Rnvl&>TuNJdkC;*y`b-D6*?<+pqRQ>6Jo{?SF58 z<_2hlKI~EDbA{*>p@?@#B2X>a5&^f5N*}Gq2e?10?`aWqoCZJV>=cS=6k{i;pF!^ud!0KEW*jvvXBI6url}^?HpKk;pbz}BkLo;qq z#e{3mPS#8#7~cXgl2Bs;aDr#LX$eOwHT8P?2jr9-N&p^w8J%l*d`T-E6!sAib%twa zJPA7h7F<24vXV1&QeCk%qj7hK38^qfoNyX;gb|!Reay|l#esUbd9_5ApXJN#3;_ZZ zbm_b}wlilI?{Po)rUSb^M8QM0*KcWD^tvW^I?AGNJ2oO+h@GT<>ha?3i{O>3z$IVCcPNV zU&)kTaVhxf7?k4;NYNi&K#=^+EqKZ~{g1<)ZA2Rj`h+}6y?B2?5br~`2^D=L#CErd z=m>CB?^v&NwTIE!%LdXzl2@g6gvF)1g$JkP~7l?#7-V;SH7G7r*lhdTA(sR z>*-2yDXvTM8)w>_-<{*1ACxBpo|n>gF6Gb1g+-wTuRrbl?Q;D^S;UbA5gxVTe8R7P z{+$sZ4+7JZu9Ef_PVC>8DS>f_EVR}C&*V_Yfmy>w#s2ThD%4=o;<+7vMYMmTF~O+N zWQv(W7=S-M_&G?gGba=~{;3H7CYdsCNr7JBpYQQk@Ff}#jJD)t9m~Jw>(~FtfY(t% z|0F4ciKkBhmR2UC{@>KYUtbAmn{d5g|2Gr!*UOSvoB9^!Ki{4gjRMFlj3G!EPUbL5 zhCjb}h9qFJzyhzuyjHKl5QRJOSHM2$4|>JC{A@xMjo#6Owb=uI>3>`vN&K_%X(LF4WxzvC86X-te+vNSsso8F zP&$%;b=MqNPizAWls*6!ovft$TbL%T3rY)sw=lCP->Er_=KV}xp(dkrq$8uw^Vht~ z!0R2}q)wA)23zIdh5H#M2TinNZC=5Hz^EPpPX~s09g>g%0V25m*myU!*o$^pTJsTj>DPj5mles&rmEo26{ zir9)74GS(qMIhS>pdaIEa*52yK#)jmt_J}DM@1AC`VkZckIqLNd|w}dR@DS#W2vU| zEskQpq4PakKpvQB0@?tDli0FSbzUIl^bI)kvngGG6J`TkH#!a+oo%_inzY-G*)ji=`$v2<9g&LNE>+$i%G^N=2@DNeCLaqM zSOR7!opn^9$n;fDr^WNJMC^C(kInR6*7cA4zUrHMQ>-IBuXs8IKfpX?o%W!i(Qw+x z#}P4NvcC}kQCE4+RDZ4%^dk=i_SPt^#)x{*0B)YPBMz%ofg%D8dVvok89)qwtaAbG zsKCaH-1qD0JRF)kz_C8h{_aaxQR=tsvv^^&+6cT~9IMftw@??F%*ys_?_xMJQ^4wo z4O4_$0c&d$AP=&v;a3v(NVFRYeAr&iNM48iDpGWCEFbNv&= zk`M?a)EJ+9NApw zWx!$PVz;-^G*(46a0*%3>`jg53>Oc_G1PaUUKCF=qE0N<3p;%cO8;Gmr(hwlU#*vR ztfI1eFpVS+!*m0|S9R)j4?oa&%xj1y)JkW);NI~|hpnv_=TOBKe6q;*y5ZwYOG>g{ zh3lgdk&)u6$Lap<@%kd+ryr{Pai67grXr3(y|@acGzj#U>zaTmWJc-;`5I-$OQ%g%f$+Qdw+}c;`wyLF6eKBb=SQ8Ah~L_C=ZF$ zO+`+m3k^~Pw*3^{)V_qkmUre6#w4ilfna|~#ndYC`LhfFT$QN2H05*{Xfsgt{rGZ6 z`oRHaWnIeqCjI+_notM4th3=wg$WkLhGTH~d>HdTsVxy&V`wRTt55Wi2`s-lqe%yU z5#6X6lZs&=d7?)?``h@y!`AP9Y=k|Rbs+sP2T=7HBdJzn5j>C(9c?|$(y&v7#~I)+ zj#MtXO&8kkRnhG_{@%8N!{QPii865JhHU8la zEu(U|_oTQ02Ml5ph(|Cg+PYoQ$>zV07F^_crMx;U@8*4Pr7%&6G6Blk%?lyn`prR^ zi1N*MpVb}evDv4WKyIQWhJ#~1FWC}Dpj9cY#c`aJsqn{SMc_IsDm{BtU?Sq}f{iv* zg)&hE(r%VJt;$>U#8bCDkEbk|hi5$JaP-U~y zUNK%B1A%pV`FdP(exIjArjYaUCw`#_)5`T6uY=1KKtX%_j};i{Uz1c9B(Z}e)Q^?nFf0HDv+9728YhA0)qngNt%m1jSG zla*$3AwxfrXnK<;WxiXQNVc+R`_+PXQ;}qvoKW0+(gV)TB%`(@A+^cC0p>TCK;Z|0 zLD8K80qZJ!aQ0J){aO9IQ-L&x<$4d#a8@@QDY;-XtFm@c2EF5O`0x}S+yO*DhA!l3wXv?=@Po^H1OXrAxX;+brDnKY@(U zOeA{($`e5$pZ5^w(9@5SbHd$pg+XlrGT8z( z#W6L*^)4R`J(9=wwueCOHWhvcYBdQu{=FFjTm;WI^5;tR)jRFFK?VT_JG7~|{V44GcMlE01vetj^U5D!ctCDpR>A%)nP}FARygz%@SP)d(&^u>pDMi0K;rM!|p$ z!A03589}OJwQnh?hNZoJjvtqV?hkQh&6J2(>p@p54$^#!pyr#N zNc=;Ikn7I|w@5O=(G-$;G+1pvf0@Z%zXE=E*qW!`9c9rpMMj=|JyO<+&Vs$yREW6J 
z*aDv>ri@@F3k_Zv36Y(3(>-{%Y*SWJhjWc)ddAEmZXqBHCBWrx#od37Nt`hBHXU9i8WS$Z3WkrYm8Qq3loK83W`3GDe>+upOvx< z)xFXe2QF87a_s{jp?=wDgyBmD2x23iF=hXQ#6r{C?b^*x;ON-b&&X~2R{@WhsnP8$ z$BJvC@m@RATbR>j?>t-&@CW`=nV-AAuR3t1LMNaN{&ZewSee8;jbuR^7NH4+`|4^A z6u`dRUBGc?y7dqZb4X48YO#wUUwzl-?SMJHcUSSd^g4YhT8Y4Tme?EDSr@wVC~G0A z`U5uJm<7B;0aPa&vlU7$7yb14NZLUFk;+YXnTla{{xb9x3E9@=*;zl!93cPYnq}7E zdn^N%8@XdncJuoh+B+KZT3yy+N%z)Fjf(oC37yPFfNwLG9*|-6tk)u+_8gU#w}#Sh z48HJ%F}LDc)^yz?_aH$%)eAGiMbmYN!{FLkscF+$zmN#Je1`} zD?GZZe=00_Xr*&Ypx(D9MPlC7jWTMAN> zhD`2f^X`~^5{eJ0+HSS`xSA%xJ;syr>h_PhpI)?GvRb@loR?C2 zSu#r$epXs^w+pAk4>!#;XyD9$f+ks87Yo~^;h}Td{gR%9yYY1UEe9OXR7_*gMit!- zcaTqb;*{SQWj-lo5P6JoyKj8ztl$sMb^DPHtuwIa0TQU&&2rj@;&FJ z83Vk{88KD$((!8csr?WNSohgRoT$I12Xa5{-4DNz zVij&Mh%!pKQ;c+q6==EpHCPfM)*|IU40BgK$0~_We=08V$@T{%RnY9~pKr~lta{d=?f&e`b{q{l=0i7?_-y>7`KljIXw}69YS^ z-uI5%pO`Jqga!zjzYb!>jR7CA&Zuu&l&m(&JdT*jML5)lbNxiC9<(6DRmEprBIxfB zN1_$^QuYqUp-rx|&6jX$mO3gHEvH+^jb2D@_;`eW4c}@^p4Vk5@9|sEk1>o5GZj15 zijr(P5x4ytTI|}O6v9d&w|4|;y0TR%qei!(9K0R>5MmZhr<0n=Fx(s8fj)z99#Vgo zQGCZVRf6+wt1*}meskeijGC{yO8eA@j#6mEuo$h}w&ZK+Vj?J>1eodASrVJccODkp z_86;9-ik06Zlwe6EVJuD`@86g~9Gp?0JOA@8y~9kYKz z;r*-x_5_aN$|p`X12WTlRdpkd6)IIY&iM~Dvd~_8jf#0pTM(B}ktXeKI#BKkuO+Fg z(RXU))E5pEnU7Y~$u}U3ifspq-fZj?(A<$bBDTy^7Z*@L<(uv9Hks8)DkvPte|A=j zM0%uVx`^W}H6_de!3ktT`1xkD`raDIcMx_;cz~q2eLbtiJ!3qqajkS7%$Yt`o5EyU zieh70*bu5oIlLyrCI>Tae~W%@mr@%&{+hqF09@g}@_%s~S$lj--*#h_ID2G#4HLQB zi|}0#qCmGrid`OK+S?LQ7VFu-+LBrn9B=|zj1Q`l+x(DnvJP=u1yIB`4YPzXI5Hw# zCWknhbJlR2Ymy0K<2^%YWmKKq8b-g}GshVanrCqa*Zgg60gLrXu6)j2C;4+9YmXNl z@wi?0(Yii@o>5k5pW@fNZz*w-)EUhNa^X8lsoG}6b|eTS@FnNYP}z+{mD*bBb;P~! zpe9dY<(6eIryx8Sr@)_12rS;}k48^FXL6T)C2^oYm^~C1%b-DfM_HfAfRSNob@PbN za?@(C!Z&E0FSzA)C;*LM|A( zO$b>AR{Lck*BhbHmn|J!oI;@BIHhVp5G~ptyRLIZX2~i4A>d4LHul%_kl9Vqv)>ES zYg_%o;LQ{(K(P^NLM|sZ6DNpq)Ev3|b~r-(Y5J#(+kDSbYckCmt>LVuC}P&=QeAgO zXWHpM08~Q`XGw@nxdl%~9VZ^a2tj5V0c}I+p858HG`6BC+}zh`9rNh^#LnIf4Z$sGEwVo+4$3P*dH;zAv3@Tdw5uL?0Pxk}Flsl&Cj# z(mW;iNQe?4i(|aOwdy^5U&m10o6e#HI3>MT>2~y%A6$cwNGQfP+PQar4A-(hG=$v% zy>@ldM3`)wV)fXYD2ikaQw76MrL0|86O#Dwzzq3*q`c`g{k$_|9?_qu+SVv^HB%>= zl~EkaX>*;Z`|6>+>A6dg-spi^eB=!yl&u&J6ZPJg-6(U4uufAC*Ue7*_6##2YKyVl z#g2esVQ{b$9{p&g#2G*McE*wbFJ{x-RrZz>uG};9)&TXAhiX#uq=yBgV`Cn>Y5L-C zaT8}bYc=dphKpXbrfyZ4-N7B~s%ve8zwcjez$TR?)$C+#`tl=4ILVeH;NLb7;#=Cy zwN!I?+vycHQwKQuQd-^5X=-~C5XBM2Yxah%A8@r|Fq|mfU`6dGKCr@9OWoR$OZoCf zig|n{hez|R#$?;karAPGt>PJ6MYDpwVGeEcec{IbN|H5~hDS7(o*+aYCfCQ?WEvW- zQ)Sfh-;y^rOTRNbe2U9-mRiF2G>_TsLaR8*mg32~5yg&7tnrwUCo=WJF+=UPZf$8PPL?CkQFQr%|Y?h zk|UR3B0QYk>oR1%H95K4h~Sip!M6lMBxwt$$n_N~O9Dx3POm_gr$Kv{>W~7{uezpX z)z)63q)}n}N~gl8^l50QDHAN+)xGpPsndFPinS~ZwwKg+EDJOh7Ap37tOh0zw%K>m zHJ5j9Ap|jm2eBs4*ynA=8+txcZ)sv6bxXiMbrC+Pj{T<9>E`83Sja7rv(uAV<^pRy z+}QI&xHW|L+oU5!j`8c{<*5nv1}+-I0&^PeBv|W`SbLD6QRxV*jq6DUX_F2O)bTi6 zKjX$^l(Lz4e(kBp@^}OTc>3xkvz^@x^m1Ehi63fe^jZZL*AiJ266oF+7Eh4N z!qJW{(`DAUrUw?fTDmJ{bn~@t(l@`^aA^pebrRHit3g@zbev_Q)M)VrA?}sz7ap!z z&ZAThhsiyHs=DQ`WMIm+AYwV;NV7fE`>NJuMZ_Bp#_oD(q?^9x7cfS!FtPZVb;ES*PJ zi)x5X;(%TnQz#1itT2&nIueayMh1)MhKqNiw4MK!LRSV3;*S6{qcmj^rm_!ZLQV>A zBJqpHD{*NkdN-$$IC0a%vVx7&k7Ukv*q~?AN@O4EJb!YVrt<46S54Lfpe*{!1h!RUPd}qq6FRpx6tu20|^%CFDQXUnX0v;v5 zlqZ^}@VuLtM)0JTfiNX0$erulA(8kh3(0YEIOWAbkr*v_Du?T-W?zs*-iN~qlf%=Z z@PIAkHoy>iyyVy=RJtr@VpQa5(G_WzmdH=@wXp;D2}t23Ishb={73PzRt3l|9pTVl z7frkJV*kvkDrCSU21v!H9UU`#3P=;8ZOWO<_XNPi_iAY8AA6#x6yN}?j_!$6X6cj4 zL+7Z$$QcG&d&_!dL~PDjr)|^uMtP?ZfZsi}siR(9HzaSY*rIBUF!$nywGQ*IpMbgiE#khY=q;k>>v9Pz^of_Ut^p;=8DvWhok0MBLO6TC_c9cUizL@)H zW3aXX@sgR43<=iN2#<6MOKyt{E^M^>dN7g{RT_HkbPjB%=r;+eZR{l_ln^r?))RCp 
zwLRdlWHoeb?y0ox+Y$4As_G&+L3;H9ZJX&6Yl1@nR6B`l(+bRI{^PJ&m1q3w!8vF^d5Oi= zCctOR=d*g+DL{2bWT7ZFmFD^OX23(J7Mvf?dTJDQ8)bJ7zT?~F?w6z!wrpdfMO z{BhG|Z;L1_(YiI2Y2`9|fFJ?{NOT9%=K!+NRo|97rtfZ+paAl?VxVpSGN0?fpvb}G zdmB+J37du_UyjDCG+3Yd%#3Eu#baP_6@dlH$n;?V&z}fV(s%{ZU*(Ms5&*2EsKuFK z3F6m3+aJXKQTt>qSv$re>OTh%qMO`kHj=UcNqQnWZFWVraQOgxKa z%?j{)%usD3``tGH^cziTanquHhbc`0QKh<#x-N-%7bbe@^dI1E_poML!zbEktlEmz zKiLO*R8bxS<485Gw1p6C4?gL(KH{J|lZ75&754QVfC5#@=eYR#$pbH-%e5EpIrCRWgSc81(5lFb;Gca&Wtqbkm1N(~}V-lnHz`^}zqA^JDzVTk#x3POCT`6w&P{V$gU>FwpXrEC$+G$bF+~~Homt~Ya zDCm>Eof1FRG96KQ7l+}qYWE>?Z-)EO!trgC?&C$3_2k;@VKvRl7(qPL_bw@9T4_&i znCuxOZo;{W;3}~8F76wa_(W|)J~J|VjrUm0npwL)ErZ1CHsUwBwS8*-BL&fJT)6BT zto4a+Tq{j8a(xp9r8VLok+Y!bmAOT`ul(FWF!G^-l6Kc<`74AhfKPVYy_*#}I($*8 zjmk%Oosr*dunR|E?zM@b#)X~d5sJma^Rb(V>wI;^Nx4G1GbBerV=wUGH{TRV#; z69N-?_1%?7vPa>tyAF`azvk*ICE`wn86WuMT~5*zj`?d&O_&G8?7BT4CH)vnbr<;t zlf))ZT*0ao@m26MALuc^sS!v0EF39RcYuN|{*|Pyk^)Qg+A)P?w^d7raB(rS@}eu# z7u@tW^t{<9(N91{O|X>%QU}-DP%}uvQ>$-z$+dp);v@YhxvBGOeB^Ls`L?r>M+Z_V z4kfx7mO01e){n`QrwqLJ#cyWa<-Pg=lnY3-lvTBPyRCgoame$2Cn-@4u+LKl>71+F z`;}j(ZcbPkG^)l}AV<&M^4fwYgV24%N0A3xI7Hag_6;WyA90Mv9k+Tsg(#f$u>F z@~P@$M;ZoC4d!Prx6($aFxIhGo=DFg|Fp-#2y5e8^HNVhnUfSPy%v80L0sAPeQ-B}jwO{(M+DOM%&PztH^tbuwwzKpI5b%)S zAJ7=b6_91%?D^g<@ByXczHXjj@6M#mE{prB7bogfqk*pZuS`sfB?D7@@70upP?zJ) zSRKy1%Lx^GdVn!)t;UsSMsWZLGinU&E|7SOpu(uAI^1xx`1J6CIGVM3AAxjRtzOk* z9q(aI*w3Znb+XUXZ0ngz=b^0o1n_QRrsoTw3jp=9O0?xLt}+?@wS|C|76iS|?!vj@ zJ$rvCQg)n?fjyoPJh_}g_DbH^&FHcVF8c$L-<6epUHMBm)F?%h(rVJ#ThN7yTBjd4 zV4hZ)^3ike8iP8`IpRK-C?^;pNS(`E(+u8VJ4`W(F1CXt?mnoruZGeOk7oF#4%BIFy>P+MtAYqdpwoh- zwlN>ahYi1@(0~=}Tu;zy6@pO;Rn58#u{6d>86(C*`R^kf=ajh|30f5s2e>!G=sfk4 z4c|L*s!p<9?aRS`M~Rtm-n4;pE%o@poLdxFhD5|STM5;b|JFhw&k-3^2&5yvz4*J7 z{;QtOgF@h3zvZB>4OQ3wDcqVo1+zfaeWLQm5Tco&_&)DeUs3jV=^L7AdUi2TRm3#& zq5oruJVh{sqrm*M=0BrC;}g6Hgxx@{7EbzOh;dpl1W&f(*H_ShKSzarM1knU_36w# zxj%-mNd#rp%+dNMn*V-b@S7Bf-1x;DFpB;dqD~gL7mC*NO+TaGe}<;QVP7Jls1H~B zV~9CwlaATqT1P{WPyY8KgD{~TdUsy@=MMd62uCdTC5Nfm`v3Q)IjBg=llj;4u~)!F z;<4zpc<$Gwe0CX&>$tqU1PF5;aoie4XtATvhFNrLEYcW9M@FhI;?mL8VSZ?ugdtq| zD}Efv{&tA_CbN>u%+^}7qd7)bxOTcMv*VY6c*5o8cON>+ELk7+KFNbwuqo)s<30ay zt!a zFP}hftStE@>7Nhx0GyVKr=013ZAXRZ4nX>WFu2l>F$6aAMgtFl0`FTOgeChLK$qMR zvw#PzJm&}6j*o6K;xhwHk^a zn;g$Rx}-QRJIq^}`~7G4f)gHtJlFN-r#WJS-8)a%(DvJL1N$ZdoN#IQZ>GN+1JR&* zl+$~o_1TZVLLd_j{D^^Dh@Tc)Ys~D>tKvuA{k713I-0`zV(I|%2XtdZ z^HM#K^u!yj&Jx7<(G&ZF1-^TWu69Z#~q$WAj&-1dg@i6s^ z?lmxR-YQc}<5<8=W8t6BQvbQ$L8aJxtd`qKx}k7Ck4HKAch)~vAP}w{vf-!K%7bO2PO$}h09#N_PXAF(I z>`Uc85i9>(<sN$ZvN2f2ZS*E9ZZw z69HX=(65XJ=EVzw5!nALPiKlZ!6Xur7tst5w#>ZSnvF$KVr_5fkv2modU zzb3Q3Tok$fBGPu4{oSHV-uQeHG}e4WE_tgA`Jf%-di}rqEH*EP5hEXe1o#Nh;Km2A zSk4Z?_edzXTIX$V1WFe-YsS}W#y7{tWIx{a34b9Rn*kkh=`Ii2KuaG$*p7u-U;`PB z&!B0hoZUx9Y@e$gYZ0G8^_&Ccn ziX?LSZmUfEkcM_pV5=~3^07bY(#Q%jAWlxK6$}O@_$!k@FWU#XSIc;*xJRpe%Z`#um^BT15=AyM(^F>I=B+!_GaG z@Qs@VydItzlMs-wfI16or+;`=R_P6REh1la$MT<4TCbGKI0L1V*5%ue@?JvJlIvx% zYtvKCRBS^4HfQsh9lGba1be6My_!75K)VPags4#ud`W0%S!0*f*&B5e_8`Eq=-Sz{4mhy} z%@^p=7W#~VZb69UMDTlvOo%?`dt7F$ctKFD^~NL*LF0*5d`{idADE61NoQl_a@3=( ziz)#i(i3YtsIcz1k3U56`_a5Oaxa;kFJG}q7`t<#`UJ>%6jO0?+yHWy*x=&Tu1D58 zz+wL}+uGZvP_(Hacw(jl^Xqtofk>O=^Ee4k8YV1al;lk-e%oFsR6uZ))`+6W=2B}t zfAtW}+M*uhc&1a!_>b@Ah5Ue6B3Kg>Nv6F}WAdH7m(s%9ppU|X)j07}85KE=bWW&$ z?_Q-xYh>)x0&reZ2{s6|@7qKDs{tDzGHTT#T{U~Qb#m9-U?&5N!-BSYvyC~ z>uU*sZ7!&|Z2)KvXSoaAAJYrG7{=3J93MtWQePOl&sqZB%=1jcjF1)9_UbOB$GsC{ z7SEa3H6`iH3R|A0PF&_NU4Ud4j(ts+U{%vLBO*slj!Gg!uuJne)j7~GiAWx%OPUwx zjfEIxKr#Ca(hdPCKm_I-LUq4P`x|6FTNRe5GX?*>5VAd;E70(UlgsXPbojm-jCq3S zdC2}gIJcjGP{85E(bXA~@tK@L)MLH}$gM$glxAN;j 
zKUyiIp=%7&!BV`4saDVdtPOBN3MmZ?Ar>K8x$@z(6_d=Ns(~tmou(83D983e6RzSX zUhmM1_JS~c12e5ywcWjLE*6y?q#jAl$pZs6#>tx0TLh6X;)yHNx|@bQpymY}Z)14q zX)!I&RRqf0R%zItRVEZdwCzPeQK=C{i51v>In!`xJonCwhYnP^vGyfLiWHxi$% z4dnE;@@36GCgvPb$6S95xcZ2W@7sx;6E}Hyxe1yWeW40LG^e?WrXqsK__tVSMpSEET%~uq+E#-|w-7u%OAm9b_z% zaKbe0<$Mwq3UM&-Ty7rxXdH{akHg$xYe3fMy+xqVZ~}~d!SS8k-9v%1RXP#=afAW! z4+%qpqlTR%N3W=^7H_T=1HKS1np7J4tFpUipdCY)dE=A=*=JLjhChQhf_4}Wy{=0M zgs=hAlanmx7mf5Z=$bjk&@nhlK(O1aXbATtt7olZBj-FI0s-bI!j^K&gXO&CUA zRfD@XC=n(>yC6|~+=#FOf1(&(Ol(0-E+xlQiysy}%Bqx8!+c%forWYR;=@eHrzPHwTCl@c2ORPS zBo3v-!f(SDed4@)U-CebWTp`xlbPk|r;}}ioA_8?YEDxE3-_Hi;opRU#?Lg^IKyirBs1Ncl}u^)D;RCw2)Z?1hh~bYtS*Re)jl_ z9&PdS6rCI@m9Qh5kPW^M&o!0>2#aR0*H9f6XZqIdLy=nmt&DXcw8ZfHc|9b{^sk>Y50D^JHkK6V0@9Gm9-%pE`F! zgRgw*r7$iAhbnsO3q;f9q9QtL%r2T)P{7`q$P9#+P<(tb%FXq(w48#{l5$aJY0*jK z5;21GOHp>+@Sah(j6x3bUOvvCXc?QtW$)&2IA3Bi_a&C1WrLco9;a=lUpLh~*nZIL zSb~c)luqSgtZDRcUWjJp*YES6PwIh&S*R~x4}2N-Q%=U@R!`Ipw`L2O4NW4?TBKJk z3R4`yf~QOoEQx~$>~{_~NrtG{NBK92S4T*c%OKA8 z@hRedb3~=Muk6-!M{=%DOXi&o2y6(vV-$44+tP|*!xp{nivZSavV14jcC%KqU<&hU z|8zOs+x;qUXylEvOTifSrp^0&)OzPT-gb7%A~oOfZ!2~yk;Gj1XC$aeA_wK8yVWZ& zO^Mkp7xFUjh87m4N5&+zgi&zHvSe|4)+DQJ1MCL%Yl_Y&g-LxfWUX43bcZ6+2X|V5 z4+YTsiai<}N4djAvD^~1F^`bITA7}YcjrF|`i~o=I&N3>>E$sU+Fd!w} zCDIJtBGLi_LkZF#Eg)Uef(n8X%1}ciB?c%U4RRO@&D!@``@ZK|_l|2{zyE)dlaH}tuMt|6Y|DZx0?ZtS6GgdEfxBw553~`!#x!`j zlROH=c`@|ih0nb2v4>D1hYmBOUnp^2&M@Sax~mSUKYVSCACgsCMbES|;dq-1@Xm#^ zlI{X#E90%P(}uZ+M(U8O5*!2x;uw`oE3}WyL(7Vt2R`AmBv)dLl@q>z8a}W4hq@NW zSDxpZN;s9O_3WA8VsxBpQkIt#4I0ETIH9vrpGS%ZlEDKlCX0oTR zB#QBJuCS+G_Zk1 zkQ5DwWU8*?@49PQ12r@49xvOfL3(-BK8x$O`qhHr0KX)N6*3@u)Zp;3ej>_Vz0JI2=I4W`J}CvP;?1?W(ueMw0TlGvgd{PB~cAIJ>e*Ixn~; zierv)dK=$qu+eYn=N+@Njr;Wq)Z};?W*L4+k$wvxL%(dJ+B5vVHNGaZj|;jlG*e2x z;*GDOP*p#7o%Eb8%*we^DfGcRL7ORBXI&7$G~BD-c=;gxVB6dMs?Tm9HnHlfAv~2>PmDfY`QRH! 
[GIT binary patch data for a deleted diagram omitted]

diff --git a/format/diagrams/layout-list.png b/format/diagrams/layout-list.png
deleted file mode 100644
index 167b10b11e37e761de81de8fa9fc8c5c9a30e4f8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[deleted binary content of layout-list.png omitted (literal 15906)]

diff --git a/format/diagrams/layout-primitive-array.png b/format/diagrams/layout-primitive-array.png
deleted file mode 100644
index bd212f081151234c01f5814be4a4e4e1e4841835..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[deleted binary content of layout-primitive-array.png omitted (literal 10907)]

From: Dan Robinson
Date: Sun, 17 Apr 2016 15:25:39 +0200
Subject: [PATCH 061/210] ARROW-103: Add files to gitignore

Patches [ARROW-103](https://issues.apache.org/jira/browse/ARROW-103), though
perhaps it would make sense to leave that issue open to cover any future
.gitignore-related pull requests.
Author: Dan Robinson

Closes #62 from danrobinson/ARROW-103 and squashes the following commits:

7c1c7d8 [Dan Robinson] ARROW-103: Added '*-build' to cpp/.gitignore
633bacf [Dan Robinson] ARROW-103: Added '.cache' to python/.gitignore
59f58ba [Dan Robinson] ARROW-103: Add '*.dylib' to python/.gitignore
52572ab [Dan Robinson] ARROW-103: Add 'dev-build/' to cpp/.gitignore
---
 cpp/.gitignore    | 1 +
 python/.gitignore | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/cpp/.gitignore b/cpp/.gitignore
index ab30247d493..4910544ec87 100644
--- a/cpp/.gitignore
+++ b/cpp/.gitignore
@@ -5,6 +5,7 @@ CTestTestfile.cmake
 Makefile
 cmake_install.cmake
 build/
+*-build/
 Testing/
 
 #########################################

diff --git a/python/.gitignore b/python/.gitignore
index 3cb591ea766..7e2e360557a 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -18,6 +18,7 @@ Testing/
 *.o
 *.py[ocd]
 *.so
+*.dylib
 .build_cache_dir
 MANIFEST
 
@@ -35,6 +36,8 @@ dist
 # coverage
 .coverage
 coverage.xml
+# cache
+.cache
 
 # benchmark working dir
 .asv

From 0b472d860260f7063aee742939be23b921382741 Mon Sep 17 00:00:00 2001
From: Micah Kornfield
Date: Mon, 18 Apr 2016 19:44:29 +0200
Subject: [PATCH 062/210] ARROW-82: Initial IPC support for ListArray

This is a work in progress because I can't get clang-tidy to shut up on
parameterized test files (see the last commit, which I need to revert before
merge). I'd like to make sure this is a clean build and that people are OK
with these changes. This PR also has a lot of collateral damage from
small/large things I cleaned up on my way to making this work. I tried to
split the commits up logically, but if people would prefer separate pull
requests I can try to do that as well.

Open questions:
1. For supporting strings, binary, etc., I was thinking of changing their
   type definitions to inherit from ListType, and hard-coding the child type.
   This would allow for simpler IPC code (all of the instantiation of types
   would happen in construct.h/.cc?) vs. handling each of these types
   separately for IPC.
2. There are some TODOs I left sprinkled in the code; I would like people's
   thoughts on whether they are urgent/worthwhile to follow up on.

Open issues:
1. Supporting the rest of the List-backed logical types
2. More unit tests for the added functionality

As part of this commit I also refactored the Builder interfaces a little bit,
for the following reasons:
1. If ArrayBuilder owns the null bitmap, it should be responsible for
   providing the methods that manipulate it (a standalone sketch of this
   bookkeeping follows below).
2. This allows ListBuilder to use the parent class plus a BufferBuilder
   instead of inheriting from Int32Builder, which means it doesn't need to do
   strange length/capacity hacks.

Other misc things here:
1. Native popcount in test-util.h
2. Ability to build a new list on top of an existing one by incrementally
   adding offsets/sizes
3. Added missing primitive types in construct.h
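As a minimal, self-contained sketch of the null-bitmap bookkeeping the new
Append methods perform (an illustration only, not code from this patch:
`BitmapBuilder` is a hypothetical stand-in for `arrow::ArrayBuilder`, which
additionally allocates through a `MemoryPool` and reports failures via
`Status` instead of growing a `std::vector`):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for arrow::ArrayBuilder's bitmap facilities; the
// method names mirror the ones added in this patch.
class BitmapBuilder {
 public:
  // Append one validity bit, doubling capacity when full (the same
  // power-of-two growth policy the patch uses).
  void AppendToBitmap(bool is_valid) {
    if (length_ == capacity_) { Resize(capacity_ == 0 ? 32 : capacity_ * 2); }
    if (is_valid) {
      bits_[length_ / 8] |= static_cast<uint8_t>(1u << (length_ % 8));
    } else {
      ++null_count_;  // null slots leave the bit cleared and bump the count
    }
    ++length_;
  }

  // Vector append: each zero byte is treated as a null, matching the
  // semantics of ArrayBuilder::AppendToBitmap(valid_bytes, length).
  void AppendToBitmap(const uint8_t* valid_bytes, int32_t length) {
    for (int32_t i = 0; i < length; ++i) { AppendToBitmap(valid_bytes[i] > 0); }
  }

  // Mark the next `length` slots valid, as SetNotNull does.
  void SetNotNull(int32_t length) {
    for (int32_t i = 0; i < length; ++i) { AppendToBitmap(true); }
  }

  int32_t length() const { return length_; }
  int32_t null_count() const { return null_count_; }

 private:
  void Resize(int32_t new_bits) {
    bits_.resize((new_bits + 7) / 8, 0);  // ceil_byte(new_bits) / 8 bytes
    capacity_ = new_bits;
  }

  std::vector<uint8_t> bits_;
  int32_t length_ = 0;
  int32_t capacity_ = 0;
  int32_t null_count_ = 0;
};

int main() {
  BitmapBuilder b;
  const uint8_t valid_bytes[] = {1, 0, 1};  // middle slot is null
  b.AppendToBitmap(valid_bytes, 3);
  b.SetNotNull(2);                          // two more valid slots
  std::cout << b.length() << " slots, " << b.null_count() << " null\n";
  return 0;  // prints "5 slots, 1 null"
}
```

The design point is visible here: because the base class owns the bitmap,
every subclass gets slot-count and null-count tracking for free, which is
what lets ListBuilder drop its Int32Builder inheritance and track offsets
with a plain BufferBuilder.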
Author: Micah Kornfield

Closes #59 from emkornfield/emk_list_ipc_PR and squashes the following commits:

0c5162d [Micah Kornfield] another format fix
0af558b [Micah Kornfield] remove a now unnecessary NOLINT, but mostly to trigger another travis-ci job that failed due to apt get issue
7789205 [Micah Kornfield] make clang-format-3.7 happy
6e57728 [Micah Kornfield] make format fixes
5e15815 [Micah Kornfield] fix make lint
8982723 [Micah Kornfield] remaining style cleanup
be04b3e [Micah Kornfield] add unit tests for zero length row batches and non-null batches. fix bugs
10e6651 [Micah Kornfield] add in maximum recursion depth, surfaced possible recursion issue with flatbuffers
3b219a1 [Micah Kornfield] Make append is_null parameter is_valid for api consistency
2e6c477 [Micah Kornfield] add missing RETURN_NOT_OK
e71810b [Micah Kornfield] make Resize and Init virtual on builder
8ab5315 [Micah Kornfield] make clang tidy ignore a little bit less hacky
53d37bc [Micah Kornfield] filter out ipc-adapter-test from tidy
8e464b5 [Micah Kornfield] Fixes per tidy and lint
aa0602c [Micah Kornfield] add potentially useful pool factories to test utils
39c57ed [Micah Kornfield] add potentially useful methods for generative arrays to ipc test-common
a2e1e52 [Micah Kornfield] native popcount
61b0481 [Micah Kornfield] small fixes to naming/style for c++ and potential bugs
5f87aef [Micah Kornfield] Refactor ipc-adapter-test to make it parameterizable. add unit test for lists. make unit test pass and add a construction method for list arrays
45e41c0 [Micah Kornfield] Make BufferBuilder more usable for appending primitives
1374485 [Micah Kornfield] augment python unittest to have null element in list
20f984b [Micah Kornfield] refactor primitive builders to use parent builder's bitmap
3895d34 [Micah Kornfield] Refactor list builder to use ArrayBuilder's bitmap methods and a separate buffer builder
01c50be [Micah Kornfield] Add utility methods for managing null bitmap directly to ArrayBuilder
cc7f851 [Micah Kornfield] add Validate method to array and implementation for ListArray
---
 cpp/CMakeLists.txt                     |   2 +-
 cpp/README.md                          |   9 +-
 cpp/src/.clang-tidy-ignore             |   1 +
 cpp/src/arrow/array.cc                 |   5 +
 cpp/src/arrow/array.h                  |   6 +-
 cpp/src/arrow/builder.cc               |  56 +++++++
 cpp/src/arrow/builder.h                |  46 ++++--
 cpp/src/arrow/ipc/adapter.cc           | 136 +++++++++++-----
 cpp/src/arrow/ipc/adapter.h            |  11 +-
 cpp/src/arrow/ipc/ipc-adapter-test.cc  | 216 +++++++++++++++++++++----
 cpp/src/arrow/ipc/memory.cc            |   1 +
 cpp/src/arrow/ipc/metadata-internal.cc |   3 +-
 cpp/src/arrow/ipc/metadata-internal.h  |   3 +-
 cpp/src/arrow/ipc/metadata.cc          |   3 +-
 cpp/src/arrow/ipc/test-common.h        |  67 ++++++++
 cpp/src/arrow/parquet/schema.cc        |   2 +-
 cpp/src/arrow/schema.cc                |   2 +-
 cpp/src/arrow/test-util.h              |  49 +++++-
 cpp/src/arrow/type.h                   |   2 +-
 cpp/src/arrow/types/construct.cc       |  43 ++++-
 cpp/src/arrow/types/construct.h        |   9 ++
 cpp/src/arrow/types/list-test.cc       |  80 ++++++---
 cpp/src/arrow/types/list.cc            |  60 ++++++-
 cpp/src/arrow/types/list.h             | 112 +++++++------
 cpp/src/arrow/types/primitive-test.cc  |   6 +-
 cpp/src/arrow/types/primitive.cc       |  45 +----
 cpp/src/arrow/types/primitive.h        |  41 +++--
 cpp/src/arrow/types/string.h           |   5 +-
 cpp/src/arrow/util/buffer.h            |  59 +++++--
 cpp/src/arrow/util/logging.h           |   2 +-
 cpp/src/arrow/util/memory-pool.cc      |   2 +-
 python/pyarrow/tests/test_array.py     |   5 +-
 32 files changed, 839 insertions(+), 250 deletions(-)
 create mode 100644 cpp/src/.clang-tidy-ignore

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f803c0fb3e4..b38f91e5d68 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -565,7 +565,7 @@ if (${CLANG_TIDY_FOUND})
     `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`)
   # runs clang-tidy and exits with a non-zero exit code if any errors are found.
   add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json
-    0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`)
+    0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore | sed -e '/_generated/g'`)
 endif()

diff --git a/cpp/README.md b/cpp/README.md
index 3f5da21b7d4..c8cd86fedc6 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -76,4 +76,11 @@ build failures by running the following checks before submitting your pull request:
 
 Note that the clang-tidy target may take a while to run. You might consider
 running clang-tidy separately on the files you have added/changed before
-invoking the make target to reduce iteration time.
+invoking the make target to reduce iteration time. Also, it might generate warnings
+that aren't valid. To avoid these you can add a line comment `// NOLINT`. If
+NOLINT doesn't suppress the warnings, you can add the file in question to
+the .clang-tidy-ignore file. This will allow `make check-clang-tidy` to pass in
+travis-CI (but still surface the potential warnings in `make clang-tidy`). Ideally,
+both of these options would be used rarely. Currently known use-cases when they are required:
+
+* Parameterized tests in google test.
diff --git a/cpp/src/.clang-tidy-ignore b/cpp/src/.clang-tidy-ignore
new file mode 100644
index 00000000000..a128c388896
--- /dev/null
+++ b/cpp/src/.clang-tidy-ignore
@@ -0,0 +1 @@
+ipc-adapter-test.cc
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index a1536861a20..c6b9b1599cd 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -20,6 +20,7 @@
 #include <cstdint>
 
 #include "arrow/util/buffer.h"
+#include "arrow/util/status.h"
 
 namespace arrow {
 
@@ -47,6 +48,10 @@ bool Array::EqualsExact(const Array& other) const {
   return true;
 }
 
+Status Array::Validate() const {
+  return Status::OK();
+}
+
 bool NullArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) { return true; }
   if (Type::NA != arr->type_enum()) { return false; }
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index c6735f87d8f..f98c4c28310 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -28,6 +28,7 @@
 namespace arrow {
 
 class Buffer;
+class Status;
 
 // Immutable data array with some logical type and some length. Any memory is
 // owned by the respective Buffer instance (or its parents).
@@ -39,7 +40,7 @@ class Array {
   Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count = 0,
       const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
-  virtual ~Array() {}
+  virtual ~Array() = default;
 
   // Determine if a slot is null. For inner loops. Does *not* boundscheck
   bool IsNull(int i) const {
@@ -58,6 +59,9 @@ class Array {
   bool EqualsExact(const Array& arr) const;
   virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;
 
+  // Determines if the array is internally consistent. Defaults to always
+  // returning Status::OK. This can be an expensive check.
+  virtual Status Validate() const;
 
 protected:
  std::shared_ptr<DataType> type_;
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 1447078f760..87c1219025d 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -25,6 +25,25 @@
 
 namespace arrow {
 
+Status ArrayBuilder::AppendToBitmap(bool is_valid) {
+  if (length_ == capacity_) {
+    // If the capacity was not already a multiple of 2, do so here
+    // TODO(emkornfield) doubling isn't great default allocation practice
+    // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md
+    // for discussion
+    RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1)));
+  }
+  UnsafeAppendToBitmap(is_valid);
+  return Status::OK();
+}
+
+Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length) {
+  RETURN_NOT_OK(Reserve(length));
+
+  UnsafeAppendToBitmap(valid_bytes, length);
+  return Status::OK();
+}
+
 Status ArrayBuilder::Init(int32_t capacity) {
   capacity_ = capacity;
   int32_t to_alloc = util::ceil_byte(capacity) / 8;
@@ -36,6 +55,7 @@ Status ArrayBuilder::Init(int32_t capacity) {
 }
 
 Status ArrayBuilder::Resize(int32_t new_bits) {
+  if (!null_bitmap_) { return Init(new_bits); }
   int32_t new_bytes = util::ceil_byte(new_bits) / 8;
   int32_t old_bytes = null_bitmap_->size();
   RETURN_NOT_OK(null_bitmap_->Resize(new_bytes));
@@ -56,10 +76,46 @@ Status ArrayBuilder::Advance(int32_t elements) {
 
 Status ArrayBuilder::Reserve(int32_t elements) {
   if (length_ + elements > capacity_) {
+    // TODO(emkornfield) power of 2 growth is potentially suboptimal
     int32_t new_capacity = util::next_power2(length_ + elements);
     return Resize(new_capacity);
   }
   return Status::OK();
 }
 
+Status ArrayBuilder::SetNotNull(int32_t length) {
+  RETURN_NOT_OK(Reserve(length));
+  UnsafeSetNotNull(length);
+  return Status::OK();
+}
+
+void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) {
+  if (is_valid) {
+    util::set_bit(null_bitmap_data_, length_);
+  } else {
+    ++null_count_;
+  }
+  ++length_;
+}
+
+void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t length) {
+  if (valid_bytes == nullptr) {
+    UnsafeSetNotNull(length);
+    return;
+  }
+  for (int32_t i = 0; i < length; ++i) {
+    // TODO(emkornfield) Optimize for large values of length?
+    UnsafeAppendToBitmap(valid_bytes[i] > 0);
+  }
+}
+
+void ArrayBuilder::UnsafeSetNotNull(int32_t length) {
+  const int32_t new_length = length + length_;
+  // TODO(emkornfield) Optimize for large values of length?
+  for (int32_t i = length_; i < new_length; ++i) {
+    util::set_bit(null_bitmap_data_, i);
+  }
+  length_ = new_length;
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 21a6341ef50..7d3f4398d73 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -34,7 +34,10 @@ class PoolBuffer;
 
 static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5;
 
-// Base class for all data array builders
+// Base class for all data array builders.
+// This class provides facilities for incrementally building the null bitmap
+// (see the Append methods) and, as a side effect, the current number of slots
+// and the null count.
 class ArrayBuilder {
  public:
  explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type)
      : pool_(pool),
        type_(type),
        null_bitmap_(nullptr),
        null_count_(0),
        null_bitmap_data_(nullptr),
        length_(0),
        capacity_(0) {}
 
-  virtual ~ArrayBuilder() {}
+  virtual ~ArrayBuilder() = default;
 
  // For nested types. Since the objects are owned by this class instance, we
Since the objects are owned by this class instance, we // skip shared pointers and just return a raw pointer @@ -58,14 +61,27 @@ class ArrayBuilder { int32_t null_count() const { return null_count_; } int32_t capacity() const { return capacity_; } - // Allocates requires memory at this level, but children need to be - // initialized independently - Status Init(int32_t capacity); + // Append to null bitmap + Status AppendToBitmap(bool is_valid); + // Vector append. Treat each zero byte as a null. If valid_bytes is null + // assume all of length bits are valid. + Status AppendToBitmap(const uint8_t* valid_bytes, int32_t length); + // Set the next length bits to not null (i.e. valid). + Status SetNotNull(int32_t length); - // Resizes the null_bitmap array - Status Resize(int32_t new_bits); + // Allocates initial capacity requirements for the builder. In most + // cases subclasses should override and call their parent class's + // method as well. + virtual Status Init(int32_t capacity); - Status Reserve(int32_t extra_bits); + // Resizes the null_bitmap array. In most + // cases subclasses should override and call their parent class's + // method as well. + virtual Status Resize(int32_t new_bits); + + // Ensures there is enough space for adding the number of elements by checking + // capacity and calling Resize if necessary. + Status Reserve(int32_t elements); // For cases where raw data was memcpy'd into the internal buffers, allows us // to advance the length of the builder. It is your responsibility to use @@ -75,7 +91,7 @@ const std::shared_ptr& null_bitmap() const { return null_bitmap_; } // Creates new array object to hold the contents of the builder and transfers - // ownership of the data + // ownership of the data. This resets all variables on the builder. virtual std::shared_ptr Finish() = 0; const std::shared_ptr& type() const { return type_; } @@ -97,6 +113,18 @@ // Child value array builders. These are owned by this class std::vector> children_; + // + // Unsafe operations (don't check capacity/don't resize) + // + + // Append to null bitmap. + void UnsafeAppendToBitmap(bool is_valid); + // Vector append. Treat each zero byte as a null. If valid_bytes is null + // assume all of length bits are valid. + void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t length); + // Set the next length bits to not null (i.e. valid).
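+ // Sketch of the intended checked/unchecked split (mirrors AppendToBitmap in + // builder.cc; illustrative only): + // RETURN_NOT_OK(Reserve(n)); // checked: grow the null bitmap once + // for (int32_t i = 0; i < n; ++i) { UnsafeAppendToBitmap(bytes[i] > 0); } // hot loop, no re-checks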
+ void UnsafeSetNotNull(int32_t length); + private: DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 2f72c3aa846..bf6fa94dea7 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -19,17 +19,19 @@ #include #include +#include #include #include "arrow/array.h" -#include "arrow/ipc/memory.h" #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/metadata.h" +#include "arrow/ipc/memory.h" #include "arrow/ipc/metadata-internal.h" +#include "arrow/ipc/metadata.h" #include "arrow/schema.h" #include "arrow/table.h" #include "arrow/type.h" #include "arrow/types/construct.h" +#include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" @@ -63,44 +65,70 @@ static bool IsPrimitive(const DataType* type) { } } +static bool IsListType(const DataType* type) { + DCHECK(type != nullptr); + switch (type->type) { + // TODO(emkornfield) groupings like this are used in a few places in the + // code; consider using a pattern like: + // http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type + // + // TODO(emkornfield) Fix the type system so these are all considered lists and + // the types behave the same way? + // case Type::BINARY: + // case Type::CHAR: + case Type::LIST: + // see todo on common types + // case Type::STRING: + // case Type::VARCHAR: + return true; + default: + return false; + } +} + // ---------------------------------------------------------------------- // Row batch write path Status VisitArray(const Array* arr, std::vector* field_nodes, - std::vector>* buffers) { - if (IsPrimitive(arr->type().get())) { - const PrimitiveArray* prim_arr = static_cast(arr); - - field_nodes->push_back( - flatbuf::FieldNode(prim_arr->length(), prim_arr->null_count())); + std::vector>* buffers, int max_recursion_depth) { + if (max_recursion_depth <= 0) { return Status::Invalid("Max recursion depth reached"); } + DCHECK(arr); + DCHECK(field_nodes); + // push back all common elements + field_nodes->push_back(flatbuf::FieldNode(arr->length(), arr->null_count())); + if (arr->null_count() > 0) { + buffers->push_back(arr->null_bitmap()); + } else { + // Push a dummy zero-length buffer, not to be copied + buffers->push_back(std::make_shared(nullptr, 0)); + } - if (prim_arr->null_count() > 0) { - buffers->push_back(prim_arr->null_bitmap()); - } else { - // Push a dummy zero-length buffer, not to be copied - buffers->push_back(std::make_shared(nullptr, 0)); - } + const DataType* arr_type = arr->type().get(); + if (IsPrimitive(arr_type)) { + const auto prim_arr = static_cast(arr); buffers->push_back(prim_arr->data()); - } else if (arr->type_enum() == Type::LIST) { - // TODO(wesm) - return Status::NotImplemented("List type"); + } else if (IsListType(arr_type)) { + const auto list_arr = static_cast(arr); + buffers->push_back(list_arr->offset_buffer()); + RETURN_NOT_OK(VisitArray( + list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1)); } else if (arr->type_enum() == Type::STRUCT) { // TODO(wesm) return Status::NotImplemented("Struct type"); } - return Status::OK(); } class RowBatchWriter { public: - explicit RowBatchWriter(const RowBatch* batch) : batch_(batch) {} + RowBatchWriter(const RowBatch* batch, int max_recursion_depth) : batch_(batch), max_recursion_depth_(max_recursion_depth) {} Status AssemblePayload() { // Perform depth-first traversal of the row-batch for (int i = 0; i < 
batch_->num_columns(); ++i) { const Array* arr = batch_->column(i).get(); - RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_)); + RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_, max_recursion_depth_)); } return Status::OK(); } @@ -111,8 +139,10 @@ class RowBatchWriter { int64_t offset = 0; for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); - int64_t size = buffer->size(); + int64_t size = 0; + // The buffer might be null if we are handling zero row lengths. + if (buffer) { size = buffer->size(); } // TODO(wesm): We currently have no notion of shared memory page id's, // but we've included it in the metadata IDL for when we have it in the // future. Use page=0 for now @@ -171,11 +201,13 @@ class RowBatchWriter { std::vector field_nodes_; std::vector buffer_meta_; std::vector> buffers_; + int max_recursion_depth_; }; -Status WriteRowBatch( - MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset) { - RowBatchWriter serializer(batch); +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset, int max_recursion_depth) { + DCHECK_GT(max_recursion_depth, 0); + RowBatchWriter serializer(batch, max_recursion_depth); RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); } @@ -186,8 +218,9 @@ static constexpr int64_t INIT_METADATA_SIZE = 4096; class RowBatchReader::Impl { public: - Impl(MemorySource* source, const std::shared_ptr& metadata) - : source_(source), metadata_(metadata) { + Impl(MemorySource* source, const std::shared_ptr& metadata, + int max_recursion_depth) + : source_(source), metadata_(metadata), max_recursion_depth_(max_recursion_depth) { num_buffers_ = metadata->num_buffers(); num_flattened_fields_ = metadata->num_fields(); } @@ -203,7 +236,7 @@ class RowBatchReader::Impl { buffer_index_ = 0; for (int i = 0; i < schema->num_fields(); ++i) { const Field* field = schema->field(i).get(); - RETURN_NOT_OK(NextArray(field, &arrays[i])); + RETURN_NOT_OK(NextArray(field, max_recursion_depth_, &arrays[i])); } *out = std::make_shared(schema, metadata_->length(), arrays); @@ -213,8 +246,12 @@ class RowBatchReader::Impl { private: // Traverse the flattened record batch metadata and reassemble the // corresponding array containers - Status NextArray(const Field* field, std::shared_ptr* out) { - const std::shared_ptr& type = field->type; + Status NextArray( + const Field* field, int max_recursion_depth, std::shared_ptr* out) { + const TypePtr& type = field->type; + if (max_recursion_depth <= 0) { + return Status::Invalid("Max recursion depth reached"); + } // pop off a field if (field_index_ >= num_flattened_fields_) { @@ -226,23 +263,42 @@ class RowBatchReader::Impl { // we can skip that buffer without reading from shared memory FieldMetadata field_meta = metadata_->field(field_index_++); + // extract null_bitmap which is common to all arrays + std::shared_ptr null_bitmap; + if (field_meta.null_count == 0) { + ++buffer_index_; + } else { + RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); + } + if (IsPrimitive(type.get())) { - std::shared_ptr null_bitmap; std::shared_ptr data; - if (field_meta.null_count == 0) { - null_bitmap = nullptr; - ++buffer_index_; - } else { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &null_bitmap)); - } if (field_meta.length > 0) { RETURN_NOT_OK(GetBuffer(buffer_index_++, &data)); } else { + buffer_index_++; data.reset(new Buffer(nullptr, 0)); } return MakePrimitiveArray( type, 
field_meta.length, data, field_meta.null_count, null_bitmap, out); } + + if (IsListType(type.get())) { + std::shared_ptr offsets; + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); + const int num_children = type->num_children(); + if (num_children != 1) { + std::stringstream ss; + ss << "Field: " << field->ToString() + << " has wrong number of children:" << num_children; + return Status::Invalid(ss.str()); + } + std::shared_ptr values_array; + RETURN_NOT_OK( + NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array)); + return MakeListArray(type, field_meta.length, offsets, values_array, + field_meta.null_count, null_bitmap, out); + } return Status::NotImplemented("Non-primitive types not complete yet"); } @@ -256,12 +312,18 @@ int field_index_; int buffer_index_; + int max_recursion_depth_; int num_buffers_; int num_flattened_fields_; }; Status RowBatchReader::Open( MemorySource* source, int64_t position, std::shared_ptr* out) { + return Open(source, position, kMaxIpcRecursionDepth, out); +} + +Status RowBatchReader::Open(MemorySource* source, int64_t position, + int max_recursion_depth, std::shared_ptr* out) { std::shared_ptr metadata; RETURN_NOT_OK(source->ReadAt(position, INIT_METADATA_SIZE, &metadata)); @@ -286,7 +348,7 @@ std::shared_ptr batch_meta = message->GetRecordBatch(); std::shared_ptr result(new RowBatchReader()); - result->impl_.reset(new Impl(source, batch_meta)); + result->impl_.reset(new Impl(source, batch_meta, max_recursion_depth)); *out = result; return Status::OK(); diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index d453fa05f49..4c9a8a9d8ee 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -38,7 +38,9 @@ class RecordBatchMessage; // ---------------------------------------------------------------------- // Write path - +// We have trouble decoding flatbuffers if the size is > 70, so 64 is a nice round number +// TODO(emkornfield) investigate this more +constexpr int kMaxIpcRecursionDepth = 64; // Write the RowBatch (collection of equal-length Arrow arrays) to the memory // source at the indicated position // @@ -52,8 +54,8 @@ // // Finally, the memory offset to the start of the metadata / data header is // returned in an out-variable -Status WriteRowBatch( - MemorySource* dst, const RowBatch* batch, int64_t position, int64_t* header_offset); +Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, + int64_t* header_offset, int max_recursion_depth = kMaxIpcRecursionDepth); // int64_t GetRowBatchMetadata(const RowBatch* batch); @@ -70,6 +72,9 @@ class RowBatchReader { static Status Open( MemorySource* source, int64_t position, std::shared_ptr* out); + static Status Open(MemorySource* source, int64_t position, int max_recursion_depth, + std::shared_ptr* out); + // Reassemble the row batch.
A Schema is required to be able to construct the // right array containers Status GetRowBatch( diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index fbdda77e491..c243cfba820 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -18,9 +18,7 @@ #include #include #include -#include #include -#include #include #include @@ -31,6 +29,7 @@ #include "arrow/ipc/test-common.h" #include "arrow/test-util.h" +#include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -40,25 +39,56 @@ namespace arrow { namespace ipc { -class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { +// TODO(emkornfield) convert to google style kInt32, etc? +const auto INT32 = std::make_shared(); +const auto LIST_INT32 = std::make_shared(INT32); +const auto LIST_LIST_INT32 = std::make_shared(LIST_INT32); + +typedef Status MakeRowBatch(std::shared_ptr* out); + +class TestWriteRowBatch : public ::testing::TestWithParam, + public MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } void TearDown() { MemoryMapFixture::TearDown(); } - void InitMemoryMap(int64_t size) { + Status RoundTripHelper(const RowBatch& batch, int memory_map_size, + std::shared_ptr* batch_result) { std::string path = "test-write-row-batch"; - MemoryMapFixture::CreateFile(path, size); - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &mmap_)); + MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); + int64_t header_location; + RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); + + std::shared_ptr reader; + RETURN_NOT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); + + RETURN_NOT_OK(reader->GetRowBatch(batch.schema(), batch_result)); + return Status::OK(); } protected: - MemoryPool* pool_; std::shared_ptr mmap_; + MemoryPool* pool_; }; -const auto INT32 = std::make_shared(); +TEST_P(TestWriteRowBatch, RoundTrip) { + std::shared_ptr batch; + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue + std::shared_ptr batch_result; + ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); + + // do checks + ASSERT_TRUE(batch->schema()->Equals(batch_result->schema())); + ASSERT_EQ(batch->num_columns(), batch_result->num_columns()) + << batch->schema()->ToString() << " result: " << batch_result->schema()->ToString(); + EXPECT_EQ(batch->num_rows(), batch_result->num_rows()); + for (int i = 0; i < batch->num_columns(); ++i) { + EXPECT_TRUE(batch->column(i)->Equals(batch_result->column(i))) + << "Idx: " << i << " Name: " << batch->column_name(i); + } +} -TEST_F(TestWriteRowBatch, IntegerRoundTrip) { +Status MakeIntRowBatch(std::shared_ptr* out) { const int length = 1000; // Make the schema @@ -67,41 +97,159 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { std::shared_ptr schema(new Schema({f0, f1})); // Example data + std::shared_ptr a0, a1; + MemoryPool* pool = default_memory_pool(); + RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0)); + RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1)); + out->reset(new RowBatch(schema, length, {a0, a1})); + return Status::OK(); +} - auto data = std::make_shared(pool_); - ASSERT_OK(data->Resize(length * sizeof(int32_t))); - test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), - reinterpret_cast(data->mutable_data())); +Status MakeListRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", 
LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); - auto null_bitmap = std::make_shared(pool_); - int null_bytes = util::bytes_for_bits(length); - ASSERT_OK(null_bitmap->Resize(null_bytes)); - test::random_bytes(null_bytes, 0, null_bitmap->mutable_data()); + // Example data - auto a0 = std::make_shared(length, data); - auto a1 = std::make_shared( - length, data, test::bitmap_popcount(null_bitmap->data(), length), null_bitmap); + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} - RowBatch batch(schema, length, {a0, a1}); +Status MakeZeroLengthRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); - // TODO(wesm): computing memory requirements for a row batch - // 64k is plenty of space - InitMemoryMap(1 << 16); + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 200; + const bool include_nulls = true; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} - int64_t header_location; - ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); +Status MakeNonNullRowBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", LIST_INT32); + auto f1 = std::make_shared("f1", LIST_LIST_INT32); + auto f2 = std::make_shared("f2", INT32); + std::shared_ptr schema(new Schema({f0, f1, f2})); - std::shared_ptr result; - ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result)); + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; - std::shared_ptr batch_result; - ASSERT_OK(result->GetRowBatch(schema, &batch_result)); - EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); + bool include_nulls = false; + RETURN_NOT_OK(MakeRandomListArray(leaf_values, 50, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, 50, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); + out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} - for (int i = 0; i < 
batch.num_columns(); ++i) { - EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) << i - << batch.column_name(i); +Status MakeDeeplyNestedList(std::shared_ptr* out) { + const int batch_length = 5; + TypePtr type = INT32; + + MemoryPool* pool = default_memory_pool(); + ArrayPtr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(std::make_shared(type)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = std::make_shared("f0", type); + std::shared_ptr schema(new Schema({f0})); + std::vector arrays = {array}; + out->reset(new RowBatch(schema, batch_length, arrays)); + return Status::OK(); +} + +INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, + ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, + &MakeZeroLengthRowBatch, &MakeDeeplyNestedList)); + +class RecursionLimits : public ::testing::Test, public MemoryMapFixture { + public: + void SetUp() { pool_ = default_memory_pool(); } + void TearDown() { MemoryMapFixture::TearDown(); } + + Status WriteToMmap(int recursion_level, bool override_level, + int64_t* header_out = nullptr, std::shared_ptr* schema_out = nullptr) { + const int batch_length = 5; + TypePtr type = INT32; + ArrayPtr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool_, &array)); + for (int i = 0; i < recursion_level; ++i) { + type = std::static_pointer_cast(std::make_shared(type)); + RETURN_NOT_OK( + MakeRandomListArray(array, batch_length, include_nulls, pool_, &array)); + } + + auto f0 = std::make_shared("f0", type); + std::shared_ptr schema(new Schema({f0})); + if (schema_out != nullptr) { *schema_out = schema; } + std::vector arrays = {array}; + auto batch = std::make_shared(schema, batch_length, arrays); + + std::string path = "test-write-past-max-recursion"; + const int memory_map_size = 1 << 16; + MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); + int64_t header_location; + int64_t* header_out_param = header_out == nullptr ? 
&header_location : header_out; + if (override_level) { + return WriteRowBatch( + mmap_.get(), batch.get(), 0, header_out_param, recursion_level + 1); + } else { + return WriteRowBatch(mmap_.get(), batch.get(), 0, header_out_param); + } } + + protected: + std::shared_ptr mmap_; + MemoryPool* pool_; +}; + +TEST_F(RecursionLimits, WriteLimit) { + ASSERT_RAISES(Invalid, WriteToMmap((1 << 8) + 1, false)); +} + +TEST_F(RecursionLimits, ReadLimit) { + int64_t header_location; + std::shared_ptr schema; + ASSERT_OK(WriteToMmap(64, true, &header_location, &schema)); + + std::shared_ptr reader; + ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); + std::shared_ptr batch_result; + ASSERT_RAISES(Invalid, reader->GetRowBatch(schema, &batch_result)); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 2b077e97929..84cbc182cd2 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -18,6 +18,7 @@ #include "arrow/ipc/memory.h" #include // For memory-mapping + #include #include #include diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index ad5951d17e2..1b1d50f96ea 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -17,13 +17,14 @@ #include "arrow/ipc/metadata-internal.h" -#include #include #include #include #include #include +#include "flatbuffers/flatbuffers.h" + #include "arrow/ipc/Message_generated.h" #include "arrow/schema.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 779c5a30a04..871b5bc4bf6 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -18,11 +18,12 @@ #ifndef ARROW_IPC_METADATA_INTERNAL_H #define ARROW_IPC_METADATA_INTERNAL_H -#include #include #include #include +#include "flatbuffers/flatbuffers.h" + #include "arrow/ipc/Message_generated.h" namespace arrow { diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index bcf104f0b8b..4fc8ec50eb7 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -17,11 +17,12 @@ #include "arrow/ipc/metadata.h" -#include #include #include #include +#include "flatbuffers/flatbuffers.h" + // Generated C++ flatbuffer IDL #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/metadata-internal.h" diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 65c837dc8b1..e7dbb84d790 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -18,11 +18,19 @@ #ifndef ARROW_IPC_TEST_COMMON_H #define ARROW_IPC_TEST_COMMON_H +#include #include #include #include #include +#include "arrow/array.h" +#include "arrow/test-util.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" + namespace arrow { namespace ipc { @@ -41,10 +49,69 @@ class MemoryMapFixture { fclose(file); } + Status InitMemoryMap( + int64_t size, const std::string& path, std::shared_ptr* mmap) { + CreateFile(path, size); + return MemoryMappedSource::Open(path, MemorySource::READ_WRITE, mmap); + } + private: std::vector tmp_files_; }; +Status MakeRandomInt32Array( + int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + std::shared_ptr data; + test::MakeRandomInt32PoolBuffer(length, pool, &data); + const auto INT32 = std::make_shared(); + Int32Builder builder(pool, INT32); + if (include_nulls) { 
+ std::shared_ptr valid_bytes; + test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); + RETURN_NOT_OK(builder.Append( + reinterpret_cast(data->data()), length, valid_bytes->data())); + *array = builder.Finish(); + return Status::OK(); + } + RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length)); + *array = builder.Finish(); + return Status::OK(); +} + +Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, + bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + // Create the null list values + std::vector valid_lists(num_lists); + const double null_percent = include_nulls ? 0.1 : 0; + test::random_null_bytes(num_lists, null_percent, valid_lists.data()); + + // Create list offsets + const int max_list_size = 10; + + std::vector list_sizes(num_lists, 0); + std::vector offsets( + num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below. + const int seed = child_array->length(); + if (num_lists > 0) { + test::rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data()); + // make sure sizes are consistent with null + std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(), + list_sizes.begin(), + [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; }); + std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin()); + + // Force invariants + const int child_length = child_array->length(); + offsets[0] = 0; + std::replace_if(offsets.begin(), offsets.end(), + [child_length](int32_t offset) { return offset > child_length; }, child_length); + } + ListBuilder builder(pool, child_array); + RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); + *array = builder.Finish(); + return (*array)->Validate(); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 066388b4d0e..560e2837406 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -21,8 +21,8 @@ #include "parquet/api/schema.h" -#include "arrow/util/status.h" #include "arrow/types/decimal.h" +#include "arrow/util/status.h" using parquet::schema::Node; using parquet::schema::NodePtr; diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc index a38acaa94ba..ff3ea1990e5 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -18,8 +18,8 @@ #include "arrow/schema.h" #include -#include #include +#include #include #include "arrow/type.h" diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 538d9b233d9..2f81161d1d6 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -19,6 +19,7 @@ #define ARROW_TEST_UTIL_H_ #include +#include #include #include #include @@ -26,12 +27,13 @@ #include "gtest/gtest.h" -#include "arrow/type.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/table.h" +#include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" #include "arrow/util/memory-pool.h" #include "arrow/util/random.h" #include "arrow/util/status.h" @@ -103,10 +105,12 @@ std::shared_ptr to_buffer(const std::vector& values) { reinterpret_cast(values.data()), values.size() * sizeof(T)); } -void random_null_bitmap(int64_t n, double pct_null, uint8_t* null_bitmap) { +// Sets approximately pct_null of the first n bytes in null_bytes to zero +// and the rest to non-zero (true) values. 
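+// For example (illustrative values only): n = 4 with pct_null = 0.5 might +// produce {1, 0, 1, 0}; nonzero bytes mark valid slots, zero bytes mark nulls.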
+void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { - null_bitmap[i] = rng.NextDoubleFraction() > pct_null; + null_bytes[i] = rng.NextDoubleFraction() > pct_null; } } @@ -121,6 +125,7 @@ static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { template void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { + DCHECK(out); std::mt19937 gen(seed); std::uniform_int_distribution d(min_value, max_value); for (int i = 0; i < n; ++i) { @@ -129,11 +134,25 @@ void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { } static inline int bitmap_popcount(const uint8_t* data, int length) { + // book keeping + constexpr int pop_len = sizeof(uint64_t); + const uint64_t* i64_data = reinterpret_cast(data); + const int fast_counts = length / pop_len; + const uint64_t* end = i64_data + fast_counts; + int count = 0; - for (int i = 0; i < length; ++i) { - // TODO(wesm): accelerate this + // popcount as much as possible with the widest possible count + for (auto iter = i64_data; iter < end; ++iter) { + count += __builtin_popcountll(*iter); + } + + // Account for left over bytes (in theory we could fall back to smaller + // versions of popcount but the code complexity is likely not worth it) + const int loop_tail_index = fast_counts * pop_len; + for (int i = loop_tail_index; i < length; ++i) { if (util::get_bit(data, i)) { ++count; } } + return count; } @@ -153,6 +172,26 @@ std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) return out; } +Status MakeRandomInt32PoolBuffer(int32_t length, MemoryPool* pool, + std::shared_ptr* pool_buffer, uint32_t seed = 0) { + DCHECK(pool); + auto data = std::make_shared(pool); + RETURN_NOT_OK(data->Resize(length * sizeof(int32_t))); + test::rand_uniform_int(length, seed, 0, std::numeric_limits::max(), + reinterpret_cast(data->mutable_data())); + *pool_buffer = data; + return Status::OK(); +} + +Status MakeRandomBytePoolBuffer(int32_t length, MemoryPool* pool, + std::shared_ptr* pool_buffer, uint32_t seed = 0) { + auto bytes = std::make_shared(pool); + RETURN_NOT_OK(bytes->Resize(length)); + test::random_bytes(length, seed, bytes->mutable_data()); + *pool_buffer = bytes; + return Status::OK(); +} + } // namespace test } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 051ab46b199..77404cd7025 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -116,7 +116,7 @@ struct DataType { bool Equals(const DataType* other) { // Call with a pointer so more friendly to subclasses - return this == other || (this->type == other->type); + return other && ((this == other) || (this->type == other->type)); } bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 0a30929b97c..78036d4bf57 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -20,8 +20,8 @@ #include #include "arrow/type.h" -#include "arrow/types/primitive.h" #include "arrow/types/list.h" +#include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -60,11 +60,10 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, case Type::LIST: { std::shared_ptr value_builder; - const std::shared_ptr& value_type = static_cast(type.get())->value_type(); RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder)); - out->reset(new 
ListBuilder(pool, type, value_builder)); + out->reset(new ListBuilder(pool, value_builder)); return Status::OK(); } default: @@ -75,11 +74,11 @@ #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \ case Type::ENUM: \ out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \ - return Status::OK(); + break; -Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, +Status MakePrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, - const std::shared_ptr& null_bitmap, std::shared_ptr* out) { + const std::shared_ptr& null_bitmap, ArrayPtr* out) { switch (type->type) { MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray); MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array); @@ -90,11 +89,43 @@ Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, Int64Array); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); + default: + return Status::NotImplemented(type->ToString()); + } +#ifdef NDEBUG + return Status::OK(); +#else + return (*out)->Validate(); +#endif +} + +Status MakeListArray(const TypePtr& type, int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap, ArrayPtr* out) { + switch (type->type) { + case Type::BINARY: + case Type::LIST: + out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); + break; + case Type::CHAR: + case Type::DECIMAL_TEXT: + case Type::STRING: + case Type::VARCHAR: + out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); + break; default: return Status::NotImplemented(type->ToString()); } +#ifdef NDEBUG + return Status::OK(); +#else + return (*out)->Validate(); +#endif } } // namespace arrow diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 27fb7bd2149..43c0018c67e 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -33,10 +33,19 @@ class Status; Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); +// Create new arrays for logical types that are backed by primitive arrays. Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); +// Create new list arrays for logical types that are backed by ListArrays (e.g. list of +// primitives and strings) +// TODO(emkornfield) split up string vs list? +Status MakeListArray(const std::shared_ptr& type, int32_t length, + const std::shared_ptr& offsets, const std::shared_ptr& values, + int32_t null_count, const std::shared_ptr& null_bitmap, + std::shared_ptr* out); + } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index aa34f23cc02..6a8ad9aa59e 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License.
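+// A sketch of the MakeListArray factory declared above (illustrative only; +// list_type, offsets_buffer, values, null_count, and null_bitmap are all +// assumed to be prepared by the caller): +// ArrayPtr out; +// RETURN_NOT_OK(MakeListArray(list_type, length, offsets_buffer, values, +// null_count, null_bitmap, &out)); // runs out->Validate() in debug builds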
-#include #include +#include #include #include #include @@ -94,6 +94,7 @@ TEST_F(TestListBuilder, TestAppendNull) { Done(); + ASSERT_OK(result_->Validate()); ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); @@ -105,50 +106,93 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_EQ(0, values->length()); } +void ValidateBasicListArray(const ListArray* result, const vector& values, + const vector& is_valid) { + ASSERT_OK(result->Validate()); + ASSERT_EQ(1, result->null_count()); + ASSERT_EQ(0, result->values()->null_count()); + + ASSERT_EQ(3, result->length()); + vector ex_offsets = {0, 3, 3, 7}; + for (size_t i = 0; i < ex_offsets.size(); ++i) { + ASSERT_EQ(ex_offsets[i], result->offset(i)); + } + + for (int i = 0; i < result->length(); ++i) { + ASSERT_EQ(!static_cast(is_valid[i]), result->IsNull(i)); + } + + ASSERT_EQ(7, result->values()->length()); + Int32Array* varr = static_cast(result->values().get()); + + for (size_t i = 0; i < values.size(); ++i) { + ASSERT_EQ(values[i], varr->Value(i)); + } +} + TEST_F(TestListBuilder, TestBasics) { vector values = {0, 1, 2, 3, 4, 5, 6}; vector lengths = {3, 0, 4}; - vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; Int32Builder* vb = static_cast(builder_->value_builder().get()); - EXPECT_OK(builder_->Reserve(lengths.size())); - EXPECT_OK(vb->Reserve(values.size())); + ASSERT_OK(builder_->Reserve(lengths.size())); + ASSERT_OK(vb->Reserve(values.size())); int pos = 0; for (size_t i = 0; i < lengths.size(); ++i) { - ASSERT_OK(builder_->Append(is_null[i] > 0)); + ASSERT_OK(builder_->Append(is_valid[i] > 0)); for (int j = 0; j < lengths[i]; ++j) { vb->Append(values[pos++]); } } Done(); + ValidateBasicListArray(result_.get(), values, is_valid); +} - ASSERT_EQ(1, result_->null_count()); - ASSERT_EQ(0, result_->values()->null_count()); +TEST_F(TestListBuilder, BulkAppend) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 3, 3}; - ASSERT_EQ(3, result_->length()); - vector ex_offsets = {0, 3, 3, 7}; - for (size_t i = 0; i < ex_offsets.size(); ++i) { - ASSERT_EQ(ex_offsets[i], result_->offset(i)); - } + Int32Builder* vb = static_cast(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); - for (int i = 0; i < result_->length(); ++i) { - ASSERT_EQ(static_cast(is_null[i]), result_->IsNull(i)); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); } + Done(); + ValidateBasicListArray(result_.get(), values, is_valid); +} - ASSERT_EQ(7, result_->values()->length()); - Int32Array* varr = static_cast(result_->values().get()); +TEST_F(TestListBuilder, BulkAppendInvalid) { + vector values = {0, 1, 2, 3, 4, 5, 6}; + vector lengths = {3, 0, 4}; + vector is_null = {0, 1, 0}; + vector is_valid = {1, 0, 1}; + vector offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array - for (size_t i = 0; i < values.size(); ++i) { - ASSERT_EQ(values[i], varr->Value(i)); + Int32Builder* vb = static_cast(builder_->value_builder().get()); + ASSERT_OK(vb->Reserve(values.size())); + + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + builder_->Append(offsets.data(), offsets.size(), is_valid.data()); + for (int32_t value : values) { + vb->Append(value); } + + Done(); + ASSERT_RAISES(Invalid, result_->Validate()); } TEST_F(TestListBuilder, TestZeroLength) { // All buffers are null Done(); + ASSERT_OK(result_->Validate()); } } // namespace arrow diff --git 
a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 23f12ddc4ec..fc3331139c6 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -14,23 +14,26 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - #include "arrow/types/list.h" +#include + namespace arrow { bool ListArray::EqualsExact(const ListArray& other) const { if (this == &other) { return true; } if (null_count_ != other.null_count_) { return false; } - bool equal_offsets = offset_buf_->Equals(*other.offset_buf_, length_ + 1); + bool equal_offsets = + offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + if (!equal_offsets) { return false; } bool equal_null_bitmap = true; if (null_count_ > 0) { equal_null_bitmap = null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); } - if (!(equal_offsets && equal_null_bitmap)) { return false; } + if (!equal_null_bitmap) { return false; } return values()->Equals(other.values()); } @@ -41,4 +44,55 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +Status ListArray::Validate() const { + if (length_ < 0) { return Status::Invalid("Length was negative"); } + if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } + if (offset_buf_->size() / sizeof(int32_t) < length_) { + std::stringstream ss; + ss << "offset buffer size (bytes): " << offset_buf_->size() + << " isn't large enough for length: " << length_; + return Status::Invalid(ss.str()); + } + const int32_t last_offset = offset(length_); + if (last_offset > 0) { + if (!values_) { + return Status::Invalid("last offset was non-zero and values was null"); + } + if (values_->length() != last_offset) { + std::stringstream ss; + ss << "Final offset invariant not equal to values length: " << last_offset + << "!=" << values_->length(); + return Status::Invalid(ss.str()); + } + + const Status child_valid = values_->Validate(); + if (!child_valid.ok()) { + std::stringstream ss; + ss << "Child array invalid: " << child_valid.ToString(); + return Status::Invalid(ss.str()); + } + } + + int32_t prev_offset = offset(0); + if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); } + for (int32_t i = 1; i <= length_; ++i) { + int32_t current_offset = offset(i); + if (IsNull(i - 1) && current_offset != prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot" + << current_offset << "!=" << prev_offset; + return Status::Invalid(ss.str()); + } + if (current_offset < prev_offset) { + std::stringstream ss; + ss << "Offset invariant failure: " << i + << " inconsistent offset for non-null slot: " << current_offset << "<" + << prev_offset; + return Status::Invalid(ss.str()); + } + prev_offset = current_offset; + } + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 6b815460ecb..e2302d917b8 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -28,6 +28,7 @@ #include "arrow/types/primitive.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" +#include "arrow/util/logging.h" #include "arrow/util/status.h" namespace arrow { @@ -46,11 +47,16 @@ class ListArray : public Array { values_ = values; } - virtual ~ListArray() {} + Status Validate() const override; + + virtual ~ListArray() = default; // Return a shared pointer in case the requestor 
desires to share ownership // with this array. const std::shared_ptr& values() const { return values_; } + const std::shared_ptr offset_buffer() const { + return std::static_pointer_cast(offset_buf_); + } const std::shared_ptr& value_type() const { return values_->type(); } @@ -78,59 +84,73 @@ // // To use this class, you must append values to the child array builder and use // the Append function to delimit each distinct list value (once the values -// have been appended to the child array) -class ListBuilder : public Int32Builder { +// have been appended to the child array) or use the bulk API to append +// a sequence of offsets and null values. +// +// A note on types. Per arrow/type.h, all types in the C++ implementation are +// logical, so even though this class always builds an Array of lists, it can +// represent multiple different logical types. If no logical type is provided +// at construction time, the class defaults to List<T>, where T is taken from the +// value_builder/values that the object is constructed with. +class ListBuilder : public ArrayBuilder { public: + // Use this constructor to incrementally build the value array along with offsets and + // null bitmap. + ListBuilder(MemoryPool* pool, std::shared_ptr value_builder, + const TypePtr& type = nullptr) + : ArrayBuilder( + pool, type ? type : std::static_pointer_cast( + std::make_shared(value_builder->type()))), + offset_builder_(pool), + value_builder_(value_builder) {} + + // Use this constructor to build the list with a pre-existing values array ListBuilder( - MemoryPool* pool, const TypePtr& type, std::shared_ptr value_builder) - : Int32Builder(pool, type), value_builder_(value_builder) {} - - Status Init(int32_t elements) { - // One more than requested. - // - // XXX: This is slightly imprecise, because we might trigger null mask - // resizes that are unnecessary when creating arrays with power-of-two size - return Int32Builder::Init(elements + 1); + MemoryPool* pool, std::shared_ptr values, const TypePtr& type = nullptr) + : ArrayBuilder(pool, type ? 
type : std::static_pointer_cast( std::make_shared(values->type()))), + offset_builder_(pool), + values_(values) {} + + Status Init(int32_t elements) override { + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more than requested for offsets + return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); } - Status Resize(int32_t capacity) { - // Need space for the end offset - RETURN_NOT_OK(Int32Builder::Resize(capacity + 1)); - - // Slight hack, as the "real" capacity is one less - --capacity_; - return Status::OK(); + Status Resize(int32_t capacity) override { + // one more than requested for offsets + RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); } // Vector append // // If passed, valid_bytes is of equal length to values, and any zero byte // will be considered as a null for that slot - Status Append(value_type* values, int32_t length, uint8_t* valid_bytes = nullptr) { - if (length_ + length > capacity_) { - int32_t new_capacity = util::next_power2(length_ + length); - RETURN_NOT_OK(Resize(new_capacity)); - } - memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); - - if (valid_bytes != nullptr) { AppendNulls(valid_bytes, length); } - - length_ += length; + Status Append( + const int32_t* offsets, int32_t length, const uint8_t* valid_bytes = nullptr) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + offset_builder_.UnsafeAppend(offsets, length); return Status::OK(); } + // The same as Finish but allows for overriding the C++ type template std::shared_ptr Transfer() { - std::shared_ptr items = value_builder_->Finish(); + std::shared_ptr items = values_; + if (!items) { items = value_builder_->Finish(); } - // Add final offset if the length is non-zero - if (length_) { raw_data_[length_] = items->length(); } + offset_builder_.Append(items->length()); + const auto offsets_buffer = offset_builder_.Finish(); auto result = std::make_shared( type_, length_, offsets_buffer, items, null_count_, null_bitmap_); - data_ = null_bitmap_ = nullptr; + // TODO(emkornfield) make a reset method capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; return result; } @@ -141,26 +161,24 @@ // // This function should be called before beginning to append elements to the // value builder - Status Append(bool is_null = false) { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - if (is_null) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_); - } - raw_data_[length_++] = value_builder_->length(); + Status Append(bool is_valid = true) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + RETURN_NOT_OK(offset_builder_.Append(value_builder_->length())); return Status::OK(); } - Status AppendNull() { return Append(true); } + Status AppendNull() { return Append(false); } - const std::shared_ptr& value_builder() const { return value_builder_; } + const std::shared_ptr& value_builder() const { + DCHECK(!values_) << "Using value builder is pointless when values_ is set"; + return value_builder_; + } protected: + BufferBuilder offset_builder_; std::shared_ptr value_builder_; + std::shared_ptr values_; }; } // namespace arrow diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 
6bd9e73eb46..2b4c0879a28 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -102,7 +102,7 @@ class TestPrimitiveBuilder : public TestBuilder { Attrs::draw(N, &draws_); valid_bytes_.resize(N); - test::random_null_bitmap(N, pct_null, valid_bytes_.data()); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); } void Check(const std::shared_ptr& builder, bool nullable) { @@ -193,8 +193,8 @@ void TestPrimitiveBuilder::RandomData(int N, double pct_null) { draws_.resize(N); valid_bytes_.resize(N); - test::random_null_bitmap(N, 0.5, draws_.data()); - test::random_null_bitmap(N, pct_null, valid_bytes_.data()); + test::random_null_bytes(N, 0.5, draws_.data()); + test::random_null_bytes(N, pct_null, valid_bytes_.data()); } template <> diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 9549c47b411..9102c530e25 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -57,12 +57,14 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { } return true; } else { + if (length_ == 0 && other.length_ == 0) { return true; } return data_->Equals(*other.data_, length_); } } bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { if (this == arr.get()) { return true; } + if (!arr) { return false; } if (this->type_enum() != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } @@ -101,48 +103,21 @@ Status PrimitiveBuilder::Resize(int32_t capacity) { return Status::OK(); } -template -Status PrimitiveBuilder::Reserve(int32_t elements) { - if (length_ + elements > capacity_) { - int32_t new_capacity = util::next_power2(length_ + elements); - return Resize(new_capacity); - } - return Status::OK(); -} - template Status PrimitiveBuilder::Append( const value_type* values, int32_t length, const uint8_t* valid_bytes) { - RETURN_NOT_OK(PrimitiveBuilder::Reserve(length)); + RETURN_NOT_OK(Reserve(length)); if (length > 0) { memcpy(raw_data_ + length_, values, type_traits::bytes_required(length)); } - if (valid_bytes != nullptr) { - PrimitiveBuilder::AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } + // length_ is updated by these + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); - length_ += length; return Status::OK(); } -template -void PrimitiveBuilder::AppendNulls(const uint8_t* valid_bytes, int32_t length) { - // If valid_bytes is all not null, then none of the values are null - for (int i = 0; i < length; ++i) { - if (valid_bytes[i] == 0) { - ++null_count_; - } else { - util::set_bit(null_bitmap_data_, length_ + i); - } - } -} - template std::shared_ptr PrimitiveBuilder::Finish() { std::shared_ptr result = std::make_shared::ArrayType>( @@ -166,14 +141,8 @@ Status PrimitiveBuilder::Append( } } - if (valid_bytes != nullptr) { - PrimitiveBuilder::AppendNulls(valid_bytes, length); - } else { - for (int i = 0; i < length; ++i) { - util::set_bit(null_bitmap_data_, length_ + i); - } - } - length_ += length; + // this updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, length); return Status::OK(); } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index fcd3db4e96e..6f6b2fed5a3 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -95,15 +95,13 @@ class PrimitiveBuilder : public ArrayBuilder { using ArrayBuilder::Advance; // Write nulls as uint8_t* (0 value indicates null) into 
pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length); + void AppendNulls(const uint8_t* valid_bytes, int32_t length) { + UnsafeAppendToBitmap(valid_bytes, length); + } Status AppendNull() { - if (length_ == capacity_) { - // If the capacity was not already a multiple of 2, do so here - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); - } - ++null_count_; - ++length_; + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); return Status::OK(); } @@ -116,21 +114,17 @@ class PrimitiveBuilder : public ArrayBuilder { Status Append( const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr); - // Ensure that builder can accommodate an additional number of - // elements. Resizes if the current capacity is not sufficient - Status Reserve(int32_t elements); - std::shared_ptr Finish() override; - protected: - std::shared_ptr data_; - value_type* raw_data_; - - Status Init(int32_t capacity); + Status Init(int32_t capacity) override; // Increase the capacity of the builder to accommodate at least the indicated // number of elements - Status Resize(int32_t capacity); + Status Resize(int32_t capacity) override; + + protected: + std::shared_ptr data_; + value_type* raw_data_; }; template @@ -140,9 +134,17 @@ class NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::PrimitiveBuilder; using PrimitiveBuilder::Append; + using PrimitiveBuilder::Init; + using PrimitiveBuilder::Resize; - // Scalar append. Does not capacity-check; make sure to call Reserve beforehand + // Scalar append. void Append(value_type val) { + ArrayBuilder::Reserve(1); + UnsafeAppend(val); + } + + // Does not capacity-check; make sure to call Reserve beforehand + void UnsafeAppend(value_type val) { util::set_bit(null_bitmap_data_, length_); raw_data_[length_++] = val; } @@ -151,9 +153,6 @@ class NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::length_; using PrimitiveBuilder::null_bitmap_data_; using PrimitiveBuilder::raw_data_; - - using PrimitiveBuilder::Init; - using PrimitiveBuilder::Resize; }; template <> diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index c5cbe1058c7..d2d3c5b6b5a 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -89,11 +89,11 @@ class StringArray : public ListArray { const uint8_t* raw_bytes_; }; -// Array builder +// String builder class StringBuilder : public ListBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) - : ListBuilder(pool, type, std::make_shared(pool, value_type_)) { + : ListBuilder(pool, std::make_shared(pool, value_type_), type) { byte_builder_ = static_cast(value_builder_.get()); } @@ -110,7 +110,6 @@ class StringBuilder : public ListBuilder { } protected: - std::shared_ptr list_builder_; UInt8Builder* byte_builder_; static TypePtr value_type_; diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 56532be8070..5ef0076953c 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -23,6 +23,7 @@ #include #include +#include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" @@ -137,26 +138,64 @@ class BufferBuilder { public: explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {} + Status Resize(int32_t elements) { + if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } + capacity_ = elements; + RETURN_NOT_OK(buffer_->Resize(capacity_)); + data_ = buffer_->mutable_data(); + return Status::OK(); + } + Status 
Append(const uint8_t* data, int length) { - if (capacity_ < length + size_) { - if (capacity_ == 0) { buffer_ = std::make_shared(pool_); } - capacity_ = std::max(MIN_BUFFER_CAPACITY, capacity_); - while (capacity_ < length + size_) { - capacity_ *= 2; - } - RETURN_NOT_OK(buffer_->Resize(capacity_)); - data_ = buffer_->mutable_data(); - } + if (capacity_ < length + size_) { RETURN_NOT_OK(Resize(length + size_)); } + UnsafeAppend(data, length); + return Status::OK(); + } + + template + Status Append(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return Append(reinterpret_cast(&arithmetic_value), sizeof(T)); + } + + template + Status Append(const T* arithmetic_values, int num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + return Append( + reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); + } + + // Unsafe methods don't check existing size + void UnsafeAppend(const uint8_t* data, int length) { memcpy(data_ + size_, data, length); size_ += length; - return Status::OK(); + } + + template + void UnsafeAppend(T arithmetic_value) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + UnsafeAppend(reinterpret_cast(&arithmetic_value), sizeof(T)); + } + + template + void UnsafeAppend(const T* arithmetic_values, int num_elements) { + static_assert(std::is_arithmetic::value, + "Convenience buffer append only supports arithmetic types"); + UnsafeAppend( + reinterpret_cast(arithmetic_values), num_elements * sizeof(T)); } std::shared_ptr Finish() { auto result = buffer_; buffer_ = nullptr; + capacity_ = size_ = 0; return result; } + int capacity() { return capacity_; } + int length() { return size_; } private: std::shared_ptr buffer_; diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 527ce423e77..fccc5e3085d 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -18,8 +18,8 @@ #ifndef ARROW_UTIL_LOGGING_H #define ARROW_UTIL_LOGGING_H -#include #include +#include namespace arrow { diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index fb417e74daf..961554fe06b 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -18,8 +18,8 @@ #include "arrow/util/memory-pool.h" #include -#include #include +#include #include "arrow/util/status.h" diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index d608f8167df..bf5a22089cd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -31,14 +31,15 @@ def test_getitem_NA(self): assert arr[1] is pyarrow.NA def test_list_format(self): - arr = pyarrow.from_pylist([[1], None, [2, 3]]) + arr = pyarrow.from_pylist([[1], None, [2, 3, None]]) result = fmt.array_format(arr) expected = """\ [ [1], NA, [2, - 3] + 3, + NA] ]""" assert result == expected From a541644721ba4cb4723931b2a5eff1ac58c8aedd Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 23 Apr 2016 11:11:05 -0400 Subject: [PATCH 063/210] ARROW-100: [C++] Computing RowBatch size Implement RowBatchWriter::DataHeaderSize and arrow::ipc::GetRowBatchSize. To achieve this, the Flatbuffer metadata is written to a temporary buffer and its size is determined. This commit also adds MockMemorySource, a new MemorySource that tracks the amount of memory written. 
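The size computation reuses the write path: the batch is serialized exactly as in a real write, but into a sink that only records the furthest byte offset touched. A minimal sketch using the names introduced in this patch (the batch itself is assumed to exist):

    // Measure the serialized footprint of a row batch without storing bytes.
    MockMemorySource source(0);  // declared size 0; written data is not copied
    int64_t header_offset = 0;
    RETURN_NOT_OK(WriteRowBatch(&source, batch.get(), 0, &header_offset));
    const int64_t total_size = source.GetExtentBytesWritten();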
Author: Philipp Moritz Author: Philipp Moritz Closes #61 from pcmoritz/rowbatchsize and squashes the following commits: e95fc5c [Philipp Moritz] fix formating 253c9f0 [Philipp Moritz] rename MockMemorySource methods to reflect better what they are doing 3484458 [Philipp Moritz] add tests for more datatypes 6b798f8 [Philipp Moritz] fix maximum recursion depth 67af8e1 [Philipp Moritz] merge GetRowBatchSize 9b69f12 [Philipp Moritz] factor out GetRowBatchSize test, use MockMemorySource to implement GetRowBatchSize, unify DataHeaderSize and TotalBytes into GetTotalSize aa48cdf [Philipp Moritz] ARROW-100: [C++] Computing RowBatch size --- cpp/src/arrow/ipc/adapter.cc | 29 ++++++++++++++------------- cpp/src/arrow/ipc/adapter.h | 2 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 28 ++++++++++++++++++++++++++ cpp/src/arrow/ipc/memory.cc | 25 +++++++++++++++++++++++ cpp/src/arrow/ipc/memory.h | 22 ++++++++++++++++++++ 5 files changed, 91 insertions(+), 15 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index bf6fa94dea7..34700080746 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -179,20 +179,13 @@ class RowBatchWriter { } // This must be called after invoking AssemblePayload - int64_t DataHeaderSize() { - // TODO(wesm): In case it is needed, compute the upper bound for the size - // of the buffer containing the flatbuffer data header. - return 0; - } - - // Total footprint of buffers. This must be called after invoking - // AssemblePayload - int64_t TotalBytes() { - int64_t total = 0; - for (const std::shared_ptr& buffer : buffers_) { - total += buffer->size(); - } - return total; + Status GetTotalSize(int64_t* size) { + // emulates the behavior of Write without actually writing + int64_t data_header_offset; + MockMemorySource source(0); + RETURN_NOT_OK(Write(&source, 0, &data_header_offset)); + *size = source.GetExtentBytesWritten(); + return Status::OK(); } private: @@ -211,6 +204,14 @@ Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); } + +Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { + RowBatchWriter serializer(batch, kMaxIpcRecursionDepth); + RETURN_NOT_OK(serializer.AssemblePayload()); + RETURN_NOT_OK(serializer.GetTotalSize(size)); + return Status::OK(); +} + // ---------------------------------------------------------------------- // Row batch read path diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index 4c9a8a9d8ee..0d2b77f5ace 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -62,7 +62,7 @@ Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, // Compute the precise number of bytes needed in a contiguous memory segment to // write the row batch. This involves generating the complete serialized // Flatbuffers metadata. 
-int64_t GetRowBatchSize(const RowBatch* batch); +Status GetRowBatchSize(const RowBatch* batch, int64_t* size); // ---------------------------------------------------------------------- // "Read" path; does not copy data if the MemorySource does not diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index c243cfba820..3b147343f77 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -195,6 +195,34 @@ INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, &MakeZeroLengthRowBatch, &MakeDeeplyNestedList)); +void TestGetRowBatchSize(std::shared_ptr batch) { + MockMemorySource mock_source(1 << 16); + int64_t mock_header_location; + int64_t size; + ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location)); + ASSERT_OK(GetRowBatchSize(batch.get(), &size)); + ASSERT_EQ(mock_source.GetExtentBytesWritten(), size); +} + +TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { + std::shared_ptr batch; + + ASSERT_OK(MakeIntRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeListRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeZeroLengthRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeNonNullRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeDeeplyNestedList(&batch)); + TestGetRowBatchSize(batch); +} + class RecursionLimits : public ::testing::Test, public MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 84cbc182cd2..caff2c610b9 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -145,5 +145,30 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t return Status::OK(); } +MockMemorySource::MockMemorySource(int64_t size) + : size_(size), extent_bytes_written_(0) {} + +Status MockMemorySource::Close() { + return Status::OK(); +} + +Status MockMemorySource::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + return Status::OK(); +} + +Status MockMemorySource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { + extent_bytes_written_ = std::max(extent_bytes_written_, position + nbytes); + return Status::OK(); +} + +int64_t MockMemorySource::Size() const { + return size_; +} + +int64_t MockMemorySource::GetExtentBytesWritten() const { + return extent_bytes_written_; +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index e529603dc6e..c6fd7a71899 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -121,6 +121,28 @@ class MemoryMappedSource : public MemorySource { std::unique_ptr impl_; }; +// A MemorySource that tracks the size of allocations from a memory source +class MockMemorySource : public MemorySource { + public: + explicit MockMemorySource(int64_t size); + + Status Close() override; + + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; + + int64_t Size() const override; + + // @return: the smallest number of bytes containing the modified region of the + // MockMemorySource + int64_t GetExtentBytesWritten() const; + + private: + int64_t size_; + int64_t extent_bytes_written_; +}; + } // namespace ipc } // namespace arrow From 56514d93a2d1c5ad9419c807f23127eb07d9ccfe Mon Sep 17 
00:00:00 2001 From: Micah Kornfield Date: Fri, 29 Apr 2016 19:31:25 -0700 Subject: [PATCH 064/210] ARROW-104: [FORMAT] Add alignment and padding requirements + union clarification I believe this change captures the discussion we had on the mailing list about alignment and padding for arrays. It also captures the update to UnionArrays. The rendered version should be viewable here: https://github.com/emkornfield/arrow/blob/emk_format_changes/format/Layout.md Author: Micah Kornfield Closes #67 from emkornfield/emk_format_changes and squashes the following commits: c91421e [Micah Kornfield] fixes per code review b33d4c2 [Micah Kornfield] Add alignment and padding requirements. update union types buffer to reflect using only 1 type buffer --- format/Layout.md | 165 +++++++++++++++++++++++++++-------------------- 1 file changed, 95 insertions(+), 70 deletions(-) diff --git a/format/Layout.md b/format/Layout.md index 92553d944c2..34eade31341 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -10,6 +10,8 @@ concepts, here is a small glossary to help disambiguate. * Contiguous memory region: a sequential virtual address space with a given length. Any byte can be reached via a single pointer offset less than the region's length. +* Contiguous memory buffer: A contiguous memory region that stores + a multi-value component of an Array. Sometimes referred to as just "buffer". * Primitive type: a data type that occupies a fixed-size memory slot specified in bit width or byte width * Nested or parametric type: a data type whose full structure depends on one or @@ -41,7 +43,7 @@ Base requirements linearly in the nesting level * Capable of representing fully-materialized and decoded / decompressed Parquet data -* All leaf nodes (primitive value arrays) use contiguous memory regions +* All contiguous memory buffers are aligned at 64-byte boundaries and padded to a multiple of 64 bytes. * Any relative type can have null slots * Arrays are immutable once created. Implementations can provide APIs to mutate an array, but applying mutations will require a new array data structure to @@ -78,6 +80,28 @@ Base requirements The Arrow format is little endian. +## Alignment and Padding + +As noted above, all buffers are intended to be aligned in memory at 64 byte +boundaries and padded to a length that is a multiple of 64 bytes. The alignment +requirement follows best practices for optimized memory access: + +* Elements in numeric arrays will be guaranteed to be retrieved via aligned access. +* On some architectures alignment can help limit partially used cache lines. +* 64 byte alignment is recommended by the [Intel performance guide][2] for +data-structures over 64 bytes (which will be a common case for Arrow Arrays). + +Requiring padding to a multiple of 64 bytes allows for using SIMD instructions +consistently in loops without additional conditional checks. +This should allow for simpler and more efficient code. +The specific padding length was chosen because it matches the largest known +SIMD instruction registers available as of April 2016 (Intel AVX-512). +Guaranteed padding can also allow certain compilers +to generate more optimized code directly (e.g. One can safely use Intel's +`-qopt-assume-safe-padding`). + +Unless otherwise noted, padded bytes do not need to have a specific value. + ## Array lengths Any array has a known and fixed length, stored as a 32-bit signed integer, so a @@ -101,14 +125,14 @@ signed integer, as it may be as large as the array length. 
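To make the padding rule above concrete: a buffer holding n bytes of data is allocated with a length rounded up to the next multiple of 64. A one-line helper capturing the arithmetic (a sketch, not part of the specification):

```
#include <cstdint>

// Round a byte count up to the next multiple of 64, per the padding rule.
// For example: PaddedLength(20) == 64, PaddedLength(64) == 64,
// PaddedLength(65) == 128.
int64_t PaddedLength(int64_t nbytes) {
  return ((nbytes + 63) / 64) * 64;
}
```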
Any relative type can have null value slots, whether primitive or nested type. An array with nulls must have a contiguous memory buffer, known as the null (or -validity) bitmap, whose length is a multiple of 8 bytes (to avoid -word-alignment concerns) and large enough to have at least 1 bit for each array +validity) bitmap, whose length is a multiple of 64 bytes (as discussed above) +and large enough to have at least 1 bit for each array slot. Whether any array slot is valid (non-null) is encoded in the respective bits of this bitmap. A 1 (set bit) for index `j` indicates that the value is not null, while a 0 (bit not set) indicates that it is null. Bitmaps are to be -initialized to be all unset at allocation time. +initialized to be all unset at allocation time (this includes padding). ``` is_valid[j] -> bitmap[j / 8] & (1 << (j % 8)) @@ -158,15 +182,15 @@ Would look like: * Length: 5, Null count: 1 * Null bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-7 | + |Byte 0 (validity bitmap) | Bytes 1-63 | |-------------------------|-----------------------| |00011011 | 0 (padding) | * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | - |------------|-------------|-------------|-------------|-------------| - | 1 | 2 | unspecified | 4 | 8 | + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | unspecified | 4 | 8 | unspecified | ``` ### Example Layout: Non-null int32 Array @@ -177,15 +201,15 @@ Would look like: * Length: 5, Null count: 0 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-7 | + | Byte 0 (validity bitmap) | Bytes 1-63 | |--------------------------|-----------------------| | 00011111 | 0 (padding) | * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | - |------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | ``` or with the bitmap elided: @@ -195,9 +219,9 @@ or with the bitmap elided: * Null bitmap buffer: Not required * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | - |------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | ``` ## List type @@ -243,23 +267,23 @@ will have the following representation: * Length: 4, Null count: 1 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-7 | + | Byte 0 (validity bitmap) | Bytes 1-63 | |--------------------------|-----------------------| | 00001101 | 0 (padding) | * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | - |------------|-------------|-------------|-------------|-------------| - | 0 | 3 | 3 | 7 | 7 | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 3 | 3 | 7 | 7 | unspecified | * Values array (char array): * Length: 7, Null count: 0 * Null bitmap buffer: Not required - | Bytes 0-7 | - |------------| - | joemark | + | Bytes 0-7 | Bytes 8-63 
| + |------------|-------------| + | joemark | unspecified | ``` ### Example Layout: `List>` @@ -273,31 +297,31 @@ will be be represented as follows: * Null bitmap buffer: Not required * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | - |------------|------------|------------|-------------| - | 0 | 2 | 6 | 7 | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|------------|------------|-------------|-------------| + | 0 | 2 | 6 | 7 | unspecified | * Values array (`List`) * Length: 6, Null count: 1 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-7 | + | Byte 0 (validity bitmap) | Bytes 1-63 | |--------------------------|-------------| | 00110111 | 0 (padding) | * Offsets buffer (int32) - | Bytes 0-28 | - |----------------------| - | 0, 2, 4, 7, 7, 8, 10 | + | Bytes 0-28 | Bytes 29-63 | + |----------------------|-------------| + | 0, 2, 4, 7, 7, 8, 10 | unspecified | * Values array (bytes): * Length: 10, Null count: 0 * Null bitmap buffer: Not required - | Bytes 0-9 | - |-------------------------------| - | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | + | Bytes 0-9 | Bytes 10-63 | + |-------------------------------|-------------| + | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | ``` ## Struct type @@ -333,9 +357,9 @@ The layout for [{'joe', 1}, {null, 2}, null, {'mark', 4}] would be: * Length: 4, Null count: 1 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-7 | - |--------------------------|-------------| - | 00001011 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-7 | Bytes 8-63 | + |--------------------------|-------------|-------------| + | 00001011 | 0 (padding) | unspecified | * Children arrays: * field-0 array (`List`): @@ -396,13 +420,13 @@ The union types may be named, but like structs this will be a matter of the metadata and will not affect the physical memory layout. We define two distinct union types that are optimized for different use -cases. This first, the dense union, represents a mixed-type array with 6 bytes +cases. This first, the dense union, represents a mixed-type array with 5 bytes of overhead for each value. Its physical layout is as follows: * One child array for each relative type -* Types buffer: A buffer of unsigned integers, enumerated from 0 corresponding - to each type, with the smallest byte width capable of representing the number - of types in the union. +* Types buffer: A buffer of 8-bit signed integers, enumerated from 0 corresponding + to each type. A union with more then 127 possible types can be modeled as a + union of unions. * Offsets buffer: A buffer of signed int32 values indicating the relative offset into the respective child array for the type in a given slot. The respective offsets for each child value array must be in order / increasing. 
@@ -420,21 +444,21 @@ An example layout for logical union of: ``` * Length: 4, Null count: 1 * Null bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-7 | + |Byte 0 (validity bitmap) | Bytes 1-63 | |-------------------------|-----------------------| |00001101 | 0 (padding) | * Types buffer: - |Byte 0-1 | Byte 2-3 | Byte 4-5 | Byte 6-7 | - |---------|-------------|----------|----------| - | 0 | unspecified | 0 | 1 | + |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | + |---------|-------------|----------|----------|-------------| + | 0 | unspecified | 0 | 1 | unspecified | * Offset buffer: - |Byte 0-3 | Byte 4-7 | Byte 8-11 | Byte 12-15 | - |---------|-------------|-----------|------------| - | 0 | unspecified | 1 | 0 | + |Byte 0-3 | Byte 4-7 | Byte 8-11 | Byte 12-15 | Bytes 16-63 | + |---------|-------------|-----------|------------|-------------| + | 0 | unspecified | 1 | 0 | unspecified | * Children arrays: * Field-0 array (f: float): @@ -443,9 +467,9 @@ An example layout for logical union of: * Value Buffer: - | Bytes 0-7 | - |-----------| - | 1.2, 3.4 | + | Bytes 0-7 | Bytes 8-63 | + |-----------|-------------| + | 1.2, 3.4 | unspecified | * Field-1 array (f: float): @@ -454,9 +478,9 @@ An example layout for logical union of: * Value Buffer: - | Bytes 0-3 | - |-----------| - | 5 | + | Bytes 0-3 | Bytes 4-63 | + |-----------|-------------| + | 5 | unspecified | ``` ## Sparse union type @@ -484,9 +508,9 @@ will have the following layout: * Types buffer: - | Bytes 0-1 | Bytes 2-3 | Bytes 4-5 | Bytes 6-7 | Bytes 8-9 | Bytes 10-11 | - |------------|-------------|-------------|-------------|-------------|--------------| - | 0 | 1 | 2 | 1 | 0 | 2 | + | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Bytes 6-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 0 | 1 | 2 | 1 | 0 | 2 | unspecified (padding) | * Children arrays: @@ -494,51 +518,51 @@ will have the following layout: * Length: 6, Null count: 4 * Null bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-7 | + |Byte 0 (validity bitmap) | Bytes 1-63 | |-------------------------|-----------------------| |00010001 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | - |------------|-------------|-------------|-------------|-------------|--------------| - | 1 | unspecified | unspecified | unspecified | 4 | unspecified | + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 1 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | * u1 (float): * Length: 6, Null count: 4 * Null bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-7 | + |Byte 0 (validity bitmap) | Bytes 1-63 | |-------------------------|-----------------------| |00001010 | 0 (padding) | * Value buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | - |-------------|-------------|-------------|-------------|-------------|--------------| - | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | * u2 (`List`) 
* Length: 6, Null count: 4 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-7 | + | Byte 0 (validity bitmap) | Bytes 1-63 | |--------------------------|-----------------------| | 00100100 | 0 (padding) | * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | - |------------|-------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 0 | 0 | 3 | 3 | 3 | 7 | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | + |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | * Values array (char array): * Length: 7, Null count: 0 * Null bitmap buffer: Not required - | Bytes 0-7 | - |------------| - | joemark | + | Bytes 0-7 | Bytes 8-63 | + |------------|-----------------------| + | joemark | unspecified (padding) | ``` Note that nested types in a sparse union must be internally consistent @@ -557,3 +581,4 @@ the the types array indicates that a slot contains a different type at the index Drill docs https://drill.apache.org/docs/value-vectors/ [1]: https://en.wikipedia.org/wiki/Bit_numbering +[2]: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors From 355f7c96a194c65bad523466586f51a9ae0e8627 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 1 May 2016 15:53:37 -0700 Subject: [PATCH 065/210] ARROW-92: Arrow to Parquet Schema conversion My current WIP state. To make the actual schema conversion complete, we probably need the physical structure too as Arrow schemas only care about logical types whereas Parquet schema is about logical and physical types. Author: Uwe L. Korn Closes #68 from xhochy/arrow-92 and squashes the following commits: e3aa261 [Uwe L. Korn] Add macro to convert ParquetException to Status 9c5b085 [Uwe L. Korn] Include string 42ed0ea [Uwe L. Korn] Add struct conversion 38e68e5 [Uwe L. Korn] make format 9a6c876 [Uwe L. Korn] Add more types 8a0293e [Uwe L. 
Korn] ARROW-92: Arrow to Parquet Schema conversion --- cpp/src/arrow/parquet/parquet-schema-test.cc | 75 +++++++++++ cpp/src/arrow/parquet/schema.cc | 130 +++++++++++++++++++ cpp/src/arrow/parquet/schema.h | 5 + 3 files changed, 210 insertions(+) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index e2280f41189..8de739491b5 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -161,6 +161,81 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) { } } +class TestConvertArrowSchema : public ::testing::Test { + public: + virtual void SetUp() {} + + void CheckFlatSchema(const std::vector& nodes) { + NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes); + const GroupNode* expected_schema_node = + static_cast(schema_node.get()); + const GroupNode* result_schema_node = + static_cast(result_schema_->schema().get()); + + ASSERT_EQ(expected_schema_node->field_count(), result_schema_node->field_count()); + + for (int i = 0; i < expected_schema_node->field_count(); i++) { + auto lhs = result_schema_node->field(i); + auto rhs = expected_schema_node->field(i); + EXPECT_TRUE(lhs->Equals(rhs.get())); + } + } + + Status ConvertSchema(const std::vector>& fields) { + arrow_schema_ = std::make_shared(fields); + return ToParquetSchema(arrow_schema_.get(), &result_schema_); + } + + protected: + std::shared_ptr arrow_schema_; + std::shared_ptr<::parquet::SchemaDescriptor> result_schema_; +}; + +TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { + std::vector parquet_fields; + std::vector> arrow_fields; + + parquet_fields.push_back( + PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN)); + arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32)); + arrow_fields.push_back(std::make_shared("int32", INT32, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); + arrow_fields.push_back(std::make_shared("int64", INT64, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); + arrow_fields.push_back(std::make_shared("float", FLOAT)); + + parquet_fields.push_back( + PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE)); + arrow_fields.push_back(std::make_shared("double", DOUBLE)); + + // TODO: String types need to be clarified a bit more in the Arrow spec + parquet_fields.push_back(PrimitiveNode::Make( + "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); + arrow_fields.push_back(std::make_shared("string", UTF8)); + + ASSERT_OK(ConvertSchema(arrow_fields)); + + CheckFlatSchema(parquet_fields); +} + +TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) { + std::vector parquet_fields; + std::vector> arrow_fields; + + // TODO: Test Decimal Arrow -> Parquet conversion + + ASSERT_OK(ConvertSchema(arrow_fields)); + + CheckFlatSchema(parquet_fields); +} + TEST(TestNodeConversion, DateAndTime) {} } // namespace parquet diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 560e2837406..214c764f08b 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -17,13 +17,18 @@ #include "arrow/parquet/schema.h" +#include #include #include "parquet/api/schema.h" +#include "parquet/exception.h" #include "arrow/types/decimal.h" +#include 
"arrow/types/string.h" #include "arrow/util/status.h" +using parquet::ParquetException; +using parquet::Repetition; using parquet::schema::Node; using parquet::schema::NodePtr; using parquet::schema::GroupNode; @@ -36,6 +41,11 @@ namespace arrow { namespace parquet { +#define PARQUET_CATCH_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ParquetException& e) { return Status::Invalid(e.what()); } + const auto BOOL = std::make_shared(); const auto UINT8 = std::make_shared(); const auto INT32 = std::make_shared(); @@ -182,6 +192,126 @@ Status FromParquetSchema( return Status::OK(); } +Status StructToNode(const std::shared_ptr& type, const std::string& name, + bool nullable, NodePtr* out) { + Repetition::type repetition = Repetition::REQUIRED; + if (nullable) { repetition = Repetition::OPTIONAL; } + + std::vector children(type->num_children()); + for (int i = 0; i < type->num_children(); i++) { + RETURN_NOT_OK(FieldToNode(type->child(i), &children[i])); + } + + *out = GroupNode::Make(name, repetition, children); + return Status::OK(); +} + +Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { + LogicalType::type logical_type = LogicalType::NONE; + ParquetType::type type; + Repetition::type repetition = Repetition::REQUIRED; + if (field->nullable) { repetition = Repetition::OPTIONAL; } + int length = -1; + + switch (field->type->type) { + // TODO: + // case Type::NA: + // break; + case Type::BOOL: + type = ParquetType::BOOLEAN; + break; + case Type::UINT8: + type = ParquetType::INT32; + logical_type = LogicalType::UINT_8; + break; + case Type::INT8: + type = ParquetType::INT32; + logical_type = LogicalType::INT_8; + break; + case Type::UINT16: + type = ParquetType::INT32; + logical_type = LogicalType::UINT_16; + break; + case Type::INT16: + type = ParquetType::INT32; + logical_type = LogicalType::INT_16; + break; + case Type::UINT32: + type = ParquetType::INT32; + logical_type = LogicalType::UINT_32; + break; + case Type::INT32: + type = ParquetType::INT32; + break; + case Type::UINT64: + type = ParquetType::INT64; + logical_type = LogicalType::UINT_64; + break; + case Type::INT64: + type = ParquetType::INT64; + break; + case Type::FLOAT: + type = ParquetType::FLOAT; + break; + case Type::DOUBLE: + type = ParquetType::DOUBLE; + break; + case Type::CHAR: + type = ParquetType::FIXED_LEN_BYTE_ARRAY; + logical_type = LogicalType::UTF8; + length = static_cast(field->type.get())->size; + break; + case Type::STRING: + type = ParquetType::BYTE_ARRAY; + logical_type = LogicalType::UTF8; + break; + case Type::BINARY: + type = ParquetType::BYTE_ARRAY; + break; + case Type::DATE: + type = ParquetType::INT32; + logical_type = LogicalType::DATE; + break; + case Type::TIMESTAMP: + type = ParquetType::INT64; + logical_type = LogicalType::TIMESTAMP_MILLIS; + break; + case Type::TIMESTAMP_DOUBLE: + type = ParquetType::INT64; + // This is specified as seconds since the UNIX epoch + // TODO: Converted type in Parquet? 
+ // logical_type = LogicalType::TIMESTAMP_MILLIS; + break; + case Type::TIME: + type = ParquetType::INT64; + logical_type = LogicalType::TIME_MILLIS; + break; + case Type::STRUCT: { + auto struct_type = std::static_pointer_cast(field->type); + return StructToNode(struct_type, field->name, field->nullable, out); + } break; + default: + // TODO: LIST, DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL, DECIMAL_TEXT, VARCHAR + return Status::NotImplemented("unhandled type"); + } + *out = PrimitiveNode::Make(field->name, repetition, type, logical_type, length); + return Status::OK(); +} + +Status ToParquetSchema( + const Schema* arrow_schema, std::shared_ptr<::parquet::SchemaDescriptor>* out) { + std::vector nodes(arrow_schema->num_fields()); + for (int i = 0; i < arrow_schema->num_fields(); i++) { + RETURN_NOT_OK(FieldToNode(arrow_schema->field(i), &nodes[i])); + } + + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + *out = std::make_shared<::parquet::SchemaDescriptor>(); + PARQUET_CATCH_NOT_OK((*out)->Init(schema)); + + return Status::OK(); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index a44a9a4b6a8..bfc7d211381 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -36,6 +36,11 @@ Status NodeToField(const ::parquet::schema::NodePtr& node, std::shared_ptr* out); +Status FieldToNode(const std::shared_ptr& field, ::parquet::schema::NodePtr* out); + +Status ToParquetSchema( + const Schema* arrow_schema, std::shared_ptr<::parquet::SchemaDescriptor>* out); + } // namespace parquet } // namespace arrow From ad3d01dd5c47f6d21771a53d437772cf71bee10f Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 3 May 2016 18:23:43 -0700 Subject: [PATCH 066/210] ARROW-188: Add numpy as install requirement Successfully tested with NumPy 1.9 which should be a recent but still old version that we can support for now. Author: Uwe L. Korn Closes #69 from xhochy/arrow-188 and squashes the following commits: 651a9aa [Uwe L. Korn] ARROW-188: Add numpy as install requirement --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index ebd80de46b4..5f228ed0af2 100644 --- a/python/setup.py +++ b/python/setup.py @@ -242,7 +242,7 @@ def get_outputs(self): 'clean': clean, 'build_ext': build_ext }, - install_requires=['cython >= 0.21'], + install_requires=['cython >= 0.21', 'numpy >= 1.9'], description=DESC, license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", From 33022579e31b2448ed227ddf51160d08edd625e3 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 8 May 2016 18:03:28 -0700 Subject: [PATCH 067/210] ARROW-190: Python: Provide installable sdist builds Author: Uwe L. Korn Closes #71 from xhochy/arrow-190 and squashes the following commits: e28db45 [Uwe L. Korn] Add LICENSE and README to MANIFEST f9943f5 [Uwe L. 
Korn] ARROW-190: Python: Provide standalone installable sdist builds --- python/MANIFEST.in | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 python/MANIFEST.in diff --git a/python/MANIFEST.in b/python/MANIFEST.in new file mode 100644 index 00000000000..756879a0bb0 --- /dev/null +++ b/python/MANIFEST.in @@ -0,0 +1,14 @@ +include README.md +include LICENSE.txt + +global-include CMakeLists.txt +graft cmake_modules +recursive-include src/pyarrow *.cc *.h +recursive-include pyarrow *.pxd + +global-exclude *.so +global-exclude *.pyc +global-exclude *~ +global-exclude \#* +global-exclude .git* +global-exclude .DS_Store From c9ffe546b8ddb81851bcff78e4db051942dcc546 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sun, 8 May 2016 22:15:40 -0700 Subject: [PATCH 068/210] ARROW-194: C++: Allow read-only memory mapped source A simple patch to allow read-only mode. A test is also included. Author: Jihoon Son Closes #72 from jihoonson/ARROW-194 and squashes the following commits: f55dd22 [Jihoon Son] Change the type of protection flag from int8_t to int b928031 [Jihoon Son] Add missing initialization 63b99c5 [Jihoon Son] Remove unintended whitespace 22e6128 [Jihoon Son] Simplify error check 5559b8d [Jihoon Son] - Fixed a wrong protection flag in a test - Added a routine to check the protection flag before writing - Added a unit test to check the error status for protection mode - Improved failure check for mmap() d8939fa [Jihoon Son] Allow read-only memory mapped source. --- cpp/src/arrow/ipc/ipc-memory-test.cc | 54 ++++++++++++++++++++++++++-- cpp/src/arrow/ipc/memory.cc | 22 ++++++++---- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-memory-test.cc b/cpp/src/arrow/ipc/ipc-memory-test.cc index 19339212225..a2dbd35728c 100644 --- a/cpp/src/arrow/ipc/ipc-memory-test.cc +++ b/cpp/src/arrow/ipc/ipc-memory-test.cc @@ -26,9 +26,6 @@ #include "arrow/ipc/memory.h" #include "arrow/ipc/test-common.h" -#include "arrow/test-util.h" -#include "arrow/util/buffer.h" -#include "arrow/util/status.h" namespace arrow { namespace ipc { @@ -67,6 +64,57 @@ TEST_F(TestMemoryMappedSource, WriteRead) { } } +TEST_F(TestMemoryMappedSource, ReadOnly) { + const int64_t buffer_size = 1024; + std::vector buffer(buffer_size); + + test::random_bytes(1024, 0, buffer.data()); + + const int reps = 5; + + std::string path = "ipc-read-only-test"; + CreateFile(path, reps * buffer_size); + + std::shared_ptr rwmmap; + ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &rwmmap)); + + int64_t position = 0; + for (int i = 0; i < reps; ++i) { + ASSERT_OK(rwmmap->Write(position, buffer.data(), buffer_size)); + + position += buffer_size; + } + rwmmap->Close(); + + std::shared_ptr rommap; + ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_ONLY, &rommap)); + + position = 0; + std::shared_ptr out_buffer; + for (int i = 0; i < reps; ++i) { + ASSERT_OK(rommap->ReadAt(position, buffer_size, &out_buffer)); + + ASSERT_EQ(0, memcmp(out_buffer->data(), buffer.data(), buffer_size)); + position += buffer_size; + } + rommap->Close(); +} + +TEST_F(TestMemoryMappedSource, InvalidMode) { + const int64_t buffer_size = 1024; + std::vector buffer(buffer_size); + + test::random_bytes(1024, 0, buffer.data()); + + std::string path = "ipc-invalid-mode-test"; + CreateFile(path, buffer_size); + + std::shared_ptr rommap; + ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_ONLY, &rommap)); + + ASSERT_RAISES(IOError, rommap->Write(0, buffer.data(), buffer_size)); +} + 
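The production change these tests pin down (in the memory.cc hunk below) is twofold: the mmap protection flags are derived from the requested access mode, and failure is detected via MAP_FAILED rather than nullptr, since mmap never reports errors by returning nullptr. The pattern in isolation (a sketch, not the patched class):

```
#include <sys/mman.h>

#include <cstdint>
#include <cstdio>

// Map an open file with protection flags derived from the access mode.
// mmap reports failure as MAP_FAILED, not nullptr; testing the wrong
// sentinel silently accepts a bad mapping.
uint8_t* MapFile(FILE* file, int64_t size, bool writable) {
  int prot = PROT_READ;
  if (writable) { prot |= PROT_WRITE; }
  void* result = mmap(nullptr, size, prot, MAP_SHARED, fileno(file), 0);
  return result == MAP_FAILED ? nullptr : reinterpret_cast<uint8_t*>(result);
}
```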
TEST_F(TestMemoryMappedSource, InvalidFile) { std::string non_existent_path = "invalid-file-name-asfd"; diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index caff2c610b9..a6c56d64f4a 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -41,7 +41,7 @@ MemorySource::~MemorySource() {} class MemoryMappedSource::Impl { public: - Impl() : file_(nullptr), is_open_(false), data_(nullptr) {} + Impl() : file_(nullptr), is_open_(false), is_writable_(false), data_(nullptr) {} ~Impl() { if (is_open_) { @@ -53,10 +53,12 @@ class MemoryMappedSource::Impl { Status Open(const std::string& path, MemorySource::AccessMode mode) { if (is_open_) { return Status::IOError("A file is already open"); } - path_ = path; + int prot_flags = PROT_READ; if (mode == MemorySource::READ_WRITE) { file_ = fopen(path.c_str(), "r+b"); + prot_flags |= PROT_WRITE; + is_writable_ = true; } else { file_ = fopen(path.c_str(), "rb"); } @@ -73,14 +75,13 @@ class MemoryMappedSource::Impl { fseek(file_, 0L, SEEK_SET); is_open_ = true; - // TODO(wesm): Add read-only version of this - data_ = reinterpret_cast( - mmap(nullptr, size_, PROT_READ | PROT_WRITE, MAP_SHARED, fileno(file_), 0)); - if (data_ == nullptr) { + void* result = mmap(nullptr, size_, prot_flags, MAP_SHARED, fileno(file_), 0); + if (result == MAP_FAILED) { std::stringstream ss; ss << "Memory mapping file failed, errno: " << errno; return Status::IOError(ss.str()); } + data_ = reinterpret_cast(result); return Status::OK(); } @@ -89,11 +90,15 @@ class MemoryMappedSource::Impl { uint8_t* data() { return data_; } + bool writable() { return is_writable_; } + + bool opened() { return is_open_; } + private: - std::string path_; FILE* file_; int64_t size_; bool is_open_; + bool is_writable_; // The memory map uint8_t* data_; @@ -134,6 +139,9 @@ Status MemoryMappedSource::ReadAt( } Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { + if (!impl_->opened() || !impl_->writable()) { + return Status::IOError("Unable to write"); + } if (position < 0 || position >= impl_->size()) { return Status::Invalid("position is out of bounds"); } From 1f04f7ff90c43efd72b57cc09ba21da1597682d6 Mon Sep 17 00:00:00 2001 From: lfzCarlosC Date: Thu, 5 May 2016 21:58:31 +0200 Subject: [PATCH 069/210] ARROW-193: typos "int his" fix to "in this" --- .../main/java/org/apache/arrow/vector/VariableWidthVector.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java index e227bb4c417..971a241adaf 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VariableWidthVector.java @@ -30,7 +30,7 @@ public interface VariableWidthVector extends ValueVector{ void allocateNew(int totalBytes, int valueCount); /** - * Provide the maximum amount of variable width bytes that can be stored int his vector. + * Provide the maximum amount of variable width bytes that can be stored in this vector. * @return */ int getByteCapacity(); From 4bd13b852d376065fdb16c36fa821ab0e167f0fc Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 10 May 2016 15:58:04 -0700 Subject: [PATCH 070/210] ARROW-91: Basic Parquet read support Depends on (mainly one line fixes): - [x] https://github.com/apache/parquet-cpp/pull/99 - [x] https://github.com/apache/parquet-cpp/pull/98 - [x] https://github.com/apache/parquet-cpp/pull/97 Author: Uwe L. Korn Author: Wes McKinney Closes #73 from xhochy/arrow-91 and squashes the following commits: 7579fed [Uwe L. Korn] Mark single argument constructor as explicit 47441a1 [Uwe L. Korn] Assert that no exception was thrown 5fa1026 [Uwe L. Korn] Incorporate review comments 8d2db22 [Uwe L. Korn] ARROW-91: Basic Parquet read support d9940d8 [Wes McKinney] Public API draft --- cpp/src/arrow/parquet/CMakeLists.txt | 4 + cpp/src/arrow/parquet/parquet-reader-test.cc | 116 +++++++++++ cpp/src/arrow/parquet/reader.cc | 194 +++++++++++++++++++ cpp/src/arrow/parquet/reader.h | 134 +++++++++++++ cpp/src/arrow/parquet/schema.cc | 8 +- cpp/src/arrow/parquet/schema.h | 2 +- cpp/src/arrow/parquet/utils.h | 38 ++++ 7 files changed, 488 insertions(+), 8 deletions(-) create mode 100644 cpp/src/arrow/parquet/parquet-reader-test.cc create mode 100644 cpp/src/arrow/parquet/reader.cc create mode 100644 cpp/src/arrow/parquet/reader.h create mode 100644 cpp/src/arrow/parquet/utils.h diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 0d5cf263ec3..1ae6709652e 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,6 +19,7 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + reader.cc schema.cc ) @@ -36,6 +37,9 @@ SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) ADD_ARROW_TEST(parquet-schema-test) ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) +ADD_ARROW_TEST(parquet-reader-test) +ARROW_TEST_LINK_LIBRARIES(parquet-reader-test arrow_parquet) + # Headers: top level install(FILES DESTINATION include/arrow/parquet) diff --git a/cpp/src/arrow/parquet/parquet-reader-test.cc b/cpp/src/arrow/parquet/parquet-reader-test.cc new file mode 100644 index 00000000000..a7fc2a89f5f --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-reader-test.cc @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "gtest/gtest.h" + +#include "arrow/test-util.h" +#include "arrow/parquet/reader.h" +#include "arrow/types/primitive.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +#include "parquet/api/reader.h" +#include "parquet/api/writer.h" + +using ParquetBuffer = parquet::Buffer; +using parquet::BufferReader; +using parquet::InMemoryOutputStream; +using parquet::Int64Writer; +using parquet::ParquetFileReader; +using parquet::ParquetFileWriter; +using parquet::RandomAccessSource; +using parquet::Repetition; +using parquet::SchemaDescriptor; +using ParquetType = parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::NodePtr; +using parquet::schema::PrimitiveNode; + +namespace arrow { + +namespace parquet { + +class TestReadParquet : public ::testing::Test { + public: + virtual void SetUp() {} + + std::shared_ptr Int64Schema() { + auto pnode = PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64); + NodePtr node_ = + GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); + return std::static_pointer_cast(node_); + } + + std::unique_ptr Int64File( + std::vector& values, int num_chunks) { + std::shared_ptr schema = Int64Schema(); + std::shared_ptr sink(new InMemoryOutputStream()); + auto file_writer = ParquetFileWriter::Open(sink, schema); + size_t chunk_size = values.size() / num_chunks; + for (int i = 0; i < num_chunks; i++) { + auto row_group_writer = file_writer->AppendRowGroup(chunk_size); + auto column_writer = static_cast(row_group_writer->NextColumn()); + int64_t* data = values.data() + i * chunk_size; + column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); + column_writer->Close(); + row_group_writer->Close(); + } + file_writer->Close(); + + std::shared_ptr buffer = sink->GetBuffer(); + std::unique_ptr source(new BufferReader(buffer)); + return ParquetFileReader::Open(std::move(source)); + } + + private: +}; + +TEST_F(TestReadParquet, SingleColumnInt64) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 1); + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + std::unique_ptr column_reader; + ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); + ASSERT_NE(nullptr, column_reader.get()); + std::shared_ptr out; + ASSERT_OK(column_reader->NextBatch(100, &out)); + ASSERT_NE(nullptr, out.get()); + Int64Array* out_array = static_cast(out.get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + +TEST_F(TestReadParquet, SingleColumnInt64Chunked) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 4); + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + std::unique_ptr column_reader; + ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); + ASSERT_NE(nullptr, column_reader.get()); + std::shared_ptr out; + ASSERT_OK(column_reader->NextBatch(100, &out)); + ASSERT_NE(nullptr, out.get()); + Int64Array* out_array = static_cast(out.get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc new file mode 100644 index 00000000000..481ded5789a --- /dev/null +++ b/cpp/src/arrow/parquet/reader.cc @@ -0,0 +1,194 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/parquet/reader.h" + +#include + +#include "arrow/parquet/schema.h" +#include "arrow/parquet/utils.h" +#include "arrow/schema.h" +#include "arrow/types/primitive.h" +#include "arrow/util/status.h" + +using parquet::ColumnReader; +using parquet::TypedColumnReader; + +namespace arrow { +namespace parquet { + +class FileReader::Impl { + public: + Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader); + virtual ~Impl() {} + + Status GetFlatColumn(int i, std::unique_ptr* out); + Status ReadFlatColumn(int i, std::shared_ptr* out); + + private: + MemoryPool* pool_; + std::unique_ptr<::parquet::ParquetFileReader> reader_; +}; + +class FlatColumnReader::Impl { + public: + Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, + ::parquet::ParquetFileReader* reader, int column_index); + virtual ~Impl() {} + + Status NextBatch(int batch_size, std::shared_ptr* out); + template + Status TypedReadBatch(int batch_size, std::shared_ptr* out); + + private: + void NextRowGroup(); + + MemoryPool* pool_; + const ::parquet::ColumnDescriptor* descr_; + ::parquet::ParquetFileReader* reader_; + int column_index_; + int next_row_group_; + std::shared_ptr column_reader_; + std::shared_ptr field_; + + PoolBuffer values_buffer_; + PoolBuffer def_levels_buffer_; + PoolBuffer rep_levels_buffer_; +}; + +FileReader::Impl::Impl( + MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader) + : pool_(pool), reader_(std::move(reader)) {} + +Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr* out) { + std::unique_ptr impl( + new FlatColumnReader::Impl(pool_, reader_->descr()->Column(i), reader_.get(), i)); + *out = std::unique_ptr(new FlatColumnReader(std::move(impl))); + return Status::OK(); +} + +Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { + std::unique_ptr flat_column_reader; + RETURN_NOT_OK(GetFlatColumn(i, &flat_column_reader)); + return flat_column_reader->NextBatch(reader_->num_rows(), out); +} + +FileReader::FileReader( + MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader) + : impl_(new FileReader::Impl(pool, std::move(reader))) {} + +FileReader::~FileReader() {} + +Status FileReader::GetFlatColumn(int i, std::unique_ptr* out) { + return impl_->GetFlatColumn(i, out); +} + +Status FileReader::ReadFlatColumn(int i, std::shared_ptr* out) { + return impl_->ReadFlatColumn(i, out); +} + +FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, + ::parquet::ParquetFileReader* reader, int column_index) + : pool_(pool), + descr_(descr), + reader_(reader), + column_index_(column_index), + next_row_group_(0), + values_buffer_(pool), + def_levels_buffer_(pool), + rep_levels_buffer_(pool) { + NodeToField(descr_->schema_node(), &field_); + NextRowGroup(); +} + +template +Status 
FlatColumnReader::Impl::TypedReadBatch( + int batch_size, std::shared_ptr* out) { + int values_to_read = batch_size; + NumericBuilder builder(pool_, field_->type); + while ((values_to_read > 0) && column_reader_) { + values_buffer_.Resize(values_to_read * sizeof(CType)); + if (descr_->max_definition_level() > 0) { + def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + } + if (descr_->max_repetition_level() > 0) { + rep_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + } + auto reader = dynamic_cast*>(column_reader_.get()); + int64_t values_read; + CType* values = reinterpret_cast(values_buffer_.mutable_data()); + PARQUET_CATCH_NOT_OK( + values_to_read -= reader->ReadBatch(values_to_read, + reinterpret_cast(def_levels_buffer_.mutable_data()), + reinterpret_cast(rep_levels_buffer_.mutable_data()), values, + &values_read)); + if (descr_->max_definition_level() == 0) { + RETURN_NOT_OK(builder.Append(values, values_read)); + } else { + return Status::NotImplemented("no support for definition levels yet"); + } + if (!column_reader_->HasNext()) { NextRowGroup(); } + } + *out = builder.Finish(); + return Status::OK(); +} + +#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType, CType) \ + case Type::ENUM: \ + return TypedReadBatch(batch_size, out); \ + break; + +Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr* out) { + if (!column_reader_) { + // Exhausted all row groups. + *out = nullptr; + return Status::OK(); + } + + if (descr_->max_repetition_level() > 0) { + return Status::NotImplemented("no support for repetition yet"); + } + + switch (field_->type->type) { + TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type, int32_t) + TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type, int64_t) + TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType, float) + TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType, double) + default: + return Status::NotImplemented(field_->type->ToString()); + } +} + +void FlatColumnReader::Impl::NextRowGroup() { + if (next_row_group_ < reader_->num_row_groups()) { + column_reader_ = reader_->RowGroup(next_row_group_)->Column(column_index_); + next_row_group_++; + } else { + column_reader_ = nullptr; + } +} + +FlatColumnReader::FlatColumnReader(std::unique_ptr impl) : impl_(std::move(impl)) {} + +FlatColumnReader::~FlatColumnReader() {} + +Status FlatColumnReader::NextBatch(int batch_size, std::shared_ptr* out) { + return impl_->NextBatch(batch_size, out); +} + +} // namespace parquet +} // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h new file mode 100644 index 00000000000..41ca7eb35b9 --- /dev/null +++ b/cpp/src/arrow/parquet/reader.h @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
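The header that follows documents the intended consumption pattern; in miniature, reading one column looks like this (a usage sketch against the API added in this patch — the batch size and the ReadColumn name are illustrative):

```
#include <memory>

#include "arrow/parquet/reader.h"  // the header added below
#include "arrow/util/status.h"     // RETURN_NOT_OK

// Stream column i of a Parquet file into Arrow arrays, batch by batch.
arrow::Status ReadColumn(arrow::parquet::FileReader* reader, int i) {
  std::unique_ptr<arrow::parquet::FlatColumnReader> column;
  RETURN_NOT_OK(reader->GetFlatColumn(i, &column));
  std::shared_ptr<arrow::Array> chunk;
  while (true) {
    RETURN_NOT_OK(column->NextBatch(4096, &chunk));
    if (chunk == nullptr) { break; }  // NextBatch signals exhaustion with nullptr
    // ... consume chunk: one fully materialized arrow::Array ...
  }
  return arrow::Status::OK();
}
```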
+ +#ifndef ARROW_PARQUET_READER_H +#define ARROW_PARQUET_READER_H + +#include + +#include "parquet/api/reader.h" +#include "parquet/api/schema.h" + +namespace arrow { + +class Array; +class MemoryPool; +class RowBatch; +class Status; + +namespace parquet { + +class FlatColumnReader; + +// Arrow read adapter class for deserializing Parquet files as Arrow row +// batches. +// +// TODO(wesm): nested data does not always make sense with this user +// interface unless you are only reading a single leaf node from a branch of +// a table. For example: +// +// repeated group data { +// optional group record { +// optional int32 val1; +// optional byte_array val2; +// optional bool val3; +// } +// optional int32 val4; +// } +// +// In the Parquet file, there are 3 leaf nodes: +// +// * data.record.val1 +// * data.record.val2 +// * data.record.val3 +// * data.val4 +// +// When materializing this data in an Arrow array, we would have: +// +// data: list), +// val3: bool, +// >, +// val4: int32 +// >> +// +// However, in the Parquet format, each leaf node has its own repetition and +// definition levels describing the structure of the intermediate nodes in +// this array structure. Thus, we will need to scan the leaf data for a group +// of leaf nodes part of the same type tree to create a single result Arrow +// nested array structure. +// +// This is additionally complicated "chunky" repeated fields or very large byte +// arrays +class FileReader { + public: + FileReader(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader); + + // Since the distribution of columns amongst a Parquet file's row groups may + // be uneven (the number of values in each column chunk can be different), we + // provide a column-oriented read interface. The ColumnReader hides the + // details of paging through the file's row groups and yielding + // fully-materialized arrow::Array instances + // + // Returns error status if the column of interest is not flat. + Status GetFlatColumn(int i, std::unique_ptr* out); + // Read column as a whole into an Array. + Status ReadFlatColumn(int i, std::shared_ptr* out); + + virtual ~FileReader(); + + private: + class Impl; + std::unique_ptr impl_; +}; + +// At this point, the column reader is a stream iterator. It only knows how to +// read the next batch of values for a particular column from the file until it +// runs out. +// +// We also do not expose any internal Parquet details, such as row groups. This +// might change in the future. +class FlatColumnReader { + public: + virtual ~FlatColumnReader(); + + // Scan the next array of the indicated size. The actual size of the + // returned array may be less than the passed size depending how much data is + // available in the file. + // + // When all the data in the file has been exhausted, the result is set to + // nullptr. + // + // Returns Status::OK on a successful read, including if you have exhausted + // the data available in the file. 
+ Status NextBatch(int batch_size, std::shared_ptr* out); + + private: + class Impl; + std::unique_ptr impl_; + explicit FlatColumnReader(std::unique_ptr impl); + + friend class FileReader; +}; + +} // namespace parquet + +} // namespace arrow + +#endif // ARROW_PARQUET_READER_H diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index 214c764f08b..fd758940c9f 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -21,13 +21,12 @@ #include #include "parquet/api/schema.h" -#include "parquet/exception.h" +#include "arrow/parquet/utils.h" #include "arrow/types/decimal.h" #include "arrow/types/string.h" #include "arrow/util/status.h" -using parquet::ParquetException; using parquet::Repetition; using parquet::schema::Node; using parquet::schema::NodePtr; @@ -41,11 +40,6 @@ namespace arrow { namespace parquet { -#define PARQUET_CATCH_NOT_OK(s) \ - try { \ - (s); \ - } catch (const ParquetException& e) { return Status::Invalid(e.what()); } - const auto BOOL = std::make_shared(); const auto UINT8 = std::make_shared(); const auto INT32 = std::make_shared(); diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index bfc7d211381..ec5f96062e8 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -45,4 +45,4 @@ Status ToParquetSchema( } // namespace arrow -#endif +#endif // ARROW_PARQUET_SCHEMA_H diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h new file mode 100644 index 00000000000..b32792fdf70 --- /dev/null +++ b/cpp/src/arrow/parquet/utils.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PARQUET_UTILS_H +#define ARROW_PARQUET_UTILS_H + +#include "arrow/util/status.h" + +#include "parquet/exception.h" + +namespace arrow { + +namespace parquet { + +#define PARQUET_CATCH_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } + +} // namespace parquet + +} // namespace arrow + +#endif // ARROW_PARQUET_UTILS_H From 68b80a83876b1306f80d3914eef98f51100a8009 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 14 May 2016 18:53:22 -0700 Subject: [PATCH 071/210] ARROW-197: Working first draft of a conda recipe for pyarrow Includes ARROW-196. I will close that PR and merge these together as I had to make some additional changes. Requires PARQUET-617. 
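One subtlety of the PARQUET_CATCH_NOT_OK macro added in utils.h above: its expansion names Status unqualified, so call sites must live inside namespace arrow, as all of this patch's do. A self-contained sketch of the bridge — converting Parquet's thrown ParquetException into a returned Status (MayThrow and SafeCall are hypothetical):

```
#include "arrow/parquet/utils.h"  // PARQUET_CATCH_NOT_OK
#include "parquet/exception.h"

namespace arrow {
namespace parquet {

// Stand-in for any Parquet API call that may throw.
void MayThrow() { throw ::parquet::ParquetException("boom"); }

// The macro converts an escaping ParquetException into Status::Invalid,
// keeping this function's error reporting purely Status-based.
Status SafeCall() {
  PARQUET_CATCH_NOT_OK(MayThrow());
  return Status::OK();
}

}  // namespace parquet
}  // namespace arrow
```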
Closes #76

Author: Wes McKinney

Closes #77 from wesm/ARROW-197 and squashes the following commits:

4bf3d2c [Wes McKinney] Finagle toolchain environment variables to get pyarrow conda package working
c2d3684 [Wes McKinney] Add conda recipe and ensure that libarrow_parquet is installed as well
---
 cpp/conda.recipe/build.sh            | 45 ++++++++++++++++++++++++++++
 cpp/conda.recipe/meta.yaml           | 32 ++++++++++++++++++++
 cpp/src/arrow/parquet/CMakeLists.txt |  7 +++++
 cpp/src/arrow/types/primitive.h      |  1 +
 python/conda.recipe/build.sh         | 18 +++++++++++
 python/conda.recipe/meta.yaml        | 41 +++++++++++++++++++++++++
 6 files changed, 144 insertions(+)
 create mode 100644 cpp/conda.recipe/build.sh
 create mode 100644 cpp/conda.recipe/meta.yaml
 create mode 100644 python/conda.recipe/build.sh
 create mode 100644 python/conda.recipe/meta.yaml
diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh
new file mode 100644
index 00000000000..ac1f9c89cc9
--- /dev/null
+++ b/cpp/conda.recipe/build.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+set -e
+set -x
+
+cd $RECIPE_DIR
+
+# Build dependencies
+export FLATBUFFERS_HOME=$PREFIX
+export PARQUET_HOME=$PREFIX
+
+cd ..
+
+rm -rf conda-build
+mkdir conda-build
+
+cp -r thirdparty conda-build/
+
+cd conda-build
+pwd
+
+# Build googletest for running unit tests
+./thirdparty/download_thirdparty.sh
+./thirdparty/build_thirdparty.sh gtest
+
+source thirdparty/versions.sh
+export GTEST_HOME=`pwd`/thirdparty/$GTEST_BASEDIR
+
+if [ `uname` == Linux ]; then
+  SHARED_LINKER_FLAGS='-static-libstdc++'
+elif [ `uname` == Darwin ]; then
+  SHARED_LINKER_FLAGS=''
+fi
+
+cmake \
+    -DCMAKE_BUILD_TYPE=debug \
+    -DCMAKE_INSTALL_PREFIX=$PREFIX \
+    -DCMAKE_SHARED_LINKER_FLAGS=$SHARED_LINKER_FLAGS \
+    -DARROW_IPC=on \
+    -DARROW_PARQUET=on \
+    ..
+
+make
+ctest -L unittest
+make install
diff --git a/cpp/conda.recipe/meta.yaml b/cpp/conda.recipe/meta.yaml
new file mode 100644
index 00000000000..2e834d5cbf8
--- /dev/null
+++ b/cpp/conda.recipe/meta.yaml
@@ -0,0 +1,32 @@
+package:
+  name: arrow-cpp
+  version: "0.1"
+
+build:
+  number: {{environ.get('TRAVIS_BUILD_NUMBER', 0)}}  # [unix]
+  skip: true  # [win]
+  script_env:
+    - CC  [linux]
+    - CXX  [linux]
+    - LD_LIBRARY_PATH  [linux]
+
+requirements:
+  build:
+    - cmake
+    - flatbuffers
+    - parquet-cpp
+    - thrift-cpp
+
+  run:
+    - parquet-cpp
+
+test:
+  commands:
+    - test -f $PREFIX/lib/libarrow.so
+    - test -f $PREFIX/lib/libarrow_parquet.so
+    - test -f $PREFIX/include/arrow/api.h
+
+about:
+  home: http://github.com/apache/arrow
+  license: Apache 2.0
+  summary: 'C++ libraries for the reference Apache Arrow implementation'
diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt
index 1ae6709652e..cd6f05d6b5f 100644
--- a/cpp/src/arrow/parquet/CMakeLists.txt
+++ b/cpp/src/arrow/parquet/CMakeLists.txt
@@ -42,4 +42,11 @@ ARROW_TEST_LINK_LIBRARIES(parquet-reader-test arrow_parquet)
 
 # Headers: top level
 install(FILES
+  reader.h
+  schema.h
+  utils.h
   DESTINATION include/arrow/parquet)
+
+install(TARGETS arrow_parquet
+  LIBRARY DESTINATION lib
+  ARCHIVE DESTINATION lib)
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index 6f6b2fed5a3..fc45f6c5b05 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -136,6 +136,7 @@ class NumericBuilder : public PrimitiveBuilder<T> {
   using PrimitiveBuilder<T>::Append;
   using PrimitiveBuilder<T>::Init;
   using PrimitiveBuilder<T>::Resize;
+  using PrimitiveBuilder<T>::Reserve;
 
   // Scalar append.
   void Append(value_type val) {
diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh
new file mode 100644
index 00000000000..a9d9aedead3
--- /dev/null
+++ b/python/conda.recipe/build.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -ex
+
+# Build dependency
+export ARROW_HOME=$PREFIX
+
+cd $RECIPE_DIR
+
+echo Setting the compiler...
+if [ `uname` == Linux ]; then
+  EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++
+elif [ `uname` == Darwin ]; then
+  EXTRA_CMAKE_ARGS=
+fi
+
+cd ..
+$PYTHON setup.py build_ext --extra-cmake-args=$EXTRA_CMAKE_ARGS || exit 1
+$PYTHON setup.py install || exit 1
diff --git a/python/conda.recipe/meta.yaml b/python/conda.recipe/meta.yaml
new file mode 100644
index 00000000000..85d24b6bc32
--- /dev/null
+++ b/python/conda.recipe/meta.yaml
@@ -0,0 +1,41 @@
+package:
+  name: pyarrow
+  version: "0.1"
+
+build:
+  number: {{environ.get('TRAVIS_BUILD_NUMBER', 0)}}  # [unix]
+  rpaths:
+    - lib  # [unix]
+    - lib/python{{environ.get('PY_VER')}}/site-packages/pyarrow  # [unix]
+  script_env:
+    - CC  [linux]
+    - CXX  [linux]
+    - LD_LIBRARY_PATH  [linux]
+  skip: true  # [win]
+
+requirements:
+  build:
+    - cmake
+    - python
+    - setuptools
+    - cython
+    - numpy
+    - pandas
+    - arrow-cpp
+    - pytest
+
+  run:
+    - arrow-cpp
+    - python
+    - numpy
+    - pandas
+    - six
+
+test:
+  imports:
+    - pyarrow
+
+about:
+  home: http://github.com/apache/arrow
+  license: Apache 2.0
+  summary: 'Python bindings for Arrow C++ and interoperability tool for pandas and NumPy'
From 6968ec01d722584e9561dc3c0438bce29c664b5a Mon Sep 17 00:00:00 2001
From: hzhang2
Date: Sat, 14 May 2016 19:07:44 -0700
Subject: [PATCH 072/210] ARROW-199: [C++] Refine third party dependency

To generate the makefile, running download_thirdparty.sh and
build_thirdparty.sh is not enough; sourcing setup_build_env.sh is also
necessary, since FLATBUFFERS_HOME must be set.

Author: hzhang2

Closes #75 from zhangh43/arrow2 and squashes the following commits:

ea3101b [hzhang2] remove CMAKE_SKIP_INSTALL_ALL_DEPENDENCY for target install and fix typo
8c02a38 [hzhang2] ARROW-199: [C++] Refine third party dependency
b2312e0 [hzhang2] ARROW-199: [C++] Refine third party dependency
fefc314 [hzhang2] FLATBUFFERS_HOME must be set before cmake
---
 cpp/CMakeLists.txt                   |  5 -----
 cpp/README.md                        |  1 +
 cpp/setup_build_env.sh               |  6 +-----
 cpp/thirdparty/set_thirdparty_env.sh | 12 ++++++++++++
 4 files changed, 14 insertions(+), 10 deletions(-)
 create mode 100755 cpp/thirdparty/set_thirdparty_env.sh
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b38f91e5d68..a3fb01076d4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -25,11 +25,6 @@ include(CMakeParseArguments)
 set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support")
 set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")
 
-# Allow "make install" to not depend on all targets.
-#
-# Must be declared in the top-level CMakeLists.txt.
-set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true)
-
 find_package(ClangTools)
 if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)
   # Generate a Clang compile_commands.json "compilation database" file for use
diff --git a/cpp/README.md b/cpp/README.md
index c8cd86fedc6..129c5f15b15 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -13,6 +13,7 @@ To build the thirdparty build dependencies, run:
 ```
 ./thirdparty/download_thirdparty.sh
 ./thirdparty/build_thirdparty.sh
+source ./thirdparty/set_thirdparty_env.sh
 ```
 
 You can also run from the root of the C++ tree
diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh
index 6520dbd43f7..fa779fdd5c2 100755
--- a/cpp/setup_build_env.sh
+++ b/cpp/setup_build_env.sh
@@ -4,10 +4,6 @@ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
 ./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; }
 ./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; }
 
-source thirdparty/versions.sh
-
-export GTEST_HOME=$SOURCE_DIR/thirdparty/$GTEST_BASEDIR
-export GBENCHMARK_HOME=$SOURCE_DIR/thirdparty/installed
-export FLATBUFFERS_HOME=$SOURCE_DIR/thirdparty/installed
+source ./thirdparty/set_thirdparty_env.sh || { echo "source set_thirdparty_env.sh failed" ; return; }
 
 echo "Build env initialized"
diff --git a/cpp/thirdparty/set_thirdparty_env.sh b/cpp/thirdparty/set_thirdparty_env.sh
new file mode 100755
index 00000000000..7e9531cd508
--- /dev/null
+++ b/cpp/thirdparty/set_thirdparty_env.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
+source $SOURCE_DIR/versions.sh
+
+if [ -z "$THIRDPARTY_DIR" ]; then
+  THIRDPARTY_DIR=$SOURCE_DIR
+fi
+
+export GTEST_HOME=$THIRDPARTY_DIR/$GTEST_BASEDIR
+export GBENCHMARK_HOME=$THIRDPARTY_DIR/installed
+export FLATBUFFERS_HOME=$THIRDPARTY_DIR/installed
From 9c59158b4dc84e4de8e9271430befb840e523a4c Mon Sep 17 00:00:00 2001
From: Micah Kornfield
Date: Tue, 17 May 2016 16:46:40 -0700
Subject: [PATCH 073/210] ARROW-185: Make padding and alignment for all
 buffers be 64 bytes

+ some small cleanup/removal of unnecessary code. I think there is likely a
good opportunity to factor this code better generally, but this seems to
work for now.
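For reference, the 64-byte rounding this patch introduces relies on the
standard power-of-two round-up identity: since 64 is a power of two, adding
63 and masking off the low six bits yields the smallest multiple of 64
greater than or equal to the input. A minimal sketch of the idea (the name
RoundUp64 is illustrative; the patch's RoundUpToMultipleOf64 additionally
guards against overflow):

    // e.g. 0 -> 0, 1 -> 64, 64 -> 64, 65 -> 128
    int64_t RoundUp64(int64_t num) {
      return (num + 63) & ~static_cast<int64_t>(63);
    }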
Author: Micah Kornfield

Closes #74 from emkornfield/emk_fix_allocations_PR and squashes the following commits:

e3cca14 [Micah Kornfield] fix cast style
1d006d8 [Micah Kornfield] fix warning
c140e04 [Micah Kornfield] fix lint
7543267 [Micah Kornfield] cleanup
11b3fd7 [Micah Kornfield] replace cython string conversion with string builder
05653cb [Micah Kornfield] add back in memsets because they make valgrind happy
6ff3048 [Micah Kornfield] ARROW-185: Make padding and alignment for all buffers be 64 bytes
---
 cpp/src/arrow/builder.cc               | 11 +++++--
 cpp/src/arrow/ipc/adapter.cc           | 20 ++++++++++++-
 cpp/src/arrow/ipc/ipc-adapter-test.cc  |  6 ++--
 cpp/src/arrow/types/list.cc            |  2 +-
 cpp/src/arrow/types/list.h             |  3 ++
 cpp/src/arrow/types/primitive.cc       |  7 ++---
 cpp/src/arrow/util/bit-util-test.cc    | 10 +++++++
 cpp/src/arrow/util/bit-util.h          |  4 +++
 cpp/src/arrow/util/buffer.cc           | 17 +++++++++++
 cpp/src/arrow/util/buffer.h            | 34 +++++++++++++--------
 cpp/src/arrow/util/memory-pool-test.cc |  1 +
 cpp/src/arrow/util/memory-pool.cc      | 31 ++++++++++++++-----
 python/src/pyarrow/adapters/pandas.cc  | 41 ++++++--------------------
 13 files changed, 124 insertions(+), 63 deletions(-)
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 87c1219025d..1fba9616922 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -45,12 +45,14 @@ Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length)
 }
 
 Status ArrayBuilder::Init(int32_t capacity) {
-  capacity_ = capacity;
   int32_t to_alloc = util::ceil_byte(capacity) / 8;
   null_bitmap_ = std::make_shared<PoolBuffer>(pool_);
   RETURN_NOT_OK(null_bitmap_->Resize(to_alloc));
+  // Buffers might allocate more than necessary to satisfy padding requirements
+  const int byte_capacity = null_bitmap_->capacity();
+  capacity_ = capacity;
   null_bitmap_data_ = null_bitmap_->mutable_data();
-  memset(null_bitmap_data_, 0, to_alloc);
+  memset(null_bitmap_data_, 0, byte_capacity);
   return Status::OK();
 }
@@ -60,8 +62,11 @@ Status ArrayBuilder::Resize(int32_t new_bits) {
   int32_t old_bytes = null_bitmap_->size();
   RETURN_NOT_OK(null_bitmap_->Resize(new_bytes));
   null_bitmap_data_ = null_bitmap_->mutable_data();
+  // The buffer might be overpadded to deal with padding according to the spec
+  const int32_t byte_capacity = null_bitmap_->capacity();
+  capacity_ = new_bits;
   if (old_bytes < new_bytes) {
-    memset(null_bitmap_data_ + old_bytes, 0, new_bytes - old_bytes);
+    memset(null_bitmap_data_ + old_bytes, 0, byte_capacity - old_bytes);
   }
   return Status::OK();
 }
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 34700080746..45cc288cd6b 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -43,6 +43,15 @@ namespace flatbuf = apache::arrow::flatbuf;
 
 namespace ipc {
 
+namespace {
+Status CheckMultipleOf64(int64_t size) {
+  if (util::is_multiple_of_64(size)) { return Status::OK(); }
+  return Status::Invalid(
+      "Attempted to write a buffer that "
+      "wasn't a multiple of 64 bytes");
+}
+}
+
 static bool IsPrimitive(const DataType* type) {
   DCHECK(type != nullptr);
   switch (type->type) {
@@ -115,6 +124,8 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
   } else if (arr->type_enum() == Type::STRUCT) {
     // TODO(wesm)
     return Status::NotImplemented("Struct type");
+  } else {
+    return Status::NotImplemented("Unrecognized type");
   }
   return Status::OK();
 }
@@ -142,7 +153,13 @@ class RowBatchWriter {
       int64_t size = 0;
 
       // The buffer might be null if we are handling zero row lengths.
-      if (buffer) { size = buffer->size(); }
+      if (buffer) {
+        // We use capacity here, because size might not reflect the padding
+        // requirements of buffers but capacity always should.
+        size = buffer->capacity();
+        // check that padding is appropriate
+        RETURN_NOT_OK(CheckMultipleOf64(size));
+      }
       // TODO(wesm): We currently have no notion of shared memory page id's,
       // but we've included it in the metadata IDL for when we have it in the
       // future. Use page=0 for now
@@ -305,6 +322,7 @@ class RowBatchReader::Impl {
 
   Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
     BufferMetadata metadata = metadata_->buffer(buffer_index);
+    RETURN_NOT_OK(CheckMultipleOf64(metadata.length));
     return source_->ReadAt(metadata.offset, metadata.length, out);
   }
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
index 3b147343f77..eb47ac6fee8 100644
--- a/cpp/src/arrow/ipc/ipc-adapter-test.cc
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -197,8 +197,8 @@ INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch,
 
 void TestGetRowBatchSize(std::shared_ptr<RowBatch> batch) {
   MockMemorySource mock_source(1 << 16);
-  int64_t mock_header_location;
-  int64_t size;
+  int64_t mock_header_location = -1;
+  int64_t size = -1;
   ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location));
   ASSERT_OK(GetRowBatchSize(batch.get(), &size));
   ASSERT_EQ(mock_source.GetExtentBytesWritten(), size);
@@ -270,7 +270,7 @@ TEST_F(RecursionLimits, WriteLimit) {
 }
 
 TEST_F(RecursionLimits, ReadLimit) {
-  int64_t header_location;
+  int64_t header_location = -1;
   std::shared_ptr<Schema> schema;
   ASSERT_OK(WriteToMmap(64, true, &header_location, &schema));
diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc
index fc3331139c6..76e7fe5f4d4 100644
--- a/cpp/src/arrow/types/list.cc
+++ b/cpp/src/arrow/types/list.cc
@@ -47,7 +47,7 @@ bool ListArray::Equals(const std::shared_ptr<Array>& arr) const {
 Status ListArray::Validate() const {
   if (length_ < 0) { return Status::Invalid("Length was negative"); }
   if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); }
-  if (offset_buf_->size() / sizeof(int32_t) < length_) {
+  if (offset_buf_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
     std::stringstream ss;
     ss << "offset buffer size (bytes): " << offset_buf_->size()
        << " isn't large enough for length: " << length_;
diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h
index e2302d917b8..a020b8ad226 100644
--- a/cpp/src/arrow/types/list.h
+++ b/cpp/src/arrow/types/list.h
@@ -20,6 +20,7 @@
 #include <memory>
 #include <string>
+#include <limits>
 #include <vector>
 
 #include "arrow/array.h"
@@ -113,12 +114,14 @@ class ListBuilder : public ArrayBuilder {
         values_(values) {}
 
   Status Init(int32_t elements) override {
+    DCHECK_LT(elements, std::numeric_limits<int32_t>::max());
     RETURN_NOT_OK(ArrayBuilder::Init(elements));
     // one more than requested for offsets
     return offset_builder_.Resize((elements + 1) * sizeof(int32_t));
   }
 
   Status Resize(int32_t capacity) override {
+    DCHECK_LT(capacity, std::numeric_limits<int32_t>::max());
     // one more than requested for offsets
     RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t)));
     return ArrayBuilder::Resize(capacity);
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 9102c530e25..57a3f1e4e15 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -76,6 +76,7 @@ Status PrimitiveBuilder<T>::Init(int32_t capacity) {
   int64_t nbytes = type_traits<T>::bytes_required(capacity);
   RETURN_NOT_OK(data_->Resize(nbytes));
+  // TODO(emkornfield) valgrind complains without this
   memset(data_->mutable_data(), 0, nbytes);
 
   raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
@@ -91,15 +92,13 @@ Status PrimitiveBuilder<T>::Resize(int32_t capacity) {
     RETURN_NOT_OK(Init(capacity));
   } else {
     RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
-
-    int64_t old_bytes = data_->size();
-    int64_t new_bytes = type_traits<T>::bytes_required(capacity);
+    const int64_t old_bytes = data_->size();
+    const int64_t new_bytes = type_traits<T>::bytes_required(capacity);
     RETURN_NOT_OK(data_->Resize(new_bytes));
     raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
     memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
   }
-  capacity_ = capacity;
   return Status::OK();
 }
diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc
index 26554d2c906..e1d8a0808b4 100644
--- a/cpp/src/arrow/util/bit-util-test.cc
+++ b/cpp/src/arrow/util/bit-util-test.cc
@@ -21,6 +21,16 @@
 
 namespace arrow {
 
+TEST(UtilTests, TestIsMultipleOf64) {
+  using util::is_multiple_of_64;
+  EXPECT_TRUE(is_multiple_of_64(64));
+  EXPECT_TRUE(is_multiple_of_64(0));
+  EXPECT_TRUE(is_multiple_of_64(128));
+  EXPECT_TRUE(is_multiple_of_64(192));
+  EXPECT_FALSE(is_multiple_of_64(23));
+  EXPECT_FALSE(is_multiple_of_64(32));
+}
+
 TEST(UtilTests, TestNextPower2) {
   using util::next_power2;
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index 1f0f08c4d88..a6c8dd904d8 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -71,6 +71,10 @@ static inline int64_t next_power2(int64_t n) {
   return n;
 }
 
+static inline bool is_multiple_of_64(int64_t n) {
+  return (n & 63) == 0;
+}
+
 void bytes_to_bits(const std::vector<uint8_t>& bytes, uint8_t* bits);
 Status bytes_to_bits(const std::vector<uint8_t>&, std::shared_ptr<Buffer>*);
diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc
index bc9c22c10de..703ef8384ac 100644
--- a/cpp/src/arrow/util/buffer.cc
+++ b/cpp/src/arrow/util/buffer.cc
@@ -18,16 +18,32 @@
 #include "arrow/util/buffer.h"
 
 #include <cstring>
+#include <limits>
 
+#include "arrow/util/logging.h"
 #include "arrow/util/memory-pool.h"
 #include "arrow/util/status.h"
 
 namespace arrow {
 
+namespace {
+int64_t RoundUpToMultipleOf64(int64_t num) {
+  DCHECK_GE(num, 0);
+  constexpr int64_t round_to = 64;
+  constexpr int64_t force_carry_addend = round_to - 1;
+  constexpr int64_t truncate_bitmask = ~(round_to - 1);
+  constexpr int64_t max_roundable_num = std::numeric_limits<int64_t>::max() - round_to;
+  if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask; }
+  // handle overflow case. This should result in a malloc error upstream
+  return num;
+}
+}  // namespace
+
 Buffer::Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size) {
   data_ = parent->data() + offset;
   size_ = size;
   parent_ = parent;
+  capacity_ = size;
 }
 
 Buffer::~Buffer() {}
@@ -48,6 +64,7 @@ PoolBuffer::~PoolBuffer() {
 Status PoolBuffer::Reserve(int64_t new_capacity) {
   if (!mutable_data_ || new_capacity > capacity_) {
     uint8_t* new_data;
+    new_capacity = RoundUpToMultipleOf64(new_capacity);
     if (mutable_data_) {
       RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data));
       memcpy(new_data, mutable_data_, size_);
diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h
index 5ef0076953c..f845d67761f 100644
--- a/cpp/src/arrow/util/buffer.h
+++ b/cpp/src/arrow/util/buffer.h
@@ -36,15 +36,23 @@ class Status;
 // Buffer classes
 
 // Immutable API for a chunk of bytes which may or may not be owned by the
-// class instance
+// class instance. Buffers have two related notions of length: size and
+// capacity. Size is the number of bytes that might have valid data.
+// Capacity is the number of bytes that were allocated for the buffer in
+// total.
+// The following invariant is always true: Size <= Capacity
 class Buffer : public std::enable_shared_from_this<Buffer> {
  public:
-  Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {}
+  Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size), capacity_(size) {}
   virtual ~Buffer();
 
   // An offset into data that is owned by another buffer, but we want to be
   // able to retain a valid pointer to it even after other shared_ptr's to the
   // parent buffer have been destroyed
+  //
+  // This method makes no assertions about alignment or padding of the buffer but
+  // in general we expect buffers to be aligned and padded to 64 bytes. In the future
+  // we might add utility methods to help determine if a buffer satisfies this contract.
   Buffer(const std::shared_ptr<Buffer>& parent, int64_t offset, int64_t size);
 
   std::shared_ptr<Buffer> get_shared_ptr() { return shared_from_this(); }
@@ -63,6 +71,7 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
         (data_ == other.data_ || !memcmp(data_, other.data_, size_)));
   }
 
+  int64_t capacity() const { return capacity_; }
   const uint8_t* data() const { return data_; }
 
   int64_t size() const { return size_; }
@@ -76,6 +85,7 @@ class Buffer : public std::enable_shared_from_this<Buffer> {
 
  protected:
   const uint8_t* data_;
   int64_t size_;
+  int64_t capacity_;
 
   // nullptr by default, but may be set
   std::shared_ptr<Buffer> parent_;
@@ -105,18 +115,17 @@ class MutableBuffer : public Buffer {
 class ResizableBuffer : public MutableBuffer {
  public:
   // Change buffer reported size to indicated size, allocating memory if
-  // necessary
+  // necessary. This will ensure that the capacity of the buffer is a multiple
+  // of 64 bytes as defined in Layout.md.
   virtual Status Resize(int64_t new_size) = 0;
 
   // Ensure that buffer has enough memory allocated to fit the indicated
-  // capacity. Does not change buffer's reported size
+  // capacity (and meets the 64 byte padding requirement in Layout.md).
+  // It does not change buffer's reported size.
   virtual Status Reserve(int64_t new_capacity) = 0;
 
  protected:
-  ResizableBuffer(uint8_t* data, int64_t size)
-      : MutableBuffer(data, size), capacity_(size) {}
-
-  int64_t capacity_;
+  ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {}
 };
 
 // A Buffer whose lifetime is tied to a particular MemoryPool
@@ -125,8 +134,8 @@ class PoolBuffer : public ResizableBuffer {
   explicit PoolBuffer(MemoryPool* pool = nullptr);
   virtual ~PoolBuffer();
 
-  virtual Status Resize(int64_t new_size);
-  virtual Status Reserve(int64_t new_capacity);
+  Status Resize(int64_t new_size) override;
+  Status Reserve(int64_t new_capacity) override;
 
  private:
   MemoryPool* pool_;
@@ -138,10 +147,11 @@ class BufferBuilder {
  public:
   explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {}
 
+  // Resizes the buffer to the nearest multiple of 64 bytes per Layout.md
   Status Resize(int32_t elements) {
     if (capacity_ == 0) { buffer_ = std::make_shared<PoolBuffer>(pool_); }
-    capacity_ = elements;
-    RETURN_NOT_OK(buffer_->Resize(capacity_));
+    RETURN_NOT_OK(buffer_->Resize(elements));
+    capacity_ = buffer_->capacity();
     data_ = buffer_->mutable_data();
     return Status::OK();
   }
diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc
index e4600a9bd9b..4ab9736c2b4 100644
--- a/cpp/src/arrow/util/memory-pool-test.cc
+++ b/cpp/src/arrow/util/memory-pool-test.cc
@@ -31,6 +31,7 @@ TEST(DefaultMemoryPool, MemoryTracking) {
 
   uint8_t* data;
   ASSERT_OK(pool->Allocate(100, &data));
+  EXPECT_EQ(0, reinterpret_cast<uint64_t>(data) % 64);
   ASSERT_EQ(100, pool->bytes_allocated());
 
   pool->Free(data, 100);
diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc
index 961554fe06b..0a58e5aa21f 100644
--- a/cpp/src/arrow/util/memory-pool.cc
+++ b/cpp/src/arrow/util/memory-pool.cc
@@ -17,6 +17,7 @@
 
 #include "arrow/util/memory-pool.h"
 
+#include <stdlib.h>
 #include <cstdlib>
 #include <mutex>
 #include <sstream>
@@ -25,6 +26,28 @@
 
 namespace arrow {
 
+namespace {
+// Allocate memory according to the alignment requirements for Arrow
+// (as of May 2016 64 bytes)
+Status AllocateAligned(int64_t size, uint8_t** out) {
+  // TODO(emkornfield) find something compatible with windows
+  constexpr size_t kAlignment = 64;
+  const int result = posix_memalign(reinterpret_cast<void**>(out), kAlignment, size);
+  if (result == ENOMEM) {
+    std::stringstream ss;
+    ss << "malloc of size " << size << " failed";
+    return Status::OutOfMemory(ss.str());
+  }
+
+  if (result == EINVAL) {
+    std::stringstream ss;
+    ss << "invalid alignment parameter: " << kAlignment;
+    return Status::Invalid(ss.str());
+  }
+  return Status::OK();
+}
+}  // namespace
+
 MemoryPool::~MemoryPool() {}
 
 class InternalMemoryPool : public MemoryPool {
@@ -45,13 +68,7 @@ class InternalMemoryPool : public MemoryPool {
 
 Status InternalMemoryPool::Allocate(int64_t size, uint8_t** out) {
   std::lock_guard<std::mutex> guard(pool_lock_);
-  *out = static_cast<uint8_t*>(std::malloc(size));
-  if (*out == nullptr) {
-    std::stringstream ss;
-    ss << "malloc of size " << size << " failed";
-    return Status::OutOfMemory(ss.str());
-  }
-
+  RETURN_NOT_OK(AllocateAligned(size, out));
   bytes_allocated_ += size;
 
   return Status::OK();
diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc
index b39fde92034..5159d86865c 100644
--- a/python/src/pyarrow/adapters/pandas.cc
+++ b/python/src/pyarrow/adapters/pandas.cc
@@ -147,17 +147,12 @@ class ArrowSerializer {
 
   Status ConvertObjectStrings(std::shared_ptr<arrow::Array>* out) {
     PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_));
+    arrow::TypePtr string_type(new arrow::StringType());
+    arrow::StringBuilder string_builder(pool_, string_type);
+    RETURN_ARROW_NOT_OK(string_builder.Resize(length_));
 
-    auto offsets_buffer = std::make_shared<arrow::PoolBuffer>(pool_);
-    RETURN_ARROW_NOT_OK(offsets_buffer->Resize(sizeof(int32_t) * (length_ + 1)));
-    int32_t* offsets = reinterpret_cast<int32_t*>(offsets_buffer->mutable_data());
-
-    arrow::BufferBuilder data_builder(pool_);
     arrow::Status s;
     PyObject* obj;
-    int length;
-    int offset = 0;
-    int64_t null_count = 0;
     for (int64_t i = 0; i < length_; ++i) {
       obj = objects[i];
       if (PyUnicode_Check(obj)) {
@@ -166,38 +161,20 @@ class ArrowSerializer {
           PyErr_Clear();
           return Status::TypeError("failed converting unicode to UTF8");
         }
-        length = PyBytes_GET_SIZE(obj);
-        s = data_builder.Append(
-            reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length);
+        const int32_t length = PyBytes_GET_SIZE(obj);
+        s = string_builder.Append(PyBytes_AS_STRING(obj), length);
         Py_DECREF(obj);
         if (!s.ok()) { return Status::ArrowError(s.ToString()); }
-        util::set_bit(null_bitmap_data_, i);
       } else if (PyBytes_Check(obj)) {
-        length = PyBytes_GET_SIZE(obj);
-        RETURN_ARROW_NOT_OK(data_builder.Append(
-            reinterpret_cast<const uint8_t*>(PyBytes_AS_STRING(obj)), length));
-        util::set_bit(null_bitmap_data_, i);
+        const int32_t length = PyBytes_GET_SIZE(obj);
+        RETURN_ARROW_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length));
       } else {
-        // NULL
-        // No change to offset
-        length = 0;
-        ++null_count;
+        string_builder.AppendNull();
       }
-      offsets[i] = offset;
-      offset += length;
     }
-    // End offset
-    offsets[length_] = offset;
-
-    std::shared_ptr<arrow::Buffer> data_buffer = data_builder.Finish();
-
-    auto values = std::make_shared<arrow::UInt8Array>(data_buffer->size(),
-        data_buffer);
-    *out = std::shared_ptr<arrow::Array>(
-        new arrow::StringArray(length_, offsets_buffer, values, null_count,
-            null_bitmap_));
+    *out = std::shared_ptr<arrow::Array>(string_builder.Finish());
 
     return Status::OK();
   }
From 978de1a94dd451b3142aca0eb95ce410064a2330 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Wed, 18 May 2016 10:15:14 -0700
Subject: [PATCH 074/210] ARROW-204: Add Travis CI builds that post conda
 artifacts for Linux and OS X

I tested this on my fork of Arrow, but ultimately we'll see if it works when
the commit hits master. I've arranged it so that packaging issues won't fail
the build.

Author: Wes McKinney

Closes #79 from wesm/ARROW-204 and squashes the following commits:

afd0582 [Wes McKinney] Change encrypted token to apache/arrow, only upload on commits to master
58955e5 [Wes McKinney] Draft of automated conda builds for libarrow, pyarrow.
Remove unneeded thrift-cpp build dependency --- .travis.yml | 27 ++++++++++++++++++- ci/travis_conda_build.sh | 53 ++++++++++++++++++++++++++++++++++++++ cpp/conda.recipe/build.sh | 15 ++++++++++- cpp/conda.recipe/meta.yaml | 5 ++-- 4 files changed, 95 insertions(+), 5 deletions(-) create mode 100755 ci/travis_conda_build.sh diff --git a/.travis.yml b/.travis.yml index a0138a79598..646f80fee7b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ sudo: required -dist: precise +dist: precise addons: apt: sources: @@ -18,6 +18,9 @@ addons: - valgrind matrix: + fast_finish: true + allow_failures: + - env: ARROW_TEST_GROUP=packaging include: - compiler: gcc language: cpp @@ -39,6 +42,24 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh + - compiler: gcc + env: ARROW_TEST_GROUP=packaging + os: linux + before_script: + - export CC="gcc-4.9" + - export CXX="g++-4.9" + script: + - $TRAVIS_BUILD_DIR/ci/travis_conda_build.sh + - os: osx + env: ARROW_TEST_GROUP=packaging + language: objective-c + osx_image: xcode6.4 + compiler: clang + addons: + before_script: + before_install: + script: + - $TRAVIS_BUILD_DIR/ci/travis_conda_build.sh before_install: - ulimit -c unlimited -S @@ -51,3 +72,7 @@ after_script: after_failure: - COREFILE=$(find . -maxdepth 2 -name "core*" | head -n 1) - if [[ -f "$COREFILE" ]]; then gdb -c "$COREFILE" example -ex "thread apply all bt" -ex "set pagination 0" -batch; fi + +env: + global: + - secure: "GcrPtsKUCgNY7HKYjWlHQo8SiFrShDvdZSU8t1m1FJrE+UfK0Dgh9zXmAausM8GmhqSwkF0q4UbLQf2uCnSITWKeEPAL8Mo9eu4ib+ikJx/b3Sk81frgW5ADoHfW1Eyqd8xJNIMwMegJOtRLSDqiXh1CvMlKnY8PyTOGM2DgN9ona/v6p9OFH9Qs0JhBRVXAn0S4ztjumck8E56+01hqRfxbZ88pTfpKghBxYp9PJaMjtGdomjVWlqPaWaWJj+KptT8inV9NK+TVYKx0dXWD+S1Vgr1PytQnLdILOYV23gsOBYqn33ByF/yADl4m3hUjU/qeT0Fi7aWxmVpj+oTJISOSH5N8nIsuNH8mQk2ZzzXHfV7btFvP+cOPRczadoKkT6D6cHA8nQ7b0dphC6bl6SAeSfc/cbhRT+fYnIjg8jFXC8jlyWBr7LR6GXVpc0bND7i300ITo0FuRJhy2OxqPtGo3dKLE7eAcv78tuO0OYJ/kol1PEqFdFkbYbNVbg/cFpbGqiCXDsOtPDbAGBv69YnXdVowSxxs8cRGjSkDydv6ZSytb/Zd4lH/KAomcFNk8adx12O1Lk4sbmVav1cGig5P6OcQKS0jC5IiRb4THcQzVzAkXXbaafKm5sru/NoYxhzmkyhkOc11nTYHKVng+XKWzLCNn7pTTSLitp5+xa4=" diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh new file mode 100755 index 00000000000..afa531dbd6b --- /dev/null +++ b/ci/travis_conda_build.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +set -e + +if [ $TRAVIS_OS_NAME == "linux" ]; then + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" +else + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" +fi + +wget -O miniconda.sh $MINICONDA_URL +MINICONDA=$TRAVIS_BUILD_DIR/miniconda +bash miniconda.sh -b -p $MINICONDA +export PATH="$MINICONDA/bin:$PATH" +conda update -y -q conda +conda info -a + +conda config --set show_channel_urls yes +conda config --add channels conda-forge +conda config --add channels apache + +conda install --yes conda-build jinja2 anaconda-client + +# faster builds, please +conda install -y nomkl + +# Build libarrow + +cd $TRAVIS_BUILD_DIR/cpp + +conda build conda.recipe --channel apache/channel/dev +CONDA_PACKAGE=`conda build --output conda.recipe | grep bz2` + +if [ $TRAVIS_BRANCH == "master" ] && [ $TRAVIS_PULL_REQUEST == "false" ]; then + anaconda --token $ANACONDA_TOKEN upload $CONDA_PACKAGE --user apache --channel dev; +fi + +# Build pyarrow + +cd $TRAVIS_BUILD_DIR/python + +build_for_python_version() { + PY_VERSION=$1 + conda build conda.recipe --python $PY_VERSION --channel 
apache/channel/dev + CONDA_PACKAGE=`conda build --python $PY_VERSION --output conda.recipe | grep bz2` + + if [ $TRAVIS_BRANCH == "master" ] && [ $TRAVIS_PULL_REQUEST == "false" ]; then + anaconda --token $ANACONDA_TOKEN upload $CONDA_PACKAGE --user apache --channel dev; + fi +} + +build_for_python_version 2.7 +build_for_python_version 3.5 diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh index ac1f9c89cc9..b10dd03349b 100644 --- a/cpp/conda.recipe/build.sh +++ b/cpp/conda.recipe/build.sh @@ -9,6 +9,19 @@ cd $RECIPE_DIR export FLATBUFFERS_HOME=$PREFIX export PARQUET_HOME=$PREFIX +if [ "$(uname)" == "Darwin" ]; then + # C++11 finagling for Mac OSX + export CC=clang + export CXX=clang++ + export MACOSX_VERSION_MIN="10.7" + CXXFLAGS="${CXXFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + CXXFLAGS="${CXXFLAGS} -stdlib=libc++ -std=c++11" + export LDFLAGS="${LDFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + export LDFLAGS="${LDFLAGS} -stdlib=libc++ -std=c++11" + export LINKFLAGS="${LDFLAGS}" + export MACOSX_DEPLOYMENT_TARGET=10.7 +fi + cd .. rm -rf conda-build @@ -33,7 +46,7 @@ elif [ `uname` == Darwin ]; then fi cmake \ - -DCMAKE_BUILD_TYPE=debug \ + -DCMAKE_BUILD_TYPE=release \ -DCMAKE_INSTALL_PREFIX=$PREFIX \ -DCMAKE_SHARED_LINKER_FLAGS=$SHARED_LINKER_FLAGS \ -DARROW_IPC=on \ diff --git a/cpp/conda.recipe/meta.yaml b/cpp/conda.recipe/meta.yaml index 2e834d5cbf8..75f3a8ba3d9 100644 --- a/cpp/conda.recipe/meta.yaml +++ b/cpp/conda.recipe/meta.yaml @@ -15,15 +15,14 @@ requirements: - cmake - flatbuffers - parquet-cpp - - thrift-cpp run: - parquet-cpp test: commands: - - test -f $PREFIX/lib/libarrow.so - - test -f $PREFIX/lib/libarrow_parquet.so + - test -f $PREFIX/lib/libarrow.so # [linux] + - test -f $PREFIX/lib/libarrow_parquet.so # [linux] - test -f $PREFIX/include/arrow/api.h about: From e0fb3698e5602bccaee232d4c259b3df089886e6 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 18 May 2016 10:49:04 -0700 Subject: [PATCH 075/210] ARROW-201: [C++] Initial ParquetWriter implementation Author: Uwe L. Korn Closes #78 from xhochy/arrow-201 and squashes the following commits: 5d95099 [Uwe L. Korn] Add check for flat column 88ae3ca [Uwe L. Korn] Install arrow_parquet headers f81021b [Uwe L. Korn] Incorporate reader comments ba240e8 [Uwe L. Korn] Incorporate writer comments 2179c0e [Uwe L. Korn] Infer c-type from ArrowType efd46fb [Uwe L. Korn] Infer c-type from ArrowType 77386ea [Uwe L. Korn] Templatize test functions 1aa7698 [Uwe L. Korn] Add comment to helper function 8fdd4c8 [Uwe L. Korn] Parameterize schema creation 8e8d7d7 [Uwe L. 
Korn] ARROW-201: [C++] Initial ParquetWriter implementation --- cpp/src/arrow/parquet/CMakeLists.txt | 6 +- cpp/src/arrow/parquet/parquet-io-test.cc | 222 +++++++++++++++++++ cpp/src/arrow/parquet/parquet-reader-test.cc | 116 ---------- cpp/src/arrow/parquet/reader.cc | 79 ++++--- cpp/src/arrow/parquet/writer.cc | 148 +++++++++++++ cpp/src/arrow/parquet/writer.h | 59 +++++ 6 files changed, 485 insertions(+), 145 deletions(-) create mode 100644 cpp/src/arrow/parquet/parquet-io-test.cc delete mode 100644 cpp/src/arrow/parquet/parquet-reader-test.cc create mode 100644 cpp/src/arrow/parquet/writer.cc create mode 100644 cpp/src/arrow/parquet/writer.h diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index cd6f05d6b5f..c00cc9f0f25 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -21,6 +21,7 @@ set(PARQUET_SRCS reader.cc schema.cc + writer.cc ) set(PARQUET_LIBS @@ -37,14 +38,15 @@ SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) ADD_ARROW_TEST(parquet-schema-test) ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) -ADD_ARROW_TEST(parquet-reader-test) -ARROW_TEST_LINK_LIBRARIES(parquet-reader-test arrow_parquet) +ADD_ARROW_TEST(parquet-io-test) +ARROW_TEST_LINK_LIBRARIES(parquet-io-test arrow_parquet) # Headers: top level install(FILES reader.h schema.h utils.h + writer.h DESTINATION include/arrow/parquet) install(TARGETS arrow_parquet diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc new file mode 100644 index 00000000000..845574d2c53 --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
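// A minimal sketch of the builder pattern the test helpers below rely on
// (illustration only; assumes the NumericBuilder API from earlier patches):
//
//   NumericBuilder<Int64Type> builder(default_memory_pool(),
//                                     std::make_shared<Int64Type>());
//   builder.Append(values.data(), values.size());
//   std::shared_ptr<Array> array = builder.Finish();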
+
+#include "gtest/gtest.h"
+
+#include "arrow/test-util.h"
+#include "arrow/parquet/reader.h"
+#include "arrow/parquet/writer.h"
+#include "arrow/types/primitive.h"
+#include "arrow/util/memory-pool.h"
+#include "arrow/util/status.h"
+
+#include "parquet/api/reader.h"
+#include "parquet/api/writer.h"
+
+using ParquetBuffer = parquet::Buffer;
+using parquet::BufferReader;
+using parquet::InMemoryOutputStream;
+using parquet::ParquetFileReader;
+using parquet::ParquetFileWriter;
+using parquet::RandomAccessSource;
+using parquet::Repetition;
+using parquet::SchemaDescriptor;
+using ParquetType = parquet::Type;
+using parquet::schema::GroupNode;
+using parquet::schema::NodePtr;
+using parquet::schema::PrimitiveNode;
+
+namespace arrow {
+
+namespace parquet {
+
+template <typename ArrowType>
+std::shared_ptr<PrimitiveArray> NonNullArray(
+    size_t size, typename ArrowType::c_type value) {
+  std::vector<typename ArrowType::c_type> values(size, value);
+  NumericBuilder<ArrowType> builder(default_memory_pool(), std::make_shared<ArrowType>());
+  builder.Append(values.data(), values.size());
+  return std::static_pointer_cast<PrimitiveArray>(builder.Finish());
+}
+
+// This helper function only supports (size/2) nulls yet.
+template <typename ArrowType>
+std::shared_ptr<PrimitiveArray> NullableArray(
+    size_t size, typename ArrowType::c_type value, size_t num_nulls) {
+  std::vector<typename ArrowType::c_type> values(size, value);
+  std::vector<uint8_t> valid_bytes(size, 1);
+
+  for (size_t i = 0; i < num_nulls; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+
+  NumericBuilder<ArrowType> builder(default_memory_pool(), std::make_shared<ArrowType>());
+  builder.Append(values.data(), values.size(), valid_bytes.data());
+  return std::static_pointer_cast<PrimitiveArray>(builder.Finish());
+}
+
+class TestParquetIO : public ::testing::Test {
+ public:
+  virtual void SetUp() {}
+
+  std::shared_ptr<GroupNode> Schema(
+      ParquetType::type parquet_type, Repetition::type repetition) {
+    auto pnode = PrimitiveNode::Make("column1", repetition, parquet_type);
+    NodePtr node_ =
+        GroupNode::Make("schema", Repetition::REQUIRED, std::vector<NodePtr>({pnode}));
+    return std::static_pointer_cast<GroupNode>(node_);
+  }
+
+  std::unique_ptr<ParquetFileWriter> MakeWriter(std::shared_ptr<GroupNode>& schema) {
+    sink_ = std::make_shared<InMemoryOutputStream>();
+    return ParquetFileWriter::Open(sink_, schema);
+  }
+
+  std::unique_ptr<ParquetFileReader> ReaderFromSink() {
+    std::shared_ptr<ParquetBuffer> buffer = sink_->GetBuffer();
+    std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
+    return ParquetFileReader::Open(std::move(source));
+  }
+
+  void ReadSingleColumnFile(
+      std::unique_ptr<ParquetFileReader> file_reader, std::shared_ptr<Array>* out) {
+    arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
+    std::unique_ptr<FlatColumnReader> column_reader;
+    ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader)));
+    ASSERT_NE(nullptr, column_reader.get());
+    ASSERT_OK(column_reader->NextBatch(100, out));
+    ASSERT_NE(nullptr, out->get());
+  }
+
+  std::unique_ptr<ParquetFileReader> Int64File(
+      std::vector<int64_t>& values, int num_chunks) {
+    std::shared_ptr<GroupNode> schema = Schema(ParquetType::INT64, Repetition::REQUIRED);
+    std::unique_ptr<ParquetFileWriter> file_writer = MakeWriter(schema);
+    size_t chunk_size = values.size() / num_chunks;
+    for (int i = 0; i < num_chunks; i++) {
+      auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
+      auto column_writer =
+          static_cast<::parquet::Int64Writer*>(row_group_writer->NextColumn());
+      int64_t* data = values.data() + i * chunk_size;
+      column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
+      column_writer->Close();
+      row_group_writer->Close();
+    }
+    file_writer->Close();
+    return ReaderFromSink();
+  }
+
+ private:
+  std::shared_ptr<InMemoryOutputStream> sink_;
+};
+
+TEST_F(TestParquetIO, SingleColumnInt64Read) {
+  std::vector<int64_t> values(100, 128);
+  std::unique_ptr<ParquetFileReader> file_reader = Int64File(values, 1);
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(std::move(file_reader), &out);
+
+  Int64Array* out_array = static_cast<Int64Array*>(out.get());
+  for (size_t i = 0; i < values.size(); i++) {
+    EXPECT_EQ(values[i], out_array->raw_data()[i]);
+  }
+}
+
+TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) {
+  std::vector<int64_t> values(100, 128);
+  std::unique_ptr<ParquetFileReader> file_reader = Int64File(values, 4);
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(std::move(file_reader), &out);
+
+  Int64Array* out_array = static_cast<Int64Array*>(out.get());
+  for (size_t i = 0; i < values.size(); i++) {
+    EXPECT_EQ(values[i], out_array->raw_data()[i]);
+  }
+}
+
+TEST_F(TestParquetIO, SingleColumnInt64Write) {
+  std::shared_ptr<PrimitiveArray> values = NonNullArray<Int64Type>(100, 128);
+
+  std::shared_ptr<GroupNode> schema = Schema(ParquetType::INT64, Repetition::REQUIRED);
+  FileWriter writer(default_memory_pool(), MakeWriter(schema));
+  ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length())));
+  ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get())));
+  ASSERT_NO_THROW(ASSERT_OK(writer.Close()));
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(ReaderFromSink(), &out);
+  ASSERT_TRUE(values->Equals(out));
+}
+
+TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) {
+  // This also tests max_definition_level = 1
+  std::shared_ptr<PrimitiveArray> values = NullableArray<DoubleType>(100, 128, 10);
+
+  std::shared_ptr<GroupNode> schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL);
+  FileWriter writer(default_memory_pool(), MakeWriter(schema));
+  ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length())));
+  ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get())));
+  ASSERT_NO_THROW(ASSERT_OK(writer.Close()));
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(ReaderFromSink(), &out);
+  ASSERT_TRUE(values->Equals(out));
+}
+
+TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) {
+  std::shared_ptr<PrimitiveArray> values = NonNullArray<Int64Type>(100, 128);
+  std::shared_ptr<PrimitiveArray> values_chunk = NonNullArray<Int64Type>(25, 128);
+
+  std::shared_ptr<GroupNode> schema = Schema(ParquetType::INT64, Repetition::REQUIRED);
+  FileWriter writer(default_memory_pool(), MakeWriter(schema));
+  for (int i = 0; i < 4; i++) {
+    ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length())));
+    ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get())));
+  }
+  ASSERT_NO_THROW(ASSERT_OK(writer.Close()));
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(ReaderFromSink(), &out);
+  ASSERT_TRUE(values->Equals(out));
+}
+
+TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) {
+  std::shared_ptr<PrimitiveArray> values = NullableArray<DoubleType>(100, 128, 10);
+  std::shared_ptr<PrimitiveArray> values_chunk_nulls =
+      NullableArray<DoubleType>(25, 128, 10);
+  std::shared_ptr<PrimitiveArray> values_chunk = NullableArray<DoubleType>(25, 128, 0);
+
+  std::shared_ptr<GroupNode> schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL);
+  FileWriter writer(default_memory_pool(), MakeWriter(schema));
+  ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length())));
+  ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get())));
+  for (int i = 0; i < 3; i++) {
+    ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length())));
+    ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get())));
+  }
+  ASSERT_NO_THROW(ASSERT_OK(writer.Close()));
+
+  std::shared_ptr<Array> out;
+  ReadSingleColumnFile(ReaderFromSink(), &out);
+  ASSERT_TRUE(values->Equals(out));
+}
+
+}  // namespace parquet
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/parquet/parquet-reader-test.cc b/cpp/src/arrow/parquet/parquet-reader-test.cc
deleted file mode 100644
index a7fc2a89f5f..00000000000
--- a/cpp/src/arrow/parquet/parquet-reader-test.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "gtest/gtest.h"
-
-#include "arrow/test-util.h"
-#include "arrow/parquet/reader.h"
-#include "arrow/types/primitive.h"
-#include "arrow/util/memory-pool.h"
-#include "arrow/util/status.h"
-
-#include "parquet/api/reader.h"
-#include "parquet/api/writer.h"
-
-using ParquetBuffer = parquet::Buffer;
-using parquet::BufferReader;
-using parquet::InMemoryOutputStream;
-using parquet::Int64Writer;
-using parquet::ParquetFileReader;
-using parquet::ParquetFileWriter;
-using parquet::RandomAccessSource;
-using parquet::Repetition;
-using parquet::SchemaDescriptor;
-using ParquetType = parquet::Type;
-using parquet::schema::GroupNode;
-using parquet::schema::NodePtr;
-using parquet::schema::PrimitiveNode;
-
-namespace arrow {
-
-namespace parquet {
-
-class TestReadParquet : public ::testing::Test {
- public:
-  virtual void SetUp() {}
-
-  std::shared_ptr<GroupNode> Int64Schema() {
-    auto pnode = PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64);
-    NodePtr node_ =
-        GroupNode::Make("schema", Repetition::REQUIRED, std::vector<NodePtr>({pnode}));
-    return std::static_pointer_cast<GroupNode>(node_);
-  }
-
-  std::unique_ptr<ParquetFileReader> Int64File(
-      std::vector<int64_t>& values, int num_chunks) {
-    std::shared_ptr<GroupNode> schema = Int64Schema();
-    std::shared_ptr<InMemoryOutputStream> sink(new InMemoryOutputStream());
-    auto file_writer = ParquetFileWriter::Open(sink, schema);
-    size_t chunk_size = values.size() / num_chunks;
-    for (int i = 0; i < num_chunks; i++) {
-      auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
-      auto column_writer = static_cast<Int64Writer*>(row_group_writer->NextColumn());
-      int64_t* data = values.data() + i * chunk_size;
-      column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
-      column_writer->Close();
-      row_group_writer->Close();
-    }
-    file_writer->Close();
-
-    std::shared_ptr<ParquetBuffer> buffer = sink->GetBuffer();
-    std::unique_ptr<RandomAccessSource> source(new BufferReader(buffer));
-    return ParquetFileReader::Open(std::move(source));
-  }
-
- private:
-};
-
-TEST_F(TestReadParquet, SingleColumnInt64) {
-  std::vector<int64_t> values(100, 128);
-  std::unique_ptr<ParquetFileReader> file_reader = Int64File(values, 1);
-  arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
-  std::unique_ptr<FlatColumnReader> column_reader;
-  ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader)));
-  ASSERT_NE(nullptr, column_reader.get());
-  std::shared_ptr<Array> out;
-  ASSERT_OK(column_reader->NextBatch(100, &out));
-  ASSERT_NE(nullptr, out.get());
-  Int64Array* out_array = static_cast<Int64Array*>(out.get());
-  for (size_t i = 0; i < values.size(); i++) {
-    EXPECT_EQ(values[i], out_array->raw_data()[i]);
-  }
-}
-
-TEST_F(TestReadParquet, SingleColumnInt64Chunked) {
-  std::vector<int64_t> values(100, 128);
-  std::unique_ptr<ParquetFileReader> file_reader = Int64File(values, 4);
-  arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
-  std::unique_ptr<FlatColumnReader> column_reader;
-  ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader)));
-  ASSERT_NE(nullptr, column_reader.get());
-  std::shared_ptr<Array> out;
-  ASSERT_OK(column_reader->NextBatch(100, &out));
-  ASSERT_NE(nullptr, out.get());
-  Int64Array* out_array = static_cast<Int64Array*>(out.get());
-  for (size_t i = 0; i < values.size(); i++) {
-    EXPECT_EQ(values[i], out_array->raw_data()[i]);
-  }
-}
-
-}  // namespace parquet
-
-}  // namespace arrow
diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc
index 481ded5789a..346de253606 100644
--- a/cpp/src/arrow/parquet/reader.cc
+++ b/cpp/src/arrow/parquet/reader.cc
@@ -26,6 +26,7 @@
 #include "arrow/util/status.h"
 
 using parquet::ColumnReader;
+using parquet::Repetition;
 using parquet::TypedColumnReader;
 
 namespace arrow {
@@ -36,6 +37,7 @@ class FileReader::Impl {
   Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader);
   virtual ~Impl() {}
 
+  bool CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr);
   Status GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out);
   Status ReadFlatColumn(int i, std::shared_ptr<Array>* out);
@@ -51,7 +53,7 @@ class FlatColumnReader::Impl {
   virtual ~Impl() {}
 
   Status NextBatch(int batch_size, std::shared_ptr<Array>* out);
-  template <typename ArrowType, typename ParquetType, typename CType>
+  template <typename ArrowType, typename ParquetType>
   Status TypedReadBatch(int batch_size, std::shared_ptr<Array>* out);
 
  private:
@@ -67,14 +69,28 @@ class FlatColumnReader::Impl {
 
   PoolBuffer values_buffer_;
   PoolBuffer def_levels_buffer_;
-  PoolBuffer rep_levels_buffer_;
+  PoolBuffer values_builder_buffer_;
+  PoolBuffer valid_bytes_buffer_;
 };
 
 FileReader::Impl::Impl(
     MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
     : pool_(pool), reader_(std::move(reader)) {}
 
+bool FileReader::Impl::CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr) {
+  if ((descr->max_repetition_level() > 0) || (descr->max_definition_level() > 1)) {
+    return false;
+  } else if ((descr->max_definition_level() == 1) &&
+             (descr->schema_node()->repetition() != Repetition::OPTIONAL)) {
+    return false;
+  }
+  return true;
+}
+
 Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out) {
+  if (!CheckForFlatColumn(reader_->descr()->Column(i))) {
+    return Status::Invalid("The requested column is not flat");
+  }
   std::unique_ptr<FlatColumnReader::Impl> impl(
       new FlatColumnReader::Impl(pool_, reader_->descr()->Column(i), reader_.get(), i));
   *out = std::unique_ptr<FlatColumnReader>(new FlatColumnReader(std::move(impl)));
@@ -109,37 +125,50 @@ FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor
       column_index_(column_index),
       next_row_group_(0),
       values_buffer_(pool),
-      def_levels_buffer_(pool),
-      rep_levels_buffer_(pool) {
+      def_levels_buffer_(pool) {
   NodeToField(descr_->schema_node(), &field_);
   NextRowGroup();
 }
 
-template <typename ArrowType, typename ParquetType, typename CType>
+template <typename ArrowType, typename ParquetType>
 Status FlatColumnReader::Impl::TypedReadBatch(
     int batch_size, std::shared_ptr<Array>* out) {
   int values_to_read = batch_size;
   NumericBuilder<ArrowType> builder(pool_, field_->type);
   while ((values_to_read > 0) && column_reader_) {
-    values_buffer_.Resize(values_to_read * sizeof(CType));
+    values_buffer_.Resize(values_to_read * sizeof(typename ParquetType::c_type));
     if (descr_->max_definition_level() > 0) {
       def_levels_buffer_.Resize(values_to_read * sizeof(int16_t));
     }
-    if (descr_->max_repetition_level() > 0) {
-      rep_levels_buffer_.Resize(values_to_read * sizeof(int16_t));
-    }
     auto reader = dynamic_cast<TypedColumnReader<ParquetType>*>(column_reader_.get());
     int64_t values_read;
-    CType* values = reinterpret_cast<CType*>(values_buffer_.mutable_data());
-    PARQUET_CATCH_NOT_OK(
-        values_to_read -= reader->ReadBatch(values_to_read,
-            reinterpret_cast<int16_t*>(def_levels_buffer_.mutable_data()),
-            reinterpret_cast<int16_t*>(rep_levels_buffer_.mutable_data()), values,
-            &values_read));
+    int64_t levels_read;
+    int16_t* def_levels = reinterpret_cast<int16_t*>(def_levels_buffer_.mutable_data());
+    auto values =
+        reinterpret_cast<typename ParquetType::c_type*>(values_buffer_.mutable_data());
+    PARQUET_CATCH_NOT_OK(levels_read = reader->ReadBatch(
+                             values_to_read, def_levels, nullptr, values, &values_read));
+    values_to_read -= levels_read;
     if (descr_->max_definition_level() == 0) {
       RETURN_NOT_OK(builder.Append(values, values_read));
     } else {
-      return Status::NotImplemented("no support for definition levels yet");
+      // descr_->max_definition_level() == 1
+      RETURN_NOT_OK(values_builder_buffer_.Resize(
+          levels_read * sizeof(typename ParquetType::c_type)));
+      RETURN_NOT_OK(valid_bytes_buffer_.Resize(levels_read * sizeof(uint8_t)));
+      auto values_ptr = reinterpret_cast<typename ParquetType::c_type*>(
+          values_builder_buffer_.mutable_data());
+      uint8_t* valid_bytes = valid_bytes_buffer_.mutable_data();
+      int values_idx = 0;
+      for (int64_t i = 0; i < levels_read; i++) {
+        if (def_levels[i] < descr_->max_definition_level()) {
+          valid_bytes[i] = 0;
+        } else {
+          valid_bytes[i] = 1;
+          values_ptr[i] = values[values_idx++];
+        }
+      }
+      builder.Append(values_ptr, levels_read, valid_bytes);
     }
     if (!column_reader_->HasNext()) { NextRowGroup(); }
   }
@@ -147,9 +176,9 @@ Status FlatColumnReader::Impl::TypedReadBatch(
   return Status::OK();
 }
 
-#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType, CType)              \
-  case Type::ENUM:                                                         \
-    return TypedReadBatch<ArrowType, ParquetType, CType>(batch_size, out); \
-    break;
+#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType)              \
+  case Type::ENUM:                                                  \
+    return TypedReadBatch<ArrowType, ParquetType>(batch_size, out); \
+    break;
 
 Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>* out) {
@@ -159,15 +188,11 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>*
     return Status::OK();
   }
 
-  if (descr_->max_repetition_level() > 0) {
-    return Status::NotImplemented("no support for repetition yet");
-  }
-
   switch (field_->type->type) {
-    TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type, int32_t)
-    TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type, int64_t)
-    TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType, float)
-    TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType, double)
+    TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type)
+    TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type)
+    TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
+    TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
     default:
       return Status::NotImplemented(field_->type->ToString());
   }
diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc
new file mode 100644
index 00000000000..3ad2c5b0735
--- /dev/null
+++ b/cpp/src/arrow/parquet/writer.cc
@@ -0,0 +1,148 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/parquet/writer.h"
+
+#include "arrow/array.h"
+#include "arrow/types/primitive.h"
+#include "arrow/parquet/utils.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+
+namespace parquet {
+
+class FileWriter::Impl {
+ public:
+  Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer);
+
+  Status NewRowGroup(int64_t chunk_size);
+  template <typename ParquetType>
+  Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data);
+  Status WriteFlatColumnChunk(const PrimitiveArray* data);
+  Status Close();
+
+  virtual ~Impl() {}
+
+ private:
+  MemoryPool* pool_;
+  PoolBuffer data_buffer_;
+  PoolBuffer def_levels_buffer_;
+  std::unique_ptr<::parquet::ParquetFileWriter> writer_;
+  ::parquet::RowGroupWriter* row_group_writer_;
+};
+
+FileWriter::Impl::Impl(
+    MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer)
+    : pool_(pool),
+      data_buffer_(pool),
+      writer_(std::move(writer)),
+      row_group_writer_(nullptr) {}
+
+Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) {
+  if (row_group_writer_ != nullptr) { PARQUET_CATCH_NOT_OK(row_group_writer_->Close()); }
+  PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup(chunk_size));
+  return Status::OK();
+}
+
+template <typename ParquetType>
+Status FileWriter::Impl::TypedWriteBatch(
+    ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data) {
+  auto data_ptr =
+      reinterpret_cast<const typename ParquetType::c_type*>(data->data()->data());
+  auto writer =
+      reinterpret_cast<::parquet::TypedColumnWriter<ParquetType>*>(column_writer);
+  if (writer->descr()->max_definition_level() == 0) {
+    // no nulls, just dump the data
+    PARQUET_CATCH_NOT_OK(writer->WriteBatch(data->length(), nullptr, nullptr, data_ptr));
+  } else if (writer->descr()->max_definition_level() == 1) {
+    RETURN_NOT_OK(def_levels_buffer_.Resize(data->length() * sizeof(int16_t)));
+    int16_t* def_levels_ptr =
+        reinterpret_cast<int16_t*>(def_levels_buffer_.mutable_data());
+    if (data->null_count() == 0) {
+      std::fill(def_levels_ptr, def_levels_ptr + data->length(), 1);
+      PARQUET_CATCH_NOT_OK(
+          writer->WriteBatch(data->length(), def_levels_ptr, nullptr, data_ptr));
+    } else {
+      RETURN_NOT_OK(data_buffer_.Resize(
+          (data->length() - data->null_count()) * sizeof(typename ParquetType::c_type)));
+      auto buffer_ptr =
+          reinterpret_cast<typename ParquetType::c_type*>(data_buffer_.mutable_data());
+      int buffer_idx = 0;
+      for (size_t i = 0; i < data->length(); i++) {
+        if (data->IsNull(i)) {
+          def_levels_ptr[i] = 0;
+        } else {
+          def_levels_ptr[i] = 1;
+          buffer_ptr[buffer_idx++] = data_ptr[i];
+        }
+      }
+      PARQUET_CATCH_NOT_OK(
+          writer->WriteBatch(data->length(), def_levels_ptr, nullptr, buffer_ptr));
+    }
+  } else {
+    return Status::NotImplemented("no support for max definition level > 1 yet");
+  }
+  PARQUET_CATCH_NOT_OK(writer->Close());
+  return Status::OK();
+}
+
+Status FileWriter::Impl::Close() {
+  if (row_group_writer_ != nullptr) { PARQUET_CATCH_NOT_OK(row_group_writer_->Close()); }
+  PARQUET_CATCH_NOT_OK(writer_->Close());
+  return Status::OK();
+}
+
+#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \
+  case Type::ENUM:                                     \
+    return TypedWriteBatch<ParquetType>(writer, data); \
+    break;
+
+Status FileWriter::Impl::WriteFlatColumnChunk(const PrimitiveArray* data) {
FileWriter::Impl::WriteFlatColumnChunk(const PrimitiveArray* data) {
+  ::parquet::ColumnWriter* writer;
+  PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn());
+  switch (data->type_enum()) {
+    TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type)
+    TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type)
+    TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
+    TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
+    default:
+      return Status::NotImplemented(data->type()->ToString());
+  }
+}
+
+FileWriter::FileWriter(
+    MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer)
+    : impl_(new FileWriter::Impl(pool, std::move(writer))) {}
+
+Status FileWriter::NewRowGroup(int64_t chunk_size) {
+  return impl_->NewRowGroup(chunk_size);
+}
+
+Status FileWriter::WriteFlatColumnChunk(const PrimitiveArray* data) {
+  return impl_->WriteFlatColumnChunk(data);
+}
+
+Status FileWriter::Close() {
+  return impl_->Close();
+}
+
+FileWriter::~FileWriter() {}
+
+}  // namespace parquet
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h
new file mode 100644
index 00000000000..38f7d0b3a89
--- /dev/null
+++ b/cpp/src/arrow/parquet/writer.h
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_PARQUET_WRITER_H
+#define ARROW_PARQUET_WRITER_H
+
+#include <memory>
+
+#include "parquet/api/schema.h"
+#include "parquet/api/writer.h"
+
+namespace arrow {
+
+class MemoryPool;
+class PrimitiveArray;
+class RowBatch;
+class Status;
+
+namespace parquet {
+
+/**
+ * Iterative API:
+ *  Start a new RowGroup/Chunk with NewRowGroup
+ *  Write column-by-column the whole column chunk
+ */
+class FileWriter {
+ public:
+  FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer);
+
+  Status NewRowGroup(int64_t chunk_size);
+  Status WriteFlatColumnChunk(const PrimitiveArray* data);
+  Status Close();
+
+  virtual ~FileWriter();
+
+ private:
+  class Impl;
+  std::unique_ptr<Impl> impl_;
+};
+
+}  // namespace parquet
+
+}  // namespace arrow
+
+#endif  // ARROW_PARQUET_WRITER_H

From c0985a47665f8ce8847a6a0215e6e3c0f1db28f4 Mon Sep 17 00:00:00 2001
From: Laurent Goujon
Date: Mon, 18 Apr 2016 11:07:22 -0700
Subject: [PATCH 076/210] Make BaseValueVector#MAX_ALLOCATION_SIZE configurable

This closes #65

Some of the tests are based on the assumption that the JVM can allocate
at least 2GB of memory, which is not a common occurrence (the JVM usually
defaults to 512MB). The current Travis CI VMs only have 3GB of memory in
total, which would have made it challenging to run some of the tests on
them.

Add a system property to change BaseValueVector.MAX_ALLOCATION_SIZE,
allowing a much smaller value to be used during tests.
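For reviewers, a sketch of how the new knob behaves (my own illustration, not
part of this patch; the property name and the Integer.getInteger lookup are
taken from the BaseValueVector change below, and 33554432 is the 32MB value
the tests use):

```
// Hypothetical stand-alone example. MAX_ALLOCATION_SIZE is a static final
// field, so Integer.getInteger runs once at class-initialization time; the
// property must therefore be set before BaseValueVector is first loaded,
// e.g. with -Darrow.vector.max_allocation_bytes=33554432 on the JVM
// command line or in surefire's argLine.
public class MaxAllocationSizeExample {
  public static void main(String[] args) {
    // Same lookup pattern as the patch: use the system property when it is
    // set to a parseable int, otherwise fall back to Integer.MAX_VALUE.
    final int maxAllocation =
        Integer.getInteger("arrow.vector.max_allocation_bytes", Integer.MAX_VALUE);
    System.out.println("Effective MAX_ALLOCATION_SIZE: " + maxAllocation);
  }
}
```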
--- .../apache/arrow/vector/BaseValueVector.java | 14 ++++---- .../apache/arrow/vector/TestValueVector.java | 36 +++++++++++++++---- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 8bca3c00537..932e6f13caf 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -17,23 +17,24 @@ */ package org.apache.arrow.vector; -import io.netty.buffer.ArrowBuf; - import java.util.Iterator; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterators; - import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.types.MaterializedField; import org.apache.arrow.vector.util.TransferPair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterators; + +import io.netty.buffer.ArrowBuf; + public abstract class BaseValueVector implements ValueVector { private static final Logger logger = LoggerFactory.getLogger(BaseValueVector.class); - public static final int MAX_ALLOCATION_SIZE = Integer.MAX_VALUE; + public static final String MAX_ALLOCATION_SIZE_PROPERTY = "arrow.vector.max_allocation_bytes"; + public static final int MAX_ALLOCATION_SIZE = Integer.getInteger(MAX_ALLOCATION_SIZE_PROPERTY, Integer.MAX_VALUE); public static final int INITIAL_VALUE_ALLOCATION = 4096; protected final BufferAllocator allocator; @@ -99,6 +100,7 @@ protected BaseMutator() { } public void generateTestData(int values) {} //TODO: consider making mutator stateless(if possible) on another issue. + @Override public void reset() {} } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index ac3eebe98ea..b5c4509c8b5 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -23,16 +23,12 @@ import java.nio.charset.Charset; +import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.RepeatedListVector; import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.BasicTypeHelper; -import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.holders.BitHolder; import org.apache.arrow.vector.holders.IntHolder; import org.apache.arrow.vector.holders.NullableFloat4Holder; @@ -44,10 +40,16 @@ import org.apache.arrow.vector.holders.RepeatedVarBinaryHolder; import org.apache.arrow.vector.holders.UInt4Holder; import org.apache.arrow.vector.holders.VarCharHolder; -import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.util.OversizedAllocationException; import org.junit.After; import org.junit.Before; +import 
org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExternalResource; public class TestValueVector { @@ -57,6 +59,28 @@ public class TestValueVector { private BufferAllocator allocator; + // Rule to adjust MAX_ALLOCATION_SIZE and restore it back after the tests + @Rule + public final ExternalResource rule = new ExternalResource() { + private final String systemValue = System.getProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY); + private final String testValue = Long.toString(32*1024*1024); + + @Override + protected void before() throws Throwable { + System.setProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY, testValue); + } + + @Override + protected void after() { + if (systemValue != null) { + System.setProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY, systemValue); + } + else { + System.clearProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY); + } + } + }; + @Before public void init() { allocator = new RootAllocator(Long.MAX_VALUE); From e316b3f765167fa1f45197061624e73332b095f4 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Fri, 15 Apr 2016 14:00:19 -0700 Subject: [PATCH 077/210] Fix BaseAllocator.java NPE when assertions are disabled This closes #64 When verifying memory using verifyAllocator() method, BaseAllocator throws NPE if assertions are disabled. Fixing this issue by checking first if assertion are disabled --- .../apache/arrow/memory/BaseAllocator.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 90257bb9ffb..f1503c902d0 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -99,6 +99,7 @@ protected BaseAllocator( } + @Override public void assertOpen() { if (AssertionUtil.ASSERT_ENABLED) { if (isClosed) { @@ -287,6 +288,7 @@ public Reservation() { } } + @Override public boolean add(final int nBytes) { assertOpen(); @@ -308,6 +310,7 @@ public boolean add(final int nBytes) { return true; } + @Override public ArrowBuf allocateBuffer() { assertOpen(); @@ -319,14 +322,17 @@ public ArrowBuf allocateBuffer() { return arrowBuf; } + @Override public int getSize() { return nBytes; } + @Override public boolean isUsed() { return used; } + @Override public boolean isClosed() { return closed; } @@ -364,6 +370,7 @@ public void close() { closed = true; } + @Override public boolean reserve(int nBytes) { assertOpen(); @@ -509,6 +516,7 @@ public synchronized void close() { } + @Override public String toString() { final Verbosity verbosity = logger.isTraceEnabled() ? Verbosity.LOG_WITH_STACKTRACE : Verbosity.BASIC; @@ -523,6 +531,7 @@ public String toString() { * * @return A Verbose string of current allocator state. */ + @Override public String toVerboseString() { final StringBuilder sb = new StringBuilder(); print(sb, 0, Verbosity.LOG_WITH_STACKTRACE); @@ -575,13 +584,12 @@ void verifyAllocator() { * when any problems are found */ private void verifyAllocator(final IdentityHashMap buffersSeen) { - synchronized (DEBUG_LOCK) { - - // The remaining tests can only be performed if we're in debug mode. - if (!DEBUG) { - return; - } + // The remaining tests can only be performed if we're in debug mode. 
+ if (!DEBUG) { + return; + } + synchronized (DEBUG_LOCK) { final long allocated = getAllocatedMemory(); // verify my direct descendants From 703546787e049f1abbc96082f60fe4d08731a5ce Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Wed, 13 Apr 2016 22:36:38 -0700 Subject: [PATCH 078/210] Add java support to Travis CI Add java support to Travis CI using oracle JDK7 on a Linux host. --- .travis.yml | 6 +++++- ci/travis_script_java.sh | 11 +++++++++++ java/pom.xml | 2 +- 3 files changed, 17 insertions(+), 2 deletions(-) create mode 100755 ci/travis_script_java.sh diff --git a/.travis.yml b/.travis.yml index 646f80fee7b..7c4183700ca 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,6 @@ addons: - gcc-4.9 # Needed for C++11 - g++-4.9 # Needed for C++11 - gdb - - gcov - ccache - cmake - valgrind @@ -60,6 +59,11 @@ matrix: before_install: script: - $TRAVIS_BUILD_DIR/ci/travis_conda_build.sh + - language: java + os: linux + jdk: oraclejdk7 + script: + - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh before_install: - ulimit -c unlimited -S diff --git a/ci/travis_script_java.sh b/ci/travis_script_java.sh new file mode 100755 index 00000000000..2d11eaeb4c5 --- /dev/null +++ b/ci/travis_script_java.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +set -e + +JAVA_DIR=${TRAVIS_BUILD_DIR}/java + +pushd $JAVA_DIR + +mvn -B test + +popd diff --git a/java/pom.xml b/java/pom.xml index 4ee4ff4f760..ea42894fda2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -297,7 +297,7 @@ maven-surefire-plugin 2.17 - -ea + true ${forkCount} true From cd1d770ede57f08b8be2f2b42f2f629eb5106098 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Mon, 23 May 2016 13:55:51 -0700 Subject: [PATCH 079/210] ARROW-206: Expose a C++ api to compare ranges of slots between two arrays @wesm the need for this grew out of @fengguangyuan PR to add struct type (#66) and struct builder. I considered a different APIs before settling on this: 1. Add an API that took the parent bitmask (this potentially has possibility of being the most performant, but would have a more awkward contract then provided here) 2. Add an equality comparison for a single slot (leaves the least amount of room for optimization but it would be the simplest to implement). 3. This API which potentially leaves some room for optimization but I think places the least requirements on the caller. Let me know if you would prefer a different API. WIP because I need to add more unit tests (I also need to think about if it is worth mirroring the EqualsExact in addition to the Equals method). Which I should get to by the end of the weekend. 
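To make the proposed contract concrete, here is a minimal usage sketch (my own
illustration, not part of the diff; it assumes arrow/array.h is included and
two arrays built elsewhere, and it mirrors the calls in the new array-test.cc
below):

```
// Hypothetical helper using the API added in this patch.
// RangeEquals(start_idx, end_idx, other_start_idx, other) treats end_idx as
// exclusive and does not bounds-check, so the caller must ensure both
// ranges are valid for their respective arrays.
bool TailAndShiftedRangesEqual(const arrow::ArrayPtr& a, const arrow::ArrayPtr& b) {
  // Slots [4, 8) of `a` against slots [4, 8) of `b`; per-slot null-ness is
  // part of the comparison.
  bool equal_tail = a->RangeEquals(4, 8, 4, b);
  // Slots [0, 2) of `a` against slots [3, 5) of `b`: the third argument is
  // the starting slot in the other array.
  bool equal_shifted = a->RangeEquals(0, 2, 3, b);
  return equal_tail && equal_shifted;
}
```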
@fengguangyuan let me know if this makes sense to you as a way forward on your PR Author: Micah Kornfield Closes #80 from emkornfield/emk_add_equality and squashes the following commits: d5ae777 [Micah Kornfield] remove todo, its handled by type_traits f963639 [Micah Kornfield] add in check for null arrays f5c6bd5 [Micah Kornfield] make format/lint check dcbaad4 [Micah Kornfield] unittests passing 318855d [Micah Kornfield] working primitive tests dadb244 [Micah Kornfield] wip expose range equality to to allow for nested comparisons --- cpp/src/arrow/array-test.cc | 29 +++++++++++++++ cpp/src/arrow/array.cc | 7 ++++ cpp/src/arrow/array.h | 9 ++++- cpp/src/arrow/types/list-test.cc | 36 +++++++++++++++++++ cpp/src/arrow/types/list.cc | 26 ++++++++++++++ cpp/src/arrow/types/list.h | 3 ++ cpp/src/arrow/types/primitive-test.cc | 51 +++++++++++++++++++++++++++ cpp/src/arrow/types/primitive.cc | 17 ++++++++- cpp/src/arrow/types/primitive.h | 35 ++++++++++++++---- 9 files changed, 205 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index b4c727997ee..3b4736327b4 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -56,6 +56,35 @@ TEST_F(TestArray, TestLength) { ASSERT_EQ(arr->length(), 100); } +ArrayPtr MakeArrayFromValidBytes(const std::vector& v, MemoryPool* pool) { + int32_t null_count = v.size() - std::accumulate(v.begin(), v.end(), 0); + std::shared_ptr null_buf = test::bytes_to_null_buffer(v); + + BufferBuilder value_builder(pool); + for (size_t i = 0; i < v.size(); ++i) { + value_builder.Append(0); + } + + ArrayPtr arr(new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf)); + return arr; +} + +TEST_F(TestArray, TestEquality) { + auto array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_); + auto equal_array = MakeArrayFromValidBytes({1, 0, 1, 1, 0, 1, 0, 0}, pool_); + auto unequal_array = MakeArrayFromValidBytes({1, 1, 1, 1, 0, 1, 0, 0}, pool_); + + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + EXPECT_TRUE(array->RangeEquals(4, 8, 4, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 4, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 8, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); +} + TEST_F(TestArray, TestIsNull) { // clang-format off std::vector null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0, diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index c6b9b1599cd..d6b081f3155 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -58,4 +58,11 @@ bool NullArray::Equals(const std::shared_ptr& arr) const { return arr->length() == length_; } +bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, + const std::shared_ptr& arr) const { + if (!arr) { return false; } + if (Type::NA != arr->type_enum()) { return false; } + return true; +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index f98c4c28310..76dc0f59814 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -59,6 +59,12 @@ class Array { bool EqualsExact(const Array& arr) const; virtual bool Equals(const std::shared_ptr& arr) const = 0; + + // Compare if the range of slots specified are equal for the given array and + // this array. end_idx exclusive. This methods does not bounds check. 
+ virtual bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const = 0; + // Determines if the array is internally consistent. Defaults to always // returning Status::OK. This can be an expensive check. virtual Status Validate() const; @@ -85,10 +91,11 @@ class NullArray : public Array { explicit NullArray(int32_t length) : NullArray(std::make_shared(), length) {} bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, + const std::shared_ptr& arr) const override; }; typedef std::shared_ptr ArrayPtr; - } // namespace arrow #endif diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 6a8ad9aa59e..2e41b4a61ca 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -86,6 +86,42 @@ class TestListBuilder : public TestBuilder { shared_ptr result_; }; +TEST_F(TestListBuilder, Equality) { + Int32Builder* vb = static_cast(builder_->value_builder().get()); + + ArrayPtr array, equal_array, unequal_array; + vector equal_offsets = {0, 1, 2, 5}; + vector equal_values = {1, 2, 3, 4, 5, 2, 2, 2}; + vector unequal_offsets = {0, 1, 4}; + vector unequal_values = {1, 2, 2, 2, 3, 4, 5}; + + // setup two equal arrays + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + array = builder_->Finish(); + ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); + ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); + equal_array = builder_->Finish(); + // now an unequal one + ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); + ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); + unequal_array = builder_->Finish(); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array)); + EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array)); +} + TEST_F(TestListBuilder, TestResize) {} TEST_F(TestListBuilder, TestAppendNull) { diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 76e7fe5f4d4..6334054caf8 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -44,6 +44,32 @@ bool ListArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i)) { return false; } + if (is_null) continue; + const int32_t begin_offset = offset(i); + const int32_t end_offset = offset(i + 1); + const int32_t other_begin_offset = other->offset(o_i); + const int32_t other_end_offset = other->offset(o_i + 1); + // 
Underlying can't be equal if the size isn't equal + if (end_offset - begin_offset != other_end_offset - other_begin_offset) { + return false; + } + if (!values_->RangeEquals( + begin_offset, end_offset, other_begin_offset, other->values())) { + return false; + } + } + return true; +} + Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index a020b8ad226..0a3941633eb 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -72,6 +72,9 @@ class ListArray : public Array { bool EqualsExact(const ListArray& other) const; bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; + protected: std::shared_ptr offset_buf_; const int32_t* offsets_; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 2b4c0879a28..87eb0fe3a8b 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -304,6 +304,57 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); } +template +Status MakeArray(const vector& valid_bytes, const vector& draws, int size, + Builder* builder, ArrayPtr* out) { + // Append the first 1000 + for (int i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + RETURN_NOT_OK(builder->Append(draws[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + *out = builder->Finish(); + return Status::OK(); +} + +TYPED_TEST(TestPrimitiveBuilder, Equality) { + DECL_T(); + + const int size = 1000; + this->RandomData(size); + vector& draws = this->draws_; + vector& valid_bytes = this->valid_bytes_; + ArrayPtr array, equal_array, unequal_array; + auto builder = this->builder_.get(); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array)); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &equal_array)); + + // Make the not equal array by negating the first valid element with itself. 
+ const auto first_valid = std::find_if( + valid_bytes.begin(), valid_bytes.end(), [](uint8_t valid) { return valid > 0; }); + const int first_valid_idx = std::distance(valid_bytes.begin(), first_valid); + // This should be true with a very high probability, but might introduce flakiness + ASSERT_LT(first_valid_idx, size - 1); + draws[first_valid_idx] = ~*reinterpret_cast(&draws[first_valid_idx]); + ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array)); + + // test normal equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_array)); + EXPECT_FALSE(unequal_array->Equals(equal_array)); + + // Test range equality + EXPECT_FALSE(array->RangeEquals(0, first_valid_idx + 1, 0, unequal_array)); + EXPECT_FALSE(array->RangeEquals(first_valid_idx, size, first_valid_idx, unequal_array)); + EXPECT_TRUE(array->RangeEquals(0, first_valid_idx, 0, unequal_array)); + EXPECT_TRUE( + array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array)); +} + TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { DECL_T(); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 57a3f1e4e15..8e6c0f809ca 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -185,10 +185,25 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const { } } -bool BooleanArray::Equals(const std::shared_ptr& arr) const { +bool BooleanArray::Equals(const ArrayPtr& arr) const { if (this == arr.get()) return true; if (Type::BOOL != arr->type_enum()) { return false; } return EqualsExact(*static_cast(arr.get())); } +bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, + int32_t other_start_idx, const ArrayPtr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + const bool is_null = IsNull(i); + if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) { + return false; + } + } + return true; +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index fc45f6c5b05..9597fc83631 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -66,6 +66,22 @@ class PrimitiveArray : public Array { return PrimitiveArray::EqualsExact(*static_cast(&other)); \ } \ \ + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, \ + const ArrayPtr& arr) const override { \ + if (this == arr.get()) { return true; } \ + if (!arr) { return false; } \ + if (this->type_enum() != arr->type_enum()) { return false; } \ + const auto other = static_cast(arr.get()); \ + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { \ + const bool is_null = IsNull(i); \ + if (is_null != arr->IsNull(o_i) || \ + (!is_null && Value(i) != other->Value(o_i))) { \ + return false; \ + } \ + } \ + return true; \ + } \ + \ const T* raw_data() const { return reinterpret_cast(raw_data_); } \ \ T Value(int i) const { return raw_data()[i]; } \ @@ -95,8 +111,10 @@ class PrimitiveBuilder : public ArrayBuilder { using ArrayBuilder::Advance; // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory - void AppendNulls(const uint8_t* valid_bytes, int32_t length) { + Status 
AppendNulls(const uint8_t* valid_bytes, int32_t length) {
+    RETURN_NOT_OK(Reserve(length));
     UnsafeAppendToBitmap(valid_bytes, length);
+    return Status::OK();
   }
 
   Status AppendNull() {
@@ -139,9 +157,10 @@ class NumericBuilder : public PrimitiveBuilder<T> {
   using PrimitiveBuilder<T>::Reserve;
 
   // Scalar append.
-  void Append(value_type val) {
-    ArrayBuilder::Reserve(1);
+  Status Append(value_type val) {
+    RETURN_NOT_OK(ArrayBuilder::Reserve(1));
     UnsafeAppend(val);
+    return Status::OK();
   }
 
   // Does not capacity-check; make sure to call Reserve beforehand
@@ -248,7 +267,9 @@ class BooleanArray : public PrimitiveArray {
       int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   bool EqualsExact(const BooleanArray& other) const;
-  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool Equals(const ArrayPtr& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const ArrayPtr& arr) const override;
 
   const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); }
@@ -274,7 +295,8 @@ class BooleanBuilder : public PrimitiveBuilder<BooleanType> {
   using PrimitiveBuilder<BooleanType>::Append;
 
   // Scalar append
-  void Append(bool val) {
+  Status Append(bool val) {
+    Reserve(1);
     util::set_bit(null_bitmap_data_, length_);
     if (val) {
       util::set_bit(raw_data_, length_);
@@ -282,9 +304,10 @@ class BooleanBuilder : public PrimitiveBuilder<BooleanType> {
       util::clear_bit(raw_data_, length_);
     }
     ++length_;
+    return Status::OK();
   }
 
-  void Append(uint8_t val) { Append(static_cast<bool>(val)); }
+  Status Append(uint8_t val) { return Append(static_cast<bool>(val)); }
 };
 
 }  // namespace arrow

From c8b8078810be1d703c0261859b0862d574384600 Mon Sep 17 00:00:00 2001
From: Edmon Begoli
Date: Sat, 28 May 2016 19:11:47 -0400
Subject: [PATCH 080/210] [Doc] Update Layout.md

For clarity, added references to the official SIMD documentation, a
description of Endianness, and Parquet. Used Markdown syntax for the
exponent to document the size of the arrays.

Closes PR #82.
---
 format/Layout.md | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/format/Layout.md b/format/Layout.md
index 34eade31341..9de0479738a 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -41,7 +41,7 @@ Base requirements
   proprietary systems that utilize the open source components.
 * All array slots are accessible in constant time, with complexity growing
   linearly in the nesting level
-* Capable of representing fully-materialized and decoded / decompressed Parquet
+* Capable of representing fully-materialized and decoded / decompressed [Parquet][5]
   data
 * All contiguous memory buffers are aligned at 64-byte boundaries and padded to a multiple of 64 bytes.
 * Any relative type can have null slots
@@ -76,7 +76,7 @@ Base requirements
 * Any memory management or reference counting subsystem
 * To enumerate or specify types of encodings or compression support
 
-## Byte Order (Endianness)
+## Byte Order ([Endianness][3])
 
 The Arrow format is little endian.
 
@@ -91,7 +91,7 @@ requirement follows best practices for optimized memory access:
 * 64 byte alignment is recommended by the [Intel performance guide][2] for
   data-structures over 64 bytes (which will be a common case for Arrow Arrays).
 
-Requiring padding to a multiple of 64 bytes allows for using SIMD instructions
+Requiring padding to a multiple of 64 bytes allows for using [SIMD][4] instructions
 consistently in loops without additional conditional checks.
 This should allow for simpler and more efficient code.
The specific padding length was chosen because it matches the largest known
@@ -105,13 +105,13 @@ Unless otherwise noted, padded bytes do not need to have a specific value.
 
 ## Array lengths
 
 Any array has a known and fixed length, stored as a 32-bit signed integer, so a
-maximum of 2^31 - 1 elements. We choose a signed int32 for a couple reasons:
+maximum of 2<sup>31</sup> - 1 elements. We choose a signed int32 for a couple reasons:
 
 * Enhance compatibility with Java and client languages which may have varying
   quality of support for unsigned integers.
 * To encourage developers to compose smaller arrays (each of which contains
   contiguous memory in its leaf nodes) to create larger array structures
-  possibly exceeding 2^31 - 1 elements, as opposed to allocating very large
+  possibly exceeding 2<sup>31</sup> - 1 elements, as opposed to allocating very large
   contiguous memory blocks.
 
 ## Null count
@@ -238,7 +238,7 @@ A list-array is represented by the combination of the following:
 * A values array, a child array of type T. T may also be a nested type.
 * An offsets buffer containing 32-bit signed integers with length equal to the
   length of the top-level array plus one. Note that this limits the size of the
-  values array to 2^31 -1.
+  values array to 2<sup>31</sup>-1.
 
 The offsets array encodes a start position in the values array, and the length
 of the value in each slot is computed using the first difference with the next
@@ -578,7 +578,11 @@ the the types array indicates that a slot contains a different type at the index
 
 ## References
 
-Drill docs https://drill.apache.org/docs/value-vectors/
+Apache Drill Documentation - [Value Vectors][6]
 
 [1]: https://en.wikipedia.org/wiki/Bit_numbering
 [2]: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors
+[3]: https://en.wikipedia.org/wiki/Endianness
+[4]: https://software.intel.com/en-us/node/600110
+[5]: https://parquet.apache.org/documentation/latest/
+[6]: https://drill.apache.org/docs/value-vectors/

From 65740950c852b82c475ca84e970e147d25d27398 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Thu, 2 Jun 2016 18:36:43 -0700
Subject: [PATCH 081/210] ARROW-209: [C++] Triage builds due to unavailable LLVM apt repo

For now, this unblocks builds until we can resolve the LLVM apt issue.
Author: Wes McKinney
Closes #84 from wesm/ARROW-209 and squashes the following commits:

c6bf166 [Wes McKinney] Remove clang-* packages from apt list
30d8c5c [Wes McKinney] Temporarily disable clang-format and clang-tidy checks in Travis CI build
---
 .travis.yml             |  3 ---
 ci/travis_script_cpp.sh | 12 ++++++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 7c4183700ca..ac2b0d457cb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,10 +5,7 @@ addons:
     sources:
     - ubuntu-toolchain-r-test
     - kalakris-cmake
-    - llvm-toolchain-precise-3.7
     packages:
-    - clang-format-3.7
-    - clang-tidy-3.7
     - gcc-4.9   # Needed for C++11
     - g++-4.9   # Needed for C++11
     - gdb
diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh
index c9b3b5f1442..9cf4f8e3521 100755
--- a/ci/travis_script_cpp.sh
+++ b/ci/travis_script_cpp.sh
@@ -7,10 +7,14 @@ set -e
 pushd $CPP_BUILD_DIR
 
 make lint
-if [ $TRAVIS_OS_NAME == "linux" ]; then
-  make check-format
-  make check-clang-tidy
-fi
+
+# ARROW-209: checks depending on the LLVM toolchain are disabled temporarily
+# until we are able to install the full LLVM toolchain in Travis CI again
+
+# if [ $TRAVIS_OS_NAME == "linux" ]; then
+#   make check-format
+#   make check-clang-tidy
+# fi
 
 ctest -L unittest

From ce2fe7a782c9c1f84a6ccdc2b7b00768d535d8fc Mon Sep 17 00:00:00 2001
From: Smyatkin Maxim
Date: Mon, 6 Jun 2016 23:25:31 -0700
Subject: [PATCH 082/210] ARROW-211: [Format] Fixed typos in layout examples

Just a few typo fixes according to the ticket.

Author: Smyatkin Maxim
Closes #86 from Smyatkin-Maxim/ARROW-211 and squashes the following commits:

6cefba6 [Smyatkin Maxim] ARROW-211: [Format] Fixed typos in layout examples
---
 format/Layout.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/format/Layout.md b/format/Layout.md
index 9de0479738a..815c47f2c93 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -299,7 +299,7 @@ will be be represented as follows:
 
   | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 |
   |------------|------------|------------|-------------|-------------|
-  | 0 | 2 | 6 | 7 | unspecified |
+  | 0 | 2 | 5 | 6 | unspecified |
 
 * Values array (`List`)
   * Length: 6, Null count: 1
@@ -368,7 +368,7 @@ The layout for [{'joe', 1}, {null, 2}, null, {'mark', 4}] would be:
 
   | Byte 0 (validity bitmap) | Bytes 1-7 |
   |--------------------------|-----------------------|
-  | 00011101 | 0 (padding) |
+  | 00001101 | 0 (padding) |
 
 * Offsets buffer:
@@ -472,7 +472,7 @@ An example layout for logical union of:
 | 1.2, 3.4 | unspecified |
 
-  * Field-1 array (f: float):
+  * Field-1 array (i: int32):
    * Length: 1, nulls: 0
    * Null bitmap buffer: Not required
@@ -499,7 +499,7 @@ union, it has some advantages that may be desirable in certain use cases:
 
 For the union array:
 
-[{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, 'mark']
+[{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}]
 
 will have the following layout:
 ```

From 9ce13a06726874c04433100127f74e6ea4afa855 Mon Sep 17 00:00:00 2001
From: fengguangyuan
Date: Mon, 6 Jun 2016 23:32:38 -0700
Subject: [PATCH 083/210] ARROW-60: [C++] Struct type builder API

Implement the basic classes, `StructArray` and `StructBuilder`; meanwhile,
add the respective test cases for them.
Other necessary methods will be added subsequently.

Author: fengguangyuan
Closes #66 from fengguangyuan/ARROW-60 and squashes the following commits:

190967f [fengguangyuan] ARROW-60: [C++] Struct type builder API Add field index and TODO comment.
ae74c80 [fengguangyuan] ARROW-60: Struct type builder API Add RangeEquals method to implement Equals method. fa856fd [fengguangyuan] ARROW-60:[C++] Struct typebuilder API Modify Validate() refered to the specification. bfabdc1 [fengguangyuan] ARROW-60: Struct type builder API Refine the previous committed patch. Add validate methods for testing StructArray and StructBuilder. TODO, Equals methods also need to be tested, but now it's not convient to do it. 5733de7 [fengguangyuan] ARROW-60: Struct type builder API --- cpp/src/arrow/type.h | 1 + cpp/src/arrow/types/construct.cc | 15 ++ cpp/src/arrow/types/construct.h | 3 +- cpp/src/arrow/types/struct-test.cc | 332 +++++++++++++++++++++++++++++ cpp/src/arrow/types/struct.cc | 72 ++++++- cpp/src/arrow/types/struct.h | 97 ++++++++- 6 files changed, 517 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 77404cd7025..f366645cd5c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -161,6 +161,7 @@ struct Field { std::string ToString() const; }; +typedef std::shared_ptr FieldPtr; template struct PrimitiveType : public DataType { diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 78036d4bf57..bcb0ec49090 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -23,6 +23,7 @@ #include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/types/string.h" +#include "arrow/types/struct.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -66,6 +67,20 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, out->reset(new ListBuilder(pool, value_builder)); return Status::OK(); } + + case Type::STRUCT: { + std::vector& fields = type->children_; + std::vector> values_builder; + + for (auto it : fields) { + std::shared_ptr builder; + RETURN_NOT_OK(MakeBuilder(pool, it->type, &builder)); + values_builder.push_back(builder); + } + out->reset(new StructBuilder(pool, type, values_builder)); + return Status::OK(); + } + default: return Status::NotImplemented(type->ToString()); } diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index 43c0018c67e..d0370840ca1 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -20,13 +20,14 @@ #include #include - +#include namespace arrow { class Array; class ArrayBuilder; class Buffer; struct DataType; +struct Field; class MemoryPool; class Status; diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 79d560e19bc..d2bd2971d04 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -21,7 +21,16 @@ #include "gtest/gtest.h" +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/types/construct.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/struct.h" +#include "arrow/types/test-common.h" +#include "arrow/util/status.h" using std::shared_ptr; using std::string; @@ -52,4 +61,327 @@ TEST(TestStructType, Basics) { // TODO(wesm): out of bounds for field(...) 
} +void ValidateBasicStructArray(const StructArray* result, + const vector& struct_is_valid, const vector& list_values, + const vector& list_is_valid, const vector& list_lengths, + const vector& list_offsets, const vector& int_values) { + ASSERT_EQ(4, result->length()); + ASSERT_OK(result->Validate()); + + auto list_char_arr = static_cast(result->field(0).get()); + auto char_arr = static_cast(list_char_arr->values().get()); + auto int32_arr = static_cast(result->field(1).get()); + + ASSERT_EQ(0, result->null_count()); + ASSERT_EQ(1, list_char_arr->null_count()); + ASSERT_EQ(0, int32_arr->null_count()); + + // List + ASSERT_EQ(4, list_char_arr->length()); + ASSERT_EQ(10, list_char_arr->values()->length()); + for (size_t i = 0; i < list_offsets.size(); ++i) { + ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]); + } + for (size_t i = 0; i < list_values.size(); ++i) { + ASSERT_EQ(list_values[i], char_arr->Value(i)); + } + + // Int32 + ASSERT_EQ(4, int32_arr->length()); + for (size_t i = 0; i < int_values.size(); ++i) { + ASSERT_EQ(int_values[i], int32_arr->Value(i)); + } +} + +// ---------------------------------------------------------------------------------- +// Struct test +class TestStructBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + + auto int32_type = TypePtr(new Int32Type()); + auto char_type = TypePtr(new Int8Type()); + auto list_type = TypePtr(new ListType(char_type)); + + std::vector types = {list_type, int32_type}; + std::vector fields; + fields.push_back(FieldPtr(new Field("list", list_type))); + fields.push_back(FieldPtr(new Field("int", int32_type))); + + type_ = TypePtr(new StructType(fields)); + value_fields_ = fields; + + std::shared_ptr tmp; + ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); + + builder_ = std::dynamic_pointer_cast(tmp); + ASSERT_EQ(2, builder_->field_builders().size()); + } + + void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + + protected: + std::vector value_fields_; + TypePtr type_; + + std::shared_ptr builder_; + std::shared_ptr result_; +}; + +TEST_F(TestStructBuilder, TestAppendNull) { + ASSERT_OK(builder_->AppendNull()); + ASSERT_OK(builder_->AppendNull()); + ASSERT_EQ(2, builder_->field_builders().size()); + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_OK(list_vb->AppendNull()); + ASSERT_EQ(2, list_vb->length()); + + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_OK(int_vb->AppendNull()); + ASSERT_EQ(2, int_vb->length()); + + Done(); + + ASSERT_OK(result_->Validate()); + + ASSERT_EQ(2, result_->fields().size()); + ASSERT_EQ(2, result_->length()); + ASSERT_EQ(2, result_->field(0)->length()); + ASSERT_EQ(2, result_->field(1)->length()); + ASSERT_TRUE(result_->IsNull(0)); + ASSERT_TRUE(result_->IsNull(1)); + ASSERT_TRUE(result_->field(0)->IsNull(0)); + ASSERT_TRUE(result_->field(0)->IsNull(1)); + ASSERT_TRUE(result_->field(1)->IsNull(0)); + ASSERT_TRUE(result_->field(1)->IsNull(1)); + + ASSERT_EQ(Type::LIST, result_->field(0)->type_enum()); + ASSERT_EQ(Type::INT32, result_->field(1)->type_enum()); +} + +TEST_F(TestStructBuilder, TestBasics) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6, 10}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; + + ListBuilder* list_vb = 
static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_EQ(2, builder_->field_builders().size()); + + EXPECT_OK(builder_->Resize(list_lengths.size())); + EXPECT_OK(char_vb->Resize(list_values.size())); + EXPECT_OK(int_vb->Resize(int_values.size())); + + int pos = 0; + for (size_t i = 0; i < list_lengths.size(); ++i) { + ASSERT_OK(list_vb->Append(list_is_valid[i] > 0)); + int_vb->UnsafeAppend(int_values[i]); + for (int j = 0; j < list_lengths[i]; ++j) { + char_vb->UnsafeAppend(list_values[pos++]); + } + } + + for (size_t i = 0; i < struct_is_valid.size(); ++i) { + ASSERT_OK(builder_->Append(struct_is_valid[i] > 0)); + } + + Done(); + + ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} + +TEST_F(TestStructBuilder, BulkAppend) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + Done(); + ValidateBasicStructArray(result_.get(), struct_is_valid, list_values, list_is_valid, + list_lengths, list_offsets, int_values); +} + +TEST_F(TestStructBuilder, BulkAppendInvalid) { + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 0, 1, 1}; // should be 1, 1, 1, 1 + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + + Done(); + // Even null bitmap of the parent Struct is not valid, Validate() will ignore it. 
+ ASSERT_OK(result_->Validate()); +} + +TEST_F(TestStructBuilder, TestEquality) { + ArrayPtr array, equal_array; + ArrayPtr unequal_bitmap_array, unequal_offsets_array, unequal_values_array; + + vector int_values = {1, 2, 3, 4}; + vector list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'm', 'a', 'r', 'k'}; + vector list_lengths = {3, 0, 3, 4}; + vector list_offsets = {0, 3, 3, 6}; + vector list_is_valid = {1, 0, 1, 1}; + vector struct_is_valid = {1, 1, 1, 1}; + + vector unequal_int_values = {4, 2, 3, 1}; + vector unequal_list_values = {'j', 'o', 'e', 'b', 'o', 'b', 'l', 'u', 'c', 'y'}; + vector unequal_list_offsets = {0, 3, 4, 6}; + vector unequal_list_is_valid = {1, 1, 1, 1}; + vector unequal_struct_is_valid = {1, 0, 0, 1}; + + ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); + Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); + Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); + ASSERT_OK(builder_->Reserve(list_lengths.size())); + ASSERT_OK(char_vb->Reserve(list_values.size())); + ASSERT_OK(int_vb->Reserve(int_values.size())); + + // setup two equal arrays, one of which takes an unequal bitmap + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with the unequal bitmap + builder_->Append(unequal_struct_is_valid.size(), unequal_struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + unequal_bitmap_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup an unequal one with unequal offsets + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(unequal_list_offsets.data(), unequal_list_offsets.size(), + unequal_list_is_valid.data()); + for (int8_t value : list_values) { + char_vb->UnsafeAppend(value); + } + for (int32_t value : int_values) { + int_vb->UnsafeAppend(value); + } + unequal_offsets_array = builder_->Finish(); + + ASSERT_OK(builder_->Resize(list_lengths.size())); + ASSERT_OK(char_vb->Resize(list_values.size())); + ASSERT_OK(int_vb->Resize(int_values.size())); + + // setup anunequal one with unequal values + builder_->Append(struct_is_valid.size(), struct_is_valid.data()); + list_vb->Append(list_offsets.data(), list_offsets.size(), list_is_valid.data()); + for (int8_t value : unequal_list_values) { + char_vb->UnsafeAppend(value); + } + for 
(int32_t value : unequal_int_values) { + int_vb->UnsafeAppend(value); + } + unequal_values_array = builder_->Finish(); + + // Test array equality + EXPECT_TRUE(array->Equals(array)); + EXPECT_TRUE(array->Equals(equal_array)); + EXPECT_TRUE(equal_array->Equals(array)); + EXPECT_FALSE(equal_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(equal_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_values_array)); + EXPECT_FALSE(unequal_values_array->Equals(unequal_bitmap_array)); + EXPECT_FALSE(unequal_bitmap_array->Equals(unequal_offsets_array)); + EXPECT_FALSE(unequal_offsets_array->Equals(unequal_bitmap_array)); + + // Test range equality + EXPECT_TRUE(array->RangeEquals(0, 4, 0, equal_array)); + EXPECT_TRUE(array->RangeEquals(3, 4, 3, unequal_bitmap_array)); + EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_offsets_array)); + EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array)); + EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array)); + EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array)); +} + +TEST_F(TestStructBuilder, TestZeroLength) { + // All buffers are null + Done(); + ASSERT_OK(result_->Validate()); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 04a277a86fa..e8176f08268 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -17,4 +17,74 @@ #include "arrow/types/struct.h" -namespace arrow {} // namespace arrow +#include + +namespace arrow { + +bool StructArray::Equals(const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (this->type_enum() != arr->type_enum()) { return false; } + if (null_count_ != arr->null_count()) { return false; } + return RangeEquals(0, length_, 0, arr); +} + +bool StructArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const std::shared_ptr& arr) const { + if (this == arr.get()) { return true; } + if (!arr) { return false; } + if (Type::STRUCT != arr->type_enum()) { return false; } + const auto other = static_cast(arr.get()); + + bool equal_fields = true; + for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { + if (IsNull(i) != arr->IsNull(o_i)) { return false; } + if (IsNull(i)) continue; + for (size_t j = 0; j < field_arrays_.size(); ++j) { + // TODO: really we should be comparing stretches of non-null data rather + // than looking at one value at a time. 
+        equal_fields = field(j)->RangeEquals(i, i + 1, o_i, other->field(j));
+        if (!equal_fields) { return false; }
+      }
+    }
+
+  return true;
+}
+
+Status StructArray::Validate() const {
+  if (length_ < 0) { return Status::Invalid("Length was negative"); }
+
+  if (null_count() > length_) {
+    return Status::Invalid("Null count exceeds the length of this struct");
+  }
+
+  if (field_arrays_.size() > 0) {
+    // Validate fields
+    int32_t array_length = field_arrays_[0]->length();
+    size_t idx = 0;
+    for (auto it : field_arrays_) {
+      if (it->length() != array_length) {
+        std::stringstream ss;
+        ss << "Length is not equal from field " << it->type()->ToString()
+           << " at position {" << idx << "}";
+        return Status::Invalid(ss.str());
+      }
+
+      const Status child_valid = it->Validate();
+      if (!child_valid.ok()) {
+        std::stringstream ss;
+        ss << "Child array invalid: " << child_valid.ToString() << " at position {" << idx
+           << "}";
+        return Status::Invalid(ss.str());
+      }
+      ++idx;
+    }
+
+    if (array_length > 0 && array_length != length_) {
+      return Status::Invalid("Struct's length is not equal to its child arrays");
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index 17e32993bf9..78afd29eb8d 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -23,7 +23,102 @@
 #include 
 
 #include "arrow/type.h"
+#include "arrow/types/list.h"
+#include "arrow/types/primitive.h"
 
-namespace arrow {}  // namespace arrow
+namespace arrow {
+
+class StructArray : public Array {
+ public:
+  StructArray(const TypePtr& type, int32_t length, std::vector<ArrayPtr>& field_arrays,
+      int32_t null_count = 0, std::shared_ptr<Buffer> null_bitmap = nullptr)
+      : Array(type, length, null_count, null_bitmap) {
+    type_ = type;
+    field_arrays_ = field_arrays;
+  }
+
+  Status Validate() const override;
+
+  virtual ~StructArray() {}
+
+  // Return a shared pointer in case the requestor desires to share ownership
+  // with this array.
+  const std::shared_ptr<Array>& field(int32_t pos) const {
+    DCHECK_GT(field_arrays_.size(), 0);
+    return field_arrays_[pos];
+  }
+  const std::vector<ArrayPtr>& fields() const { return field_arrays_; }
+
+  bool EqualsExact(const StructArray& other) const;
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const override;
+
+ protected:
+  // The child arrays corresponding to each field of the struct data type.
+  std::vector<ArrayPtr> field_arrays_;
+};
+
+// ---------------------------------------------------------------------------------
+// StructArray builder
+// The Append, Resize and Reserve methods act on the StructBuilder itself.
+// Please make sure these methods are called consistently on all child
+// builders to maintain data-structure consistency.
+class StructBuilder : public ArrayBuilder {
+ public:
+  StructBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+      const std::vector<std::shared_ptr<ArrayBuilder>>& field_builders)
+      : ArrayBuilder(pool, type) {
+    field_builders_ = field_builders;
+  }
+
+  // The null bitmap is of equal length to every child field, and any zero
+  // byte will be considered a null for that field. Users must call the
+  // append or advance methods of the child builders independently to
+  // insert the data.
+ Status Append(int32_t length, const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + std::shared_ptr Finish() override { + std::vector fields; + for (auto it : field_builders_) { + fields.push_back(it->Finish()); + } + + auto result = + std::make_shared(type_, length_, fields, null_count_, null_bitmap_); + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + + return result; + } + + // Append an element to the Struct. All child-builders' Append method must + // be called independently to maintain data-structure consistency. + Status Append(bool is_valid = true) { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(is_valid); + return Status::OK(); + } + + Status AppendNull() { return Append(false); } + + const std::shared_ptr field_builder(int pos) const { + DCHECK_GT(field_builders_.size(), 0); + return field_builders_[pos]; + } + const std::vector>& field_builders() const { + return field_builders_; + } + + protected: + std::vector> field_builders_; +}; + +} // namespace arrow #endif // ARROW_TYPES_STRUCT_H From bc6c4c88fb4bfd1d99e71c8043f0ba0ca5544ae2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2016 11:23:07 -0700 Subject: [PATCH 084/210] ARROW-200: [C++/Python] Return error status on string initialization failure Author: Micah Kornfield Closes #88 from emkornfield/emk_arrow_200 and squashes the following commits: 37e23be [Micah Kornfield] ARROW-200: Return error status on string initialization failure --- python/src/pyarrow/adapters/pandas.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 5159d86865c..8dcc2b1c92e 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -669,7 +669,7 @@ class ArrowDeserializer { out_values[i] = make_pystring(data, length); if (out_values[i] == nullptr) { - return Status::OK(); + return Status::UnknownError("String initialization failed"); } } } @@ -678,7 +678,7 @@ class ArrowDeserializer { data = string_arr->GetValue(i, &length); out_values[i] = make_pystring(data, length); if (out_values[i] == nullptr) { - return Status::OK(); + return Status::UnknownError("String initialization failed"); } } } From 8197f246de934db14b3af26a0899d95bffbdc6b2 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Wed, 8 Jun 2016 11:24:04 -0700 Subject: [PATCH 085/210] ARROW-212: Change contract of PrimitiveArray to reflect its abstractness Follow-up based on #80 Author: Micah Kornfield Closes #87 from emkornfield/emk_clarify_primitive and squashes the following commits: 14bd5b2 [Micah Kornfield] ARROW-212: Make the fact that PrimitiveArray is a abstract class more apparent fromt the contract --- cpp/src/arrow/types/primitive.cc | 5 +++++ cpp/src/arrow/types/primitive.h | 15 +++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 8e6c0f809ca..08fc8478e6d 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -162,6 +162,11 @@ BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, : PrimitiveArray( std::make_shared(), length, data, null_count, null_bitmap) {} +BooleanArray::BooleanArray(const TypePtr& type, int32_t length, + const std::shared_ptr& data, int32_t null_count, + const std::shared_ptr& null_bitmap) + : PrimitiveArray(type, length, data, null_count, 
null_bitmap) {} + bool BooleanArray::EqualsExact(const BooleanArray& other) const { if (this == &other) return true; if (null_count_ != other.null_count_) { return false; } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 9597fc83631..f1ec417d510 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -34,11 +34,10 @@ namespace arrow { class MemoryPool; -// Base class for fixed-size logical types +// Base class for fixed-size logical types. See MakePrimitiveArray +// (types/construct.h) for constructing a specific subclass. class PrimitiveArray : public Array { public: - PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, - int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); virtual ~PrimitiveArray() {} const std::shared_ptr& data() const { return data_; } @@ -47,6 +46,8 @@ class PrimitiveArray : public Array { bool Equals(const std::shared_ptr& arr) const override; protected: + PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); std::shared_ptr data_; const uint8_t* raw_data_; }; @@ -55,12 +56,14 @@ class PrimitiveArray : public Array { class NAME : public PrimitiveArray { \ public: \ using value_type = T; \ - using PrimitiveArray::PrimitiveArray; \ \ NAME(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, \ const std::shared_ptr& null_bitmap = nullptr) \ : PrimitiveArray( \ std::make_shared(), length, data, null_count, null_bitmap) {} \ + NAME(const TypePtr& type, int32_t length, const std::shared_ptr& data, \ + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) \ + : PrimitiveArray(type, length, data, null_count, null_bitmap) {} \ \ bool EqualsExact(const NAME& other) const { \ return PrimitiveArray::EqualsExact(*static_cast(&other)); \ @@ -261,10 +264,10 @@ typedef NumericBuilder DoubleBuilder; class BooleanArray : public PrimitiveArray { public: - using PrimitiveArray::PrimitiveArray; - BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); + BooleanArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, + int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); bool EqualsExact(const BooleanArray& other) const; bool Equals(const ArrayPtr& arr) const override; From ec66ddd1fd4954b78967bfa1893480473e4d380c Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 15:08:23 -0700 Subject: [PATCH 086/210] ARROW-203: Python: Basic filename based Parquet read/write Author: Uwe L. Korn Closes #83 from xhochy/arrow-203 and squashes the following commits: 405f85d [Uwe L. Korn] Remove FindParquet duplication 38d786c [Uwe L. Korn] Make code more readable by using using ec07768 [Uwe L. Korn] Set LD_LIBRARY_PATH in python build 8d90d3f [Uwe L. Korn] Do not set LD_LIBRARY_PATH in python build 000e1e3 [Uwe L. Korn] Use unique_ptr and shared_ptr from Cython 8f6010a [Uwe L. Korn] Linter fixes 0514d01 [Uwe L. Korn] Handle exceptions on RowGroupWriter::Close better 77bd21a [Uwe L. Korn] Add pandas roundtrip to tests f583b61 [Uwe L. Korn] Fix rpath for libarrow_parquet 00c1461 [Uwe L. Korn] Also ensure correct OSX compiler flags in PyArrow 4a80116 [Uwe L. Korn] Handle Python3 strings correctly 066c08a [Uwe L. Korn] Add missing functions to smart pointers 5706db2 [Uwe L. Korn] Use length and offset instead of slicing 443de8b [Uwe L. 
Korn] Add miniconda to the LD_LIBRARY_PATH 2dffc14 [Uwe L. Korn] Fix min mistake, use equals instead of == 2006e70 [Uwe L. Korn] Rewrite test py.test style 9520c39 [Uwe L. Korn] Use PARQUET from miniconda path cd3b9a9 [Uwe L. Korn] Also search for Parquet in PyArrow 6a41d23 [Uwe L. Korn] Re-use conda installation from C++ 81f501e [Uwe L. Korn] No need to install conda in travis_script_python anymore b505feb [Uwe L. Korn] Install parquet-cpp via conda 5d4929a [Uwe L. Korn] Add test-util.h 9b06e41 [Uwe L. Korn] Make tests templated be6415c [Uwe L. Korn] Incorportate review comments 0fbed3f [Uwe L. Korn] Remove obsolete parquet files 081db5f [Uwe L. Korn] Limit and document chunk_size 7192cfb [Uwe L. Korn] Add const to slicing parameters 0463995 [Uwe L. Korn] ARROW-203: Python: Basic filename based Parquet read/write --- ci/travis_before_script_cpp.sh | 6 +- ci/travis_conda_build.sh | 22 +- ci/travis_install_conda.sh | 26 +++ ci/travis_script_python.sh | 21 +- cpp/src/arrow/column.h | 2 + cpp/src/arrow/parquet/CMakeLists.txt | 7 + cpp/src/arrow/parquet/parquet-io-test.cc | 256 +++++++++++++++++------ cpp/src/arrow/parquet/reader.cc | 25 +++ cpp/src/arrow/parquet/reader.h | 3 + cpp/src/arrow/parquet/test-util.h | 77 +++++++ cpp/src/arrow/parquet/utils.h | 5 + cpp/src/arrow/parquet/writer.cc | 99 +++++++-- cpp/src/arrow/parquet/writer.h | 12 +- cpp/src/arrow/util/status.h | 9 + python/CMakeLists.txt | 8 + python/cmake_modules/FindArrow.cmake | 14 +- python/conda.recipe/build.sh | 13 ++ python/pyarrow/array.pyx | 3 + python/pyarrow/error.pxd | 2 + python/pyarrow/error.pyx | 8 + python/pyarrow/includes/common.pxd | 9 +- python/pyarrow/includes/libarrow.pxd | 3 + python/pyarrow/includes/parquet.pxd | 46 ++++ python/pyarrow/parquet.pyx | 50 ++++- python/pyarrow/schema.pyx | 9 +- python/pyarrow/tests/test_parquet.py | 59 ++++++ python/setup.py | 4 +- 27 files changed, 654 insertions(+), 144 deletions(-) create mode 100644 ci/travis_install_conda.sh create mode 100644 cpp/src/arrow/parquet/test-util.h create mode 100644 python/pyarrow/tests/test_parquet.py diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 193c76feba1..6159f67e361 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -2,6 +2,10 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh +conda install -y --channel apache/channel/dev parquet-cpp +export PARQUET_HOME=$MINICONDA + : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} mkdir $CPP_BUILD_DIR @@ -19,7 +23,7 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" +CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh index afa531dbd6b..c43a85170b0 100755 --- a/ci/travis_conda_build.sh +++ b/ci/travis_conda_build.sh @@ -2,27 +2,7 @@ set -e -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -wget -O miniconda.sh $MINICONDA_URL -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - -conda config 
--set show_channel_urls yes -conda config --add channels conda-forge -conda config --add channels apache - -conda install --yes conda-build jinja2 anaconda-client - -# faster builds, please -conda install -y nomkl +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh # Build libarrow diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh new file mode 100644 index 00000000000..bef667dff7c --- /dev/null +++ b/ci/travis_install_conda.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -e + +if [ $TRAVIS_OS_NAME == "linux" ]; then + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" +else + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" +fi + +wget -O miniconda.sh $MINICONDA_URL +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +bash miniconda.sh -b -p $MINICONDA +export PATH="$MINICONDA/bin:$PATH" +conda update -y -q conda +conda info -a + +conda config --set show_channel_urls yes +conda config --add channels conda-forge +conda config --add channels apache + +conda install --yes conda-build jinja2 anaconda-client + +# faster builds, please +conda install -y nomkl + diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index d45b895d8cf..6d35785356a 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -4,6 +4,12 @@ set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python +# Re-use conda installation from C++ +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +export PATH="$MINICONDA/bin:$PATH" +export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" +export PARQUET_HOME=$MINICONDA + # Share environment with C++ pushd $CPP_BUILD_DIR source setup_build_env.sh @@ -11,21 +17,6 @@ popd pushd $PYTHON_DIR -# Bootstrap a Conda Python environment - -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -curl $MINICONDA_URL > miniconda.sh -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - python_version_tests() { PYTHON_VERSION=$1 CONDA_ENV_NAME="pyarrow-test-${PYTHON_VERSION}" diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index 22becc34547..e409566e1f1 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -67,6 +67,8 @@ class Column { int64_t null_count() const { return data_->null_count(); } + const std::shared_ptr& field() const { return field_; } + // @returns: the column's name in the passed metadata const std::string& name() const { return field_->name; } diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index c00cc9f0f25..f00bb53c084 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -35,6 +35,13 @@ add_library(arrow_parquet SHARED target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +if (APPLE) + set_target_properties(arrow_parquet + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + ADD_ARROW_TEST(parquet-schema-test) ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 845574d2c53..db779d8309c 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -18,6 +18,7 @@ 
#include "gtest/gtest.h" #include "arrow/test-util.h" +#include "arrow/parquet/test-util.h" #include "arrow/parquet/reader.h" #include "arrow/parquet/writer.h" #include "arrow/types/primitive.h" @@ -44,36 +45,45 @@ namespace arrow { namespace parquet { -template -std::shared_ptr NonNullArray( - size_t size, typename ArrowType::c_type value) { - std::vector values(size, value); - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} +const int SMALL_SIZE = 100; +const int LARGE_SIZE = 10000; -// This helper function only supports (size/2) nulls yet. -template -std::shared_ptr NullableArray( - size_t size, typename ArrowType::c_type value, size_t num_nulls) { - std::vector values(size, value); - std::vector valid_bytes(size, 1); +template +struct test_traits {}; - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; +}; - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; +}; + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; +}; + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; +}; + +template +using ParquetDataType = ::parquet::DataType::parquet_enum>; +template +using ParquetWriter = ::parquet::TypedColumnWriter>; + +template class TestParquetIO : public ::testing::Test { public: + typedef typename TestType::c_type T; virtual void SetUp() {} - std::shared_ptr Schema( + std::shared_ptr MakeSchema( ParquetType::type parquet_type, Repetition::type repetition) { auto pnode = PrimitiveNode::Make("column1", repetition, parquet_type); NodePtr node_ = @@ -98,20 +108,27 @@ class TestParquetIO : public ::testing::Test { std::unique_ptr column_reader; ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(100, out)); + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + ASSERT_NE(nullptr, out->get()); + } + + void ReadTableFromFile( + std::unique_ptr file_reader, std::shared_ptr
<Table>
* out) { + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + ASSERT_NO_THROW(ASSERT_OK(reader.ReadFlatTable(out))); ASSERT_NE(nullptr, out->get()); } - std::unique_ptr Int64File( - std::vector& values, int num_chunks) { - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::unique_ptr TestFile(std::vector& values, int num_chunks) { + std::shared_ptr schema = + MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); std::unique_ptr file_writer = MakeWriter(schema); size_t chunk_size = values.size() / num_chunks; for (int i = 0; i < num_chunks; i++) { auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = - static_cast<::parquet::Int64Writer*>(row_group_writer->NextColumn()); - int64_t* data = values.data() + i * chunk_size; + auto column_writer = static_cast*>( + row_group_writer->NextColumn()); + T* data = values.data() + i * chunk_size; column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); column_writer->Close(); row_group_writer->Close(); @@ -120,71 +137,135 @@ class TestParquetIO : public ::testing::Test { return ReaderFromSink(); } - private: std::shared_ptr sink_; }; -TEST_F(TestParquetIO, SingleColumnInt64Read) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 1); +typedef ::testing::Types TestTypes; + +TYPED_TEST_CASE(TestParquetIO, TestTypes); + +TYPED_TEST(TestParquetIO, SingleColumnRequiredRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); std::shared_ptr out; - ReadSingleColumnFile(std::move(file_reader), &out); + this->ReadSingleColumnFile(std::move(file_reader), &out); - Int64Array* out_array = static_cast(out.get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), out.get()); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 4); +TYPED_TEST(TestParquetIO, SingleColumnRequiredTableRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); + + std::shared_ptr
<Table> out;
+  this->ReadTableFromFile(std::move(file_reader), &out);
+  ASSERT_EQ(1, out->num_columns());
+  ASSERT_EQ(SMALL_SIZE, out->num_rows());
+
+  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+  ASSERT_EQ(1, chunked_array->num_chunks());
+  ExpectArray(values.data(), chunked_array->chunk(0).get());
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedRead) {
+  std::vector values(SMALL_SIZE, 128);
+  std::unique_ptr file_reader = this->TestFile(values, 4);

   std::shared_ptr<Array> out;
-  ReadSingleColumnFile(std::move(file_reader), &out);
+  this->ReadSingleColumnFile(std::move(file_reader), &out);

-  Int64Array* out_array = static_cast<Int64Array*>(out.get());
-  for (size_t i = 0; i < values.size(); i++) {
-    EXPECT_EQ(values[i], out_array->raw_data()[i]);
-  }
+  ExpectArray(values.data(), out.get());
 }

-TEST_F(TestParquetIO, SingleColumnInt64Write) {
-  std::shared_ptr<Array> values = NonNullArray<Int64Type>(100, 128);
+TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedTableRead) {
+  std::vector values(SMALL_SIZE, 128);
+  std::unique_ptr file_reader = this->TestFile(values, 4);
+
+  std::shared_ptr<Table>
out; + this->ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ExpectArray(values.data(), chunked_array->chunk(0).get()); +} + +TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + + std::shared_ptr schema = + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + this->ReadTableFromFile(this->ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + +TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + std::shared_ptr schema = + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { - std::shared_ptr values = NonNullArray(100, 128); - std::shared_ptr values_chunk = NonNullArray(25, 128); +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + this->ReadTableFromFile(this->ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); +TYPED_TEST(TestParquetIO, SingleColumnIntRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + std::shared_ptr values_chunk = + NonNullArray(SMALL_SIZE / 4, 128); + + std::shared_ptr schema = + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); for (int i = 0; i < 4; i++) { ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get()))); @@ -192,18 +273,37 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { - std::shared_ptr values = NullableArray(100, 128, 10); +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(LARGE_SIZE, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512)));
+
+  std::shared_ptr<Table>
out; + this->ReadTableFromFile(this->ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + +TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr values_chunk_nulls = - NullableArray(25, 128, 10); - std::shared_ptr values_chunk = NullableArray(25, 128, 0); + NullableArray(SMALL_SIZE / 4, 128, 10); + std::shared_ptr values_chunk = + NullableArray(SMALL_SIZE / 4, 128, 0); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + std::shared_ptr schema = + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); for (int i = 0; i < 3; i++) { @@ -213,10 +313,28 @@ TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(LARGE_SIZE, 128, 100); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  this->sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512)));
+
+  std::shared_ptr<Table>
out; + this->ReadTableFromFile(this->ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 346de253606..3b4882d4439 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -18,10 +18,14 @@ #include "arrow/parquet/reader.h" #include +#include +#include +#include "arrow/column.h" #include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/types/primitive.h" #include "arrow/util/status.h" @@ -40,6 +44,7 @@ class FileReader::Impl { bool CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr); Status GetFlatColumn(int i, std::unique_ptr* out); Status ReadFlatColumn(int i, std::shared_ptr* out); + Status ReadFlatTable(std::shared_ptr
<Table>* out);

 private:
  MemoryPool* pool_;

@@ -103,6 +108,22 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return flat_column_reader->NextBatch(reader_->num_rows(), out);
 }

+Status FileReader::Impl::ReadFlatTable(std::shared_ptr<Table>
<Table>* table) {
+  const std::string& name = reader_->descr()->schema()->name();
+  std::shared_ptr<Schema> schema;
+  RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema));
+
+  std::vector<std::shared_ptr<Column>> columns(reader_->num_columns());
+  for (int i = 0; i < reader_->num_columns(); i++) {
+    std::shared_ptr<Array> array;
+    RETURN_NOT_OK(ReadFlatColumn(i, &array));
+    columns[i] = std::make_shared<Column>(schema->field(i), array);
+  }
+
+  *table = std::make_shared<Table>
(name, schema, columns);
+  return Status::OK();
+}
+
 FileReader::FileReader(
     MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
     : impl_(new FileReader::Impl(pool, std::move(reader))) {}
@@ -117,6 +138,10 @@ Status FileReader::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return impl_->ReadFlatColumn(i, out);
 }

+Status FileReader::ReadFlatTable(std::shared_ptr<Table>
<Table>* out) {
+  return impl_->ReadFlatTable(out);
+}
+
 FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr,
     ::parquet::ParquetFileReader* reader, int column_index)
     : pool_(pool),
diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h
index 41ca7eb35b9..db7a15753d8 100644
--- a/cpp/src/arrow/parquet/reader.h
+++ b/cpp/src/arrow/parquet/reader.h
@@ -29,6 +29,7 @@ class Array;
 class MemoryPool;
 class RowBatch;
 class Status;
+class Table;

 namespace parquet {

@@ -90,6 +91,8 @@ class FileReader {
   Status GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out);
   // Read column as a whole into an Array.
   Status ReadFlatColumn(int i, std::shared_ptr<Array>* out);
+  // Read a table of flat columns into a Table.
+  Status ReadFlatTable(std::shared_ptr<Table>
* out); virtual ~FileReader(); diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h new file mode 100644 index 00000000000..1496082d5c6 --- /dev/null +++ b/cpp/src/arrow/parquet/test-util.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/types/primitive.h" + +namespace arrow { + +namespace parquet { + +template +std::shared_ptr NonNullArray( + size_t size, typename ArrowType::c_type value) { + std::vector values(size, value); + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size()); + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls yet. +template +std::shared_ptr NullableArray( + size_t size, typename ArrowType::c_type value, size_t num_nulls) { + std::vector values(size, value); + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size(), valid_bytes.data()); + return std::static_pointer_cast(builder.Finish()); +} + +std::shared_ptr MakeColumn(const std::string& name, + const std::shared_ptr& array, bool nullable) { + auto field = std::make_shared(name, array->type(), nullable); + return std::make_shared(field, array); +} + +std::shared_ptr
<Table> MakeSimpleTable(
+    const std::shared_ptr<Array>& values, bool nullable) {
+  std::shared_ptr<Column> column = MakeColumn("col", values, nullable);
+  std::vector<std::shared_ptr<Column>> columns({column});
+  std::vector<std::shared_ptr<Field>> fields({column->field()});
+  auto schema = std::make_shared<Schema>(fields);
+  return std::make_shared<Table>
("table", schema, columns); +} + +template +void ExpectArray(T* expected, Array* result) { + PrimitiveArray* p_array = static_cast(result); + for (size_t i = 0; i < result->length(); i++) { + EXPECT_EQ(expected[i], reinterpret_cast(p_array->data()->data())[i]); + } +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index b32792fdf70..409bcd9065c 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -31,6 +31,11 @@ namespace parquet { (s); \ } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } +#define PARQUET_IGNORE_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) {} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 3ad2c5b0735..1223901d550 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -17,11 +17,21 @@ #include "arrow/parquet/writer.h" +#include +#include + #include "arrow/array.h" +#include "arrow/column.h" +#include "arrow/table.h" +#include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/util/status.h" +using parquet::ParquetFileWriter; +using parquet::schema::GroupNode; + namespace arrow { namespace parquet { @@ -32,8 +42,9 @@ class FileWriter::Impl { Status NewRowGroup(int64_t chunk_size); template - Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data, + int64_t offset, int64_t length); + Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length); Status Close(); virtual ~Impl() {} @@ -60,31 +71,31 @@ Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) { } template -Status FileWriter::Impl::TypedWriteBatch( - ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data) { +Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, + const PrimitiveArray* data, int64_t offset, int64_t length) { + // TODO: DCHECK((offset + length) <= data->length()); auto data_ptr = - reinterpret_cast(data->data()->data()); + reinterpret_cast(data->data()->data()) + + offset; auto writer = reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer); if (writer->descr()->max_definition_level() == 0) { // no nulls, just dump the data - PARQUET_CATCH_NOT_OK(writer->WriteBatch(data->length(), nullptr, nullptr, data_ptr)); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_ptr)); } else if (writer->descr()->max_definition_level() == 1) { - RETURN_NOT_OK(def_levels_buffer_.Resize(data->length() * sizeof(int16_t))); + RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); int16_t* def_levels_ptr = reinterpret_cast(def_levels_buffer_.mutable_data()); if (data->null_count() == 0) { - std::fill(def_levels_ptr, def_levels_ptr + data->length(), 1); - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, data_ptr)); + std::fill(def_levels_ptr, def_levels_ptr + length, 1); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, data_ptr)); } else { - RETURN_NOT_OK(data_buffer_.Resize( - (data->length() - data->null_count()) * sizeof(typename ParquetType::c_type))); + RETURN_NOT_OK(data_buffer_.Resize(length * 
sizeof(typename ParquetType::c_type))); auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); int buffer_idx = 0; - for (size_t i = 0; i < data->length(); i++) { - if (data->IsNull(i)) { + for (size_t i = 0; i < length; i++) { + if (data->IsNull(offset + i)) { def_levels_ptr[i] = 0; } else { def_levels_ptr[i] = 1; @@ -92,7 +103,7 @@ Status FileWriter::Impl::TypedWriteBatch( } } PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, buffer_ptr)); + writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); } } else { return Status::NotImplemented("no support for max definition level > 1 yet"); @@ -107,12 +118,13 @@ Status FileWriter::Impl::Close() { return Status::OK(); } -#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ - case Type::ENUM: \ - return TypedWriteBatch(writer, data); \ +#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ + case Type::ENUM: \ + return TypedWriteBatch(writer, data, offset, length); \ break; -Status FileWriter::Impl::WriteFlatColumnChunk(const PrimitiveArray* data) { +Status FileWriter::Impl::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { ::parquet::ColumnWriter* writer; PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn()); switch (data->type_enum()) { @@ -133,8 +145,11 @@ Status FileWriter::NewRowGroup(int64_t chunk_size) { return impl_->NewRowGroup(chunk_size); } -Status FileWriter::WriteFlatColumnChunk(const PrimitiveArray* data) { - return impl_->WriteFlatColumnChunk(data); +Status FileWriter::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { + int64_t real_length = length; + if (length == -1) { real_length = data->length(); } + return impl_->WriteFlatColumnChunk(data, offset, real_length); } Status FileWriter::Close() { @@ -143,6 +158,48 @@ Status FileWriter::Close() { FileWriter::~FileWriter() {} +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { + std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; + RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); + auto schema_node = std::static_pointer_cast(parquet_schema->schema()); + std::unique_ptr parquet_writer = + ParquetFileWriter::Open(sink, schema_node); + FileWriter writer(pool, std::move(parquet_writer)); + + // TODO: Support writing chunked arrays. + for (int i = 0; i < table->num_columns(); i++) { + if (table->column(i)->data()->num_chunks() != 1) { + return Status::NotImplemented("No support for writing chunked arrays yet."); + } + } + + // Cast to PrimitiveArray instances as we work with them. 
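+  // The dynamic_pointer_cast below doubles as the runtime check for this
+  // precondition: a non-primitive column yields a null pointer, in which
+  // case the writer is closed (ignoring any secondary error via
+  // PARQUET_IGNORE_NOT_OK) before NotImplemented is returned, so the sink
+  // is not left open.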
+ std::vector> arrays(table->num_columns()); + for (int i = 0; i < table->num_columns(); i++) { + // num_chunks == 1 as per above loop + std::shared_ptr array = table->column(i)->data()->chunk(0); + auto primitive_array = std::dynamic_pointer_cast(array); + if (!primitive_array) { + PARQUET_IGNORE_NOT_OK(writer.Close()); + return Status::NotImplemented("Table must consist of PrimitiveArray instances"); + } + arrays[i] = primitive_array; + } + + for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { + int64_t offset = chunk * chunk_size; + int64_t size = std::min(chunk_size, table->num_rows() - offset); + RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); + for (int i = 0; i < table->num_columns(); i++) { + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), + PARQUET_IGNORE_NOT_OK(writer.Close())); + } + } + + return writer.Close(); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 38f7d0b3a89..83e799f7ed1 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -29,6 +29,7 @@ class MemoryPool; class PrimitiveArray; class RowBatch; class Status; +class Table; namespace parquet { @@ -42,7 +43,8 @@ class FileWriter { FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); Status NewRowGroup(int64_t chunk_size); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset = 0, int64_t length = -1); Status Close(); virtual ~FileWriter(); @@ -52,6 +54,14 @@ class FileWriter { std::unique_ptr impl_; }; +/** + * Write a flat Table to Parquet. + * + * The table shall only consist of nullable, non-repeated columns of primitive type. 
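+ *
+ * A rough usage sketch (illustrative; error handling is elided, and the
+ * file-backed sink is just one possible ::parquet::OutputStream):
+ *
+ *   std::shared_ptr<::parquet::OutputStream> sink(
+ *       new ::parquet::LocalFileOutputStream("example.parquet"));
+ *   WriteFlatTable(table.get(), default_memory_pool(), sink,
+ *       table->num_rows());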
+ */ +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size); + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index 6ddc177a9a5..d1a74250008 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -63,6 +63,15 @@ namespace arrow { if (!_s.ok()) { return _s; } \ } while (0); +#define RETURN_NOT_OK_ELSE(s, else_) \ + do { \ + Status _s = (s); \ + if (!_s.ok()) { \ + else_; \ + return _s; \ + } \ + } while (0); + enum class StatusCode : char { OK = 0, OutOfMemory = 1, diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2173232d4ef..f1becfcf449 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -339,11 +339,17 @@ if (PYARROW_BUILD_TESTS) STATIC_LIB ${GTEST_STATIC_LIB}) endif() +## Parquet +find_package(Parquet REQUIRED) +include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ## Arrow find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) +ADD_THIRDPARTY_LIB(arrow_parquet + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -422,6 +428,7 @@ set(PYARROW_SRCS set(LINK_LIBS arrow + arrow_parquet ) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -442,6 +449,7 @@ set(CYTHON_EXTENSIONS array config error + parquet scalar schema table diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 3d9983849eb..f0b258ed027 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -42,19 +42,27 @@ find_library(ARROW_LIB_PATH NAMES arrow ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) +find_library(ARROW_PARQUET_LIB_PATH NAMES arrow_parquet + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) + set(ARROW_PARQUET_LIB_NAME libarrow_parquet) set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) set(ARROW_SHARED_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) else () set(ARROW_FOUND FALSE) endif () if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) - message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}") + message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}, ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -74,4 +82,6 @@ mark_as_advanced( ARROW_LIBS ARROW_STATIC_LIB ARROW_SHARED_LIB + ARROW_PARQUET_STATIC_LIB + ARROW_PARQUET_SHARED_LIB ) diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh index a9d9aedead3..a164c1af518 100644 --- a/python/conda.recipe/build.sh +++ b/python/conda.recipe/build.sh @@ -6,6 +6,19 @@ export ARROW_HOME=$PREFIX cd $RECIPE_DIR +if [ "$(uname)" == "Darwin" ]; then + # C++11 finagling for Mac OSX + export CC=clang + export CXX=clang++ + export MACOSX_VERSION_MIN="10.7" + CXXFLAGS="${CXXFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + CXXFLAGS="${CXXFLAGS} -stdlib=libc++ -std=c++11" + export LDFLAGS="${LDFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + export LDFLAGS="${LDFLAGS} -stdlib=libc++ -std=c++11" + 
export LINKFLAGS="${LDFLAGS}" + export MACOSX_DEPLOYMENT_TARGET=10.7 +fi + echo Setting the compiler... if [ `uname` == Linux ]; then EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index a80b3ce8398..619e5ef7e39 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -68,6 +68,9 @@ cdef class Array: values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) + def equals(Array self, Array other): + return self.ap.Equals(other.sp_array) + def __len__(self): if self.sp_array.get(): return self.sp_array.get().length() diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index d226abeda04..97ba0ef2e9f 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.pyarrow cimport * +cdef check_cstatus(const CStatus& status) cdef check_status(const Status& status) diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 3f8d7dd6460..5a6a038a92e 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -15,12 +15,20 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.common cimport c_string from pyarrow.compat import frombytes class ArrowException(Exception): pass +cdef check_cstatus(const CStatus& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + raise ArrowException(frombytes(c_message)) + cdef check_status(const Status& status): if status.ok(): return diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index e86d5d77e8b..1f6ecee5105 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -19,6 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string as c_string from libcpp.vector cimport vector @@ -32,11 +33,3 @@ cdef extern from "": cdef extern from "": void Py_XDECREF(PyObject* o) -cdef extern from "" namespace "std" nogil: - - cdef cppclass shared_ptr[T]: - shared_ptr() - shared_ptr(T*) - T* get() - void reset() - void reset(T* p) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b2ef45a347b..90414e3d542 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -72,6 +72,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass MemoryPool" arrow::MemoryPool": int64_t bytes_allocated() + cdef MemoryPool* default_memory_pool() + cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type) @@ -103,6 +105,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int32_t null_count() Type type_enum() + c_bool Equals(const shared_ptr[CArray]& arr) c_bool IsNull(int i) cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index ffdc5d48706..0918344070e 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -18,6 +18,26 @@ # distutils: language = c++ from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool + + +cdef extern from 
"parquet/api/schema.h" namespace "parquet::schema" nogil: + cdef cppclass Node: + pass + + cdef cppclass GroupNode(Node): + pass + + cdef cppclass PrimitiveNode(Node): + pass + +cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: + cdef cppclass SchemaDescriptor: + shared_ptr[Node] schema() + GroupNode* group() + + cdef cppclass ColumnDescriptor: + pass cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -48,4 +68,30 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: pass cdef cppclass ParquetFileReader: + # TODO: Some default arguments are missing + @staticmethod + unique_ptr[ParquetFileReader] OpenFile(const c_string& path) + +cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: + cdef cppclass OutputStream: pass + + cdef cppclass LocalFileOutputStream(OutputStream): + LocalFileOutputStream(const c_string& path) + void Close() + + +cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: + cdef cppclass FileReader: + FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) + CStatus ReadFlatTable(shared_ptr[CTable]* out); + + +cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil: + CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, shared_ptr[CSchema]* out) + CStatus ToParquetSchema(const CSchema* arrow_schema, shared_ptr[SchemaDescriptor]* out) + + +cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil: + cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool, shared_ptr[OutputStream] sink, int64_t chunk_size) + diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 622e7d07724..3d5355ebe43 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -19,5 +19,53 @@ # distutils: language = c++ # cython: embedsignature = True -from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * + +from pyarrow.compat import tobytes +from pyarrow.error cimport check_cstatus +from pyarrow.table cimport Table + +def read_table(filename, columns=None): + """ + Read a Table from Parquet format + Returns + ------- + table: pyarrow.Table + """ + cdef unique_ptr[FileReader] reader + cdef Table table = Table() + cdef shared_ptr[CTable] ctable + + # Must be in one expression to avoid calling std::move which is not possible + # in Cython (due to missing rvalue support) + reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), + ParquetFileReader.OpenFile(tobytes(filename)))) + check_cstatus(reader.get().ReadFlatTable(&ctable)) + table.init(ctable) + return table + +def write_table(table, filename, chunk_size=None): + """ + Write a Table to Parquet format + + Parameters + ---------- + table : pyarrow.Table + filename : string + chunk_size : int + The maximum number of rows in each Parquet RowGroup + """ + cdef Table table_ = table + cdef CTable* ctable_ = table_.table + cdef shared_ptr[OutputStream] sink + cdef int64_t chunk_size_ = 0 + if chunk_size is None: + chunk_size_ = min(ctable_.num_rows(), int(2**16)) + else: + chunk_size_ = chunk_size + + sink.reset(new LocalFileOutputStream(tobytes(filename))) + check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) + diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index 22ddf0cf17e..084c304aed2 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ 
-201,7 +201,9 @@ def string(): def list_(DataType value_type): cdef DataType out = DataType() - out.init(shared_ptr[CDataType](new CListType(value_type.sp_type))) + cdef shared_ptr[CDataType] list_type + list_type.reset(new CListType(value_type.sp_type)) + out.init(list_type) return out def struct(fields): @@ -212,12 +214,13 @@ def struct(fields): DataType out = DataType() Field field vector[shared_ptr[CField]] c_fields + cdef shared_ptr[CDataType] struct_type for field in fields: c_fields.push_back(field.sp_field) - out.init(shared_ptr[CDataType]( - new CStructType(c_fields))) + struct_type.reset(new CStructType(c_fields)) + out.init(struct_type) return out def schema(fields): diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py new file mode 100644 index 00000000000..d92cf4ca656 --- /dev/null +++ b/python/pyarrow/tests/test_parquet.py @@ -0,0 +1,59 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.compat import unittest +import pyarrow as arrow +import pyarrow.parquet + +A = arrow + +import numpy as np +import os.path +import pandas as pd + +import pandas.util.testing as pdt + + +def test_single_pylist_column_roundtrip(tmpdir): + for dtype in [int, float]: + filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) + data = [A.from_pylist(list(map(dtype, range(5))))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written.equals(data_read) + +def test_pandas_rountrip(tmpdir): + size = 10000 + df = pd.DataFrame({ + 'int32': np.arange(size, dtype=np.int32), + 'int64': np.arange(size, dtype=np.int64), + 'float32': np.arange(size, dtype=np.float32), + 'float64': np.arange(size, dtype=np.float64) + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + A.parquet.write_table(arrow_table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) + diff --git a/python/setup.py b/python/setup.py index 5f228ed0af2..7edeb914331 100644 --- a/python/setup.py +++ b/python/setup.py @@ -214,7 +214,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'scalar', 'schema', 'table'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema', 'table'] def get_names(self): return self._found_names @@ -242,7 +242,7 
@@ def get_outputs(self): 'clean': clean, 'build_ext': build_ext }, - install_requires=['cython >= 0.21', 'numpy >= 1.9'], + install_requires=['cython >= 0.23', 'numpy >= 1.9'], description=DESC, license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", From b4e0e93d580b8e0344c0caa1cf51cbe088bd25ac Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 15 Jun 2016 13:28:10 -0700 Subject: [PATCH 087/210] ARROW-217: Fix Travis w.r.t conda 4.1.0 changes Travis is happy, fixes the problems we see with Travis in #85 Author: Uwe L. Korn Closes #90 from xhochy/fix-conda-show-channel-urls and squashes the following commits: 82e9840 [Uwe L. Korn] ARROW-217: Fix Travis w.r.t. conda 4.1.0 changes --- ci/travis_before_script_cpp.sh | 2 +- ci/travis_conda_build.sh | 2 +- ci/travis_install_conda.sh | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 6159f67e361..9060cc9b5ef 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -e +set -ex source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh conda install -y --channel apache/channel/dev parquet-cpp diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh index c43a85170b0..a787df79a55 100755 --- a/ci/travis_conda_build.sh +++ b/ci/travis_conda_build.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -set -e +set -ex source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index bef667dff7c..be7f59a4733 100644 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -15,9 +15,11 @@ export PATH="$MINICONDA/bin:$PATH" conda update -y -q conda conda info -a -conda config --set show_channel_urls yes +conda config --set show_channel_urls True +conda config --add channels https://repo.continuum.io/pkgs/free conda config --add channels conda-forge conda config --add channels apache +conda info -a conda install --yes conda-build jinja2 anaconda-client From 790d5412da67f807159f236179a8a7df37b270d2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 16 Jun 2016 10:50:40 -0700 Subject: [PATCH 088/210] ARROW-218: Add optional API token authentication option to PR merge tool You can use an API token with extremely limited privileges (i.e., only access to public GitHub repos), but this helps avoid rate limiting issues on shared outbound IP addresses. 
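For example, a committer can export such a token (the value below is a placeholder) before invoking the tool: ARROW_GITHUB_API_TOKEN=<token> dev/merge_arrow_pr.py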
Author: Wes McKinney Closes #91 from wesm/ARROW-218 and squashes the following commits: f45808c [Wes McKinney] Add optional GitHub API token to patch tool (to avoid rate limiting issues with unauthenticated requests) --- dev/merge_arrow_pr.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index fe0bcd13dd8..981779ffb4c 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -66,7 +66,17 @@ def get_json(url): try: - return json.load(urllib2.urlopen(url)) + from urllib2 import urlopen, Request + env_var = 'ARROW_GITHUB_API_TOKEN' + + if env_var in os.environ: + token = os.environ[env_var] + request = Request(url) + request.add_header('Authorization', 'token %s' % token) + response = urlopen(request) + else: + response = urlopen(url) + return json.load(response) except urllib2.HTTPError as e: print "Unable to fetch URL, exiting: %s" % url sys.exit(-1) From 27edd25eb4f714ff1cc2770ed5a1fbc695eb8a08 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Thu, 16 Jun 2016 10:58:18 -0700 Subject: [PATCH 089/210] ARROW-210: Cleanup of the string related types in C++ code base One thing that is worth discussing is if char types should also be removed (if they aren't i'll add the missing unit tests). I also moved CharType to type.h which seems more consistent with existing code. I can clean it up either way in a follow-up review if we decide with want to push types into their corresponding Array headers. Author: Micah Kornfield Closes #85 from emkornfield/emk_string_types_wip and squashes the following commits: 4414816 [Micah Kornfield] remove CHAR from parquet 6f0634c [Micah Kornfield] remove char type and add dcheck 58bfcc9 [Micah Kornfield] fix style of char_type_ 1e0152d [Micah Kornfield] wip --- cpp/src/arrow/parquet/schema.cc | 5 - cpp/src/arrow/type.cc | 17 ++- cpp/src/arrow/type.h | 55 ++++++--- cpp/src/arrow/types/construct.cc | 2 - cpp/src/arrow/types/decimal.h | 1 - cpp/src/arrow/types/list.h | 8 +- cpp/src/arrow/types/string-test.cc | 188 ++++++++++++++++++++++++----- cpp/src/arrow/types/string.cc | 40 ++++-- cpp/src/arrow/types/string.h | 104 +++++++++------- cpp/src/arrow/util/macros.h | 2 +- 10 files changed, 307 insertions(+), 115 deletions(-) diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index fd758940c9f..c7979db3494 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -250,11 +250,6 @@ Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { case Type::DOUBLE: type = ParquetType::DOUBLE; break; - case Type::CHAR: - type = ParquetType::FIXED_LEN_BYTE_ARRAY; - logical_type = LogicalType::UTF8; - length = static_cast(field->type.get())->size; - break; case Type::STRING: type = ParquetType::BYTE_ARRAY; logical_type = LogicalType::UTF8; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4e686d9cf4a..4fd50b7c193 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -31,7 +31,18 @@ std::string Field::ToString() const { DataType::~DataType() {} -StringType::StringType() : DataType(Type::STRING) {} +bool DataType::Equals(const DataType* other) const { + bool equals = other && ((this == other) || + ((this->type == other->type) && + ((this->num_children() == other->num_children())))); + if (equals) { + for (int i = 0; i < num_children(); ++i) { + // TODO(emkornfield) limit recursion + if (!children_[i]->Equals(other->children_[i])) { return false; } + } + } + return equals; +} std::string StringType::ToString() 
const { std::string result(name()); @@ -44,6 +55,10 @@ std::string ListType::ToString() const { return s.str(); } +std::string BinaryType::ToString() const { + return std::string(name()); +} + std::string StructType::ToString() const { std::stringstream s; s << "struct<"; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f366645cd5c..8fb41211ba9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,6 +23,8 @@ #include #include +#include "arrow/util/macros.h" + namespace arrow { // Data types in this library are all *logical*. They can be expressed as @@ -53,15 +55,9 @@ struct Type { // 8-byte floating point value DOUBLE = 11, - // CHAR(N): fixed-length UTF8 string with length N - CHAR = 12, - // UTF8 variable-length string as List STRING = 13, - // VARCHAR(N): Null-terminated string type embedded in a CHAR(N + 1) - VARCHAR = 14, - // Variable-length bytes (no guarantee of UTF8-ness) BINARY = 15, @@ -114,12 +110,15 @@ struct DataType { virtual ~DataType(); - bool Equals(const DataType* other) { - // Call with a pointer so more friendly to subclasses - return other && ((this == other) || (this->type == other->type)); - } + // Return whether the types are equal + // + // Types that are logically convertible from one to another (e.g. List + // and Binary) are NOT equal. + virtual bool Equals(const DataType* other) const; - bool Equals(const std::shared_ptr& other) { return Equals(other.get()); } + bool Equals(const std::shared_ptr& other) const { + return Equals(other.get()); + } const std::shared_ptr& child(int i) const { return children_[i]; } @@ -236,9 +235,8 @@ struct DoubleType : public PrimitiveType { struct ListType : public DataType { // List can contain any other logical value type - explicit ListType(const std::shared_ptr& value_type) : DataType(Type::LIST) { - children_ = {std::make_shared("item", value_type)}; - } + explicit ListType(const std::shared_ptr& value_type) + : ListType(value_type, Type::LIST) {} explicit ListType(const std::shared_ptr& value_field) : DataType(Type::LIST) { children_ = {value_field}; @@ -251,15 +249,38 @@ struct ListType : public DataType { static char const* name() { return "list"; } std::string ToString() const override; + + protected: + // Constructor for classes that are implemented as List Arrays. + ListType(const std::shared_ptr& value_type, Type::type logical_type) + : DataType(logical_type) { + // TODO ARROW-187 this can technically fail, make a constructor method ? + children_ = {std::make_shared("item", value_type)}; + } }; -// String is a logical type consisting of a physical list of 1-byte values -struct StringType : public DataType { - StringType(); +// BinaryType represents lists of 1-byte values. +struct BinaryType : public ListType { + BinaryType() : BinaryType(Type::BINARY) {} + static char const* name() { return "binary"; } + std::string ToString() const override; + + protected: + // Allow subclasses to change the logical type.
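+ // (StringType, defined next, relies on this: it keeps the list-of-UInt8 + // representation and only swaps the logical type to Type::STRING.)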
+ explicit BinaryType(Type::type logical_type) + : ListType(std::shared_ptr(new UInt8Type()), logical_type) {} +}; + +// UTF8 encoded strings +struct StringType : public BinaryType { + StringType() : BinaryType(Type::STRING) {} static char const* name() { return "string"; } std::string ToString() const override; + + protected: + explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; struct StructType : public DataType { diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index bcb0ec49090..2d913a73748 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -127,10 +127,8 @@ Status MakeListArray(const TypePtr& type, int32_t length, case Type::LIST: out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); break; - case Type::CHAR: case Type::DECIMAL_TEXT: case Type::STRING: - case Type::VARCHAR: out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); break; default: diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 1be489d4f51..598df3ef70d 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -29,7 +29,6 @@ struct DecimalType : public DataType { : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} int precision; int scale; - static char const* name() { return "decimal"; } std::string ToString() const override; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 0a3941633eb..2f6f85d66ca 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -66,8 +66,8 @@ class ListArray : public Array { int32_t offset(int i) const { return offsets_[i]; } // Neither of these functions will perform bounds checking - int32_t value_offset(int i) { return offsets_[i]; } - int32_t value_length(int i) { return offsets_[i + 1] - offsets_[i]; } + int32_t value_offset(int i) const { return offsets_[i]; } + int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } bool EqualsExact(const ListArray& other) const; bool Equals(const std::shared_ptr& arr) const override; @@ -92,9 +92,9 @@ // a sequence of offsets and null values. // // A note on types. Per arrow/type.h all types in the c++ implementation are -// logical so even though this class always builds an Array of lists, this can +// logical so even though this class always builds a list array, this can // represent multiple different logical types. If no logical type is provided -// at construction time, the class defaults to List where t is take from the +// at construction time, the class defaults to List, where T is taken from the // value_builder/values that the object is constructed with.
class ListBuilder : public ArrayBuilder { public: diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index ee4307c4d16..a141fc11321 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -34,32 +34,14 @@ namespace arrow { class Buffer; -TEST(TypesTest, TestCharType) { - CharType t1(5); - - ASSERT_EQ(t1.type, Type::CHAR); - ASSERT_EQ(t1.size, 5); - - ASSERT_EQ(t1.ToString(), std::string("char(5)")); - - // Test copy constructor - CharType t2 = t1; - ASSERT_EQ(t2.type, Type::CHAR); - ASSERT_EQ(t2.size, 5); -} - -TEST(TypesTest, TestVarcharType) { - VarcharType t1(5); - - ASSERT_EQ(t1.type, Type::VARCHAR); - ASSERT_EQ(t1.size, 5); - - ASSERT_EQ(t1.ToString(), std::string("varchar(5)")); - - // Test copy constructor - VarcharType t2 = t1; - ASSERT_EQ(t2.type, Type::VARCHAR); - ASSERT_EQ(t2.size, 5); +TEST(TypesTest, BinaryType) { + BinaryType t1; + BinaryType e1; + StringType t2; + EXPECT_TRUE(t1.Equals(&e1)); + EXPECT_FALSE(t1.Equals(&t2)); + ASSERT_EQ(t1.type, Type::BINARY); + ASSERT_EQ(t1.ToString(), std::string("binary")); } TEST(TypesTest, TestStringType) { @@ -119,6 +101,7 @@ class TestStringContainer : public ::testing::Test { TEST_F(TestStringContainer, TestArrayBasics) { ASSERT_EQ(length_, strings_->length()); ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); } TEST_F(TestStringContainer, TestType) { @@ -163,7 +146,10 @@ class TestStringBuilder : public TestBuilder { builder_.reset(new StringBuilder(pool_, type_)); } - void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + void Done() { + result_ = std::dynamic_pointer_cast(builder_->Finish()); + result_->Validate(); + } protected: TypePtr type_; @@ -216,4 +202,152 @@ TEST_F(TestStringBuilder, TestZeroLength) { Done(); } +// Binary container type +// TODO(emkornfield) there should be some way to refactor these to avoid code duplicating +// with String +class TestBinaryContainer : public ::testing::Test { + public: + void SetUp() { + chars_ = {'a', 'b', 'b', 'c', 'c', 'c'}; + offsets_ = {0, 1, 1, 1, 3, 6}; + valid_bytes_ = {1, 1, 0, 1, 1}; + expected_ = {"a", "", "", "bb", "ccc"}; + + MakeArray(); + } + + void MakeArray() { + length_ = offsets_.size() - 1; + int nchars = chars_.size(); + + value_buf_ = test::to_buffer(chars_); + values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); + + offsets_buf_ = test::to_buffer(offsets_); + + null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); + null_count_ = test::null_count(valid_bytes_); + + strings_ = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); + } + + protected: + std::vector offsets_; + std::vector chars_; + std::vector valid_bytes_; + + std::vector expected_; + + std::shared_ptr value_buf_; + std::shared_ptr offsets_buf_; + std::shared_ptr null_bitmap_; + + int null_count_; + int length_; + + ArrayPtr values_; + std::shared_ptr strings_; +}; + +TEST_F(TestBinaryContainer, TestArrayBasics) { + ASSERT_EQ(length_, strings_->length()); + ASSERT_EQ(1, strings_->null_count()); + ASSERT_OK(strings_->Validate()); +} + +TEST_F(TestBinaryContainer, TestType) { + TypePtr type = strings_->type(); + + ASSERT_EQ(Type::BINARY, type->type); + ASSERT_EQ(Type::BINARY, strings_->type_enum()); +} + +TEST_F(TestBinaryContainer, TestListFunctions) { + int pos = 0; + for (size_t i = 0; i < expected_.size(); ++i) { + ASSERT_EQ(pos, strings_->value_offset(i)); + ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); + pos += expected_[i].size(); + } +} + 
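+// A sketch of the fixture's layout (for orientation, not an assertion): slot i
+// spans bytes [offsets_[i], offsets_[i + 1]) of chars_, so offsets_ = {0, 1, 1,
+// 1, 3, 6} over "abbccc" decodes to "a", "", "" (null, since valid_bytes_[2] == 0),
+// "bb", "ccc" -- exactly the expected_ vector above.
+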
+TEST_F(TestBinaryContainer, TestDestructor) { + auto arr = std::make_shared( + length_, offsets_buf_, values_, null_count_, null_bitmap_); +} + +TEST_F(TestBinaryContainer, TestGetValue) { + for (size_t i = 0; i < expected_.size(); ++i) { + if (valid_bytes_[i] == 0) { + ASSERT_TRUE(strings_->IsNull(i)); + } else { + int32_t len = -1; + const uint8_t* bytes = strings_->GetValue(i, &len); + ASSERT_EQ(0, std::memcmp(expected_[i].data(), bytes, len)); + } + } +} + +class TestBinaryBuilder : public TestBuilder { + public: + void SetUp() { + TestBuilder::SetUp(); + type_ = TypePtr(new BinaryType()); + builder_.reset(new BinaryBuilder(pool_, type_)); + } + + void Done() { + result_ = std::dynamic_pointer_cast(builder_->Finish()); + result_->Validate(); + } + + protected: + TypePtr type_; + + std::unique_ptr builder_; + std::shared_ptr result_; +}; + +TEST_F(TestBinaryBuilder, TestScalarAppend) { + std::vector strings = {"", "bb", "a", "", "ccc"}; + std::vector is_null = {0, 0, 0, 1, 0}; + + int N = strings.size(); + int reps = 1000; + + for (int j = 0; j < reps; ++j) { + for (int i = 0; i < N; ++i) { + if (is_null[i]) { + builder_->AppendNull(); + } else { + builder_->Append( + reinterpret_cast(strings[i].data()), strings[i].size()); + } + } + } + Done(); + ASSERT_OK(result_->Validate()); + ASSERT_EQ(reps * N, result_->length()); + ASSERT_EQ(reps, result_->null_count()); + ASSERT_EQ(reps * 6, result_->values()->length()); + + int32_t length; + for (int i = 0; i < N * reps; ++i) { + if (is_null[i % N]) { + ASSERT_TRUE(result_->IsNull(i)); + } else { + ASSERT_FALSE(result_->IsNull(i)); + const uint8_t* vals = result_->GetValue(i, &length); + ASSERT_EQ(strings[i % N].size(), length); + ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); + } + } +} + +TEST_F(TestBinaryBuilder, TestZeroLength) { + // All buffers are null + Done(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 29d97d03947..da02c7d1d8a 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -24,25 +24,43 @@ namespace arrow { +const std::shared_ptr BINARY(new BinaryType()); const std::shared_ptr STRING(new StringType()); -StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, +BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, const std::shared_ptr& null_bitmap) - : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {} + +BinaryArray::BinaryArray(const TypePtr& type, int32_t length, + const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) + : ListArray(type, length, offsets, values, null_count, null_bitmap), + bytes_(std::dynamic_pointer_cast(values).get()), + raw_bytes_(bytes_->raw_data()) { + // Check in case the dynamic cast fails. 
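+ // (std::dynamic_pointer_cast yields nullptr when values is not actually a + // UInt8Array; the DCHECK below catches that mismatch in debug builds.)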
+ DCHECK(bytes_); +} -std::string CharType::ToString() const { - std::stringstream s; - s << "char(" << size << ")"; - return s.str(); +Status BinaryArray::Validate() const { + if (values()->null_count() > 0) { + std::stringstream ss; + ss << type()->ToString() << " cannot have null values in the value array"; + return Status::Invalid(ss.str()); + } + return ListArray::Validate(); } -std::string VarcharType::ToString() const { - std::stringstream s; - s << "varchar(" << size << ")"; - return s.str(); +StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& null_bitmap) + : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {} + +Status StringArray::Validate() const { + // TODO(emkornfield) Validate proper UTF8 code points? + return BinaryArray::Validate(); } -TypePtr StringBuilder::value_type_ = TypePtr(new UInt8Type()); +TypePtr BinaryBuilder::value_type_ = TypePtr(new UInt8Type()); } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index d2d3c5b6b5a..b3c00d298b3 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -34,87 +34,99 @@ namespace arrow { class Buffer; class MemoryPool; -struct CharType : public DataType { - int size; - - explicit CharType(int size) : DataType(Type::CHAR), size(size) {} - - CharType(const CharType& other) : CharType(other.size) {} - - virtual std::string ToString() const; -}; - -// Variable-length, null-terminated strings, up to a certain length -struct VarcharType : public DataType { - int size; - - explicit VarcharType(int size) : DataType(Type::VARCHAR), size(size) {} - VarcharType(const VarcharType& other) : VarcharType(other.size) {} - - virtual std::string ToString() const; -}; - -// TODO(wesm): add a BinaryArray layer in between -class StringArray : public ListArray { +class BinaryArray : public ListArray { public: - StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + BinaryArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : ListArray(type, length, offsets, values, null_count, null_bitmap) { - // For convenience - bytes_ = static_cast(values.get()); - raw_bytes_ = bytes_->raw_data(); - } - - StringArray(int32_t length, const std::shared_ptr& offsets, + const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows sub-classes/builders to propagate their logical type up the + // class hierarchy.
+ BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - // Compute the pointer t + // Return the pointer to the given element's bytes + // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy + // pointer + offset const uint8_t* GetValue(int i, int32_t* out_length) const { - int32_t pos = offsets_[i]; + DCHECK(out_length); + const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; return raw_bytes_ + pos; } + Status Validate() const override; + + private: + UInt8Array* bytes_; + const uint8_t* raw_bytes_; +}; + +class StringArray : public BinaryArray { + public: + StringArray(int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows overriding the logical type, so subclasses can + // propagate their logical type up the class hierarchy. + StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, + const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& null_bitmap = nullptr) + : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {} + // Construct a std::string + // TODO: std::bad_alloc possibility std::string GetString(int i) const { int32_t nchars; const uint8_t* str = GetValue(i, &nchars); return std::string(reinterpret_cast(str), nchars); } - private: - UInt8Array* bytes_; - const uint8_t* raw_bytes_; + Status Validate() const override; }; -// String builder -class StringBuilder : public ListBuilder { +// Binary builder +class BinaryBuilder : public ListBuilder { public: - explicit StringBuilder(MemoryPool* pool, const TypePtr& type) + explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type) : ListBuilder(pool, std::make_shared(pool, value_type_), type) { byte_builder_ = static_cast(value_builder_.get()); } - Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } - - Status Append(const char* value, int32_t length) { + Status Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(ListBuilder::Append()); - return byte_builder_->Append(reinterpret_cast(value), length); + return byte_builder_->Append(value, length); } - Status Append(const std::vector& values, uint8_t* null_bytes); std::shared_ptr Finish() override { - return ListBuilder::Transfer(); + return ListBuilder::Transfer(); } protected: UInt8Builder* byte_builder_; - static TypePtr value_type_; }; +// String builder +class StringBuilder : public BinaryBuilder { + public: + explicit StringBuilder(MemoryPool* pool, const TypePtr& type) + : BinaryBuilder(pool, type) {} + + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } + + Status Append(const char* value, int32_t length) { + return BinaryBuilder::Append(reinterpret_cast(value), length); + } + + Status Append(const std::vector& values, uint8_t* null_bytes); + + std::shared_ptr Finish() override { + return ListBuilder::Transfer(); + } +}; + } // namespace arrow #endif // ARROW_TYPES_STRING_H diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 51e605ee50a..69ecda16ceb 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -21,6 +21,6 @@ // From Google gutil #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete + TypeName& operator=(const
TypeName&) = delete #endif // ARROW_UTIL_MACROS_H From a3e3849cde60f611ea47271f510a96c2f36606a7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 16 Jun 2016 22:07:30 -0700 Subject: [PATCH 090/210] ARROW-219: Preserve CMAKE_CXX_FLAGS, fix compiler warnings Some undesired compiler warnings had crept into our build; future warnings should fail the build now. Author: Wes McKinney Closes #92 from wesm/ARROW-219 and squashes the following commits: fd68a74 [Wes McKinney] Buglet 6507351 [Wes McKinney] Fix clang warning 0f9e3ca [Wes McKinney] Preserve CMAKE_CXX_FLAGS, fix compiler warnings --- cpp/CMakeLists.txt | 13 +++++++------ cpp/src/arrow/parquet/test-util.h | 2 +- cpp/src/arrow/parquet/writer.cc | 8 +++++++- cpp/src/arrow/parquet/writer.h | 2 ++ cpp/src/arrow/util/macros.h | 2 ++ 5 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a3fb01076d4..bdf757238cc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -139,15 +139,15 @@ string (TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE) # Set compile flags based on the build type. message("Configured for ${CMAKE_BUILD_TYPE} build (set with cmake -DCMAKE_BUILD_TYPE={release,debug,...})") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") - set(CMAKE_CXX_FLAGS ${CXX_FLAGS_DEBUG}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_DEBUG}") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") - set(CMAKE_CXX_FLAGS ${CXX_FLAGS_FASTDEBUG}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_FASTDEBUG}") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") - set(CMAKE_CXX_FLAGS ${CXX_FLAGS_RELEASE}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_RELEASE}") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_GEN") - set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_GEN}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_GEN}") elseif ("${CMAKE_BUILD_TYPE}" STREQUAL "PROFILE_BUILD") - set(CMAKE_CXX_FLAGS ${CXX_FLAGS_PROFILE_BUILD}) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CXX_FLAGS_PROFILE_BUILD}") else() message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") endif () @@ -165,6 +165,7 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") # http://petereisentraut.blogspot.com/2011/05/ccache-and-clang.html # http://petereisentraut.blogspot.com/2011/09/ccache-and-clang-part-2.html set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Qunused-arguments") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_OPTIONS}") endif() # Sanity check linking option. @@ -559,7 +560,7 @@ if (${CLANG_TIDY_FOUND}) add_custom_target(clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 1 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc | sed -e '/_generated/g'`) # runs clang-tidy and exits with a non-zero exit code if any errors are found. - add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json + add_custom_target(check-clang-tidy ${BUILD_SUPPORT_DIR}/run-clang-tidy.sh ${CLANG_TIDY_BIN} ${CMAKE_BINARY_DIR}/compile_commands.json 0 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc |grep -v -F -f ${CMAKE_CURRENT_SOURCE_DIR}/src/.clang-tidy-ignore | sed -e '/_generated/g'`) endif() diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h index 1496082d5c6..cc8723bf6ec 100644 --- a/cpp/src/arrow/parquet/test-util.h +++ b/cpp/src/arrow/parquet/test-util.h @@ -67,7 +67,7 @@ std::shared_ptr
MakeSimpleTable( template void ExpectArray(T* expected, Array* result) { PrimitiveArray* p_array = static_cast(result); - for (size_t i = 0; i < result->length(); i++) { + for (int i = 0; i < result->length(); i++) { EXPECT_EQ(expected[i], reinterpret_cast(p_array->data()->data())[i]); } } diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 1223901d550..4005e3b2b0c 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -50,6 +50,8 @@ class FileWriter::Impl { virtual ~Impl() {} private: + friend class FileWriter; + MemoryPool* pool_; PoolBuffer data_buffer_; PoolBuffer def_levels_buffer_; @@ -94,7 +96,7 @@ Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); int buffer_idx = 0; - for (size_t i = 0; i < length; i++) { + for (int i = 0; i < length; i++) { if (data->IsNull(offset + i)) { def_levels_ptr[i] = 0; } else { @@ -156,6 +158,10 @@ Status FileWriter::Close() { return impl_->Close(); } +MemoryPool* FileWriter::memory_pool() const { + return impl_->pool_; +} + FileWriter::~FileWriter() {} Status WriteFlatTable(const Table* table, MemoryPool* pool, diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 83e799f7ed1..93693f51184 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -49,6 +49,8 @@ class FileWriter { virtual ~FileWriter(); + MemoryPool* memory_pool() const; + private: class Impl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/util/macros.h b/cpp/src/arrow/util/macros.h index 69ecda16ceb..e2bb355115b 100644 --- a/cpp/src/arrow/util/macros.h +++ b/cpp/src/arrow/util/macros.h @@ -19,8 +19,10 @@ #define ARROW_UTIL_MACROS_H // From Google gutil +#ifndef DISALLOW_COPY_AND_ASSIGN #define DISALLOW_COPY_AND_ASSIGN(TypeName) \ TypeName(const TypeName&) = delete; \ TypeName& operator=(const TypeName&) = delete +#endif #endif // ARROW_UTIL_MACROS_H From f7ade7bfeaa7e0d7fb3dd9d5a93e29a413cc142a Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 21 Jun 2016 15:11:26 -0700 Subject: [PATCH 091/210] ARROW-223: Do not link against libpython Author: Uwe L. Korn Closes #95 from xhochy/arrow-223 and squashes the following commits: 4fdf1e7 [Uwe L. Korn] ARROW-223: Do not link against libpython Change-Id: I1238a48aaf94ab175b367551f74c335c6455d78a --- python/cmake_modules/FindPythonLibsNew.cmake | 6 +++++- python/cmake_modules/UseCython.cmake | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cmake_modules/FindPythonLibsNew.cmake b/python/cmake_modules/FindPythonLibsNew.cmake index 0f2295aa43b..5cb65c9f1a4 100644 --- a/python/cmake_modules/FindPythonLibsNew.cmake +++ b/python/cmake_modules/FindPythonLibsNew.cmake @@ -224,7 +224,11 @@ FUNCTION(PYTHON_ADD_MODULE _NAME ) SET_TARGET_PROPERTIES(${_NAME} PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") ELSE() - TARGET_LINK_LIBRARIES(${_NAME} ${PYTHON_LIBRARIES}) + # In general, we should not link against libpython as we do not embed + # the Python interpreter. The Python binary itself can then define where + # the symbols should be loaded from.
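+ # ('-undefined dynamic_lookup' below asks the linker to leave Python symbols + # unresolved at build time; they are bound by whichever interpreter loads + # the module.)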
+ SET_TARGET_PROPERTIES(${_NAME} PROPERTIES LINK_FLAGS + "-Wl,-undefined,dynamic_lookup") ENDIF() IF(PYTHON_MODULE_${_NAME}_BUILD_SHARED) SET_TARGET_PROPERTIES(${_NAME} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") diff --git a/python/cmake_modules/UseCython.cmake b/python/cmake_modules/UseCython.cmake index 3b1c201edff..cee6066d31d 100644 --- a/python/cmake_modules/UseCython.cmake +++ b/python/cmake_modules/UseCython.cmake @@ -163,7 +163,6 @@ function( cython_add_module _name pyx_target_name generated_files) include_directories( ${PYTHON_INCLUDE_DIRS} ) python_add_module( ${_name} ${_generated_files} ${other_module_sources} ) add_dependencies( ${_name} ${pyx_target_name}) - target_link_libraries( ${_name} ${PYTHON_LIBRARIES} ) endfunction() include( CMakeParseArguments ) From ef90830290491294d2fccfc5dcb16d3c0f96a70a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 24 Jun 2016 16:41:08 -0700 Subject: [PATCH 092/210] ARROW-222: Prototyping an IO interface for Arrow, with initial HDFS target - Switch Travis CI back to Ubuntu trusty (old Boost in precise has issues with C++11) - Adapt SFrame libhdfs shim for arrow - Create C++ public API within arrow:io to libhdfs - Implement and test many functions in libhdfs - Start Cython wrapper interface to arrow_io. Begin Python file-like interface, unit tests - Add thirdparty hdfs.h so builds are possible without a local Hadoop distro (e.g. in Travis CI). Change-Id: I4a46e50f6c1c22787baa3749d8a542216341e630 --- .travis.yml | 5 +- NOTICE.txt | 9 + ci/travis_before_script_cpp.sh | 15 +- cpp/CMakeLists.txt | 60 +- cpp/doc/HDFS.md | 39 + cpp/src/arrow/io/CMakeLists.txt | 97 ++ cpp/src/arrow/io/hdfs-io-test.cc | 315 +++++++ cpp/src/arrow/io/hdfs.cc | 458 ++++++++++ cpp/src/arrow/io/hdfs.h | 213 +++++ cpp/src/arrow/io/interfaces.h | 71 ++ cpp/src/arrow/io/libhdfs_shim.cc | 544 ++++++++++++ cpp/src/arrow/parquet/parquet-io-test.cc | 4 +- cpp/thirdparty/hadoop/include/hdfs.h | 1024 ++++++++++++++++++++++ dev/merge_arrow_pr.py | 5 +- python/CMakeLists.txt | 6 +- python/cmake_modules/FindArrow.cmake | 17 +- python/conda.recipe/meta.yaml | 1 + python/pyarrow/error.pxd | 4 +- python/pyarrow/error.pyx | 14 +- python/pyarrow/includes/common.pxd | 18 + python/pyarrow/includes/libarrow.pxd | 19 - python/pyarrow/includes/libarrow_io.pxd | 93 ++ python/pyarrow/io.pyx | 504 +++++++++++ python/pyarrow/tests/test_array.py | 47 +- python/pyarrow/tests/test_io.py | 126 +++ python/setup.py | 9 +- 26 files changed, 3656 insertions(+), 61 deletions(-) create mode 100644 NOTICE.txt create mode 100644 cpp/doc/HDFS.md create mode 100644 cpp/src/arrow/io/CMakeLists.txt create mode 100644 cpp/src/arrow/io/hdfs-io-test.cc create mode 100644 cpp/src/arrow/io/hdfs.cc create mode 100644 cpp/src/arrow/io/hdfs.h create mode 100644 cpp/src/arrow/io/interfaces.h create mode 100644 cpp/src/arrow/io/libhdfs_shim.cc create mode 100644 cpp/thirdparty/hadoop/include/hdfs.h create mode 100644 python/pyarrow/includes/libarrow_io.pxd create mode 100644 python/pyarrow/io.pyx create mode 100644 python/pyarrow/tests/test_io.py diff --git a/.travis.yml b/.travis.yml index ac2b0d457cb..97229b1ceb3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ sudo: required -dist: precise +dist: trusty addons: apt: sources: @@ -12,6 +12,9 @@ addons: - ccache - cmake - valgrind + - libboost-dev + - libboost-filesystem-dev + - libboost-system-dev matrix: fast_finish: true diff --git a/NOTICE.txt b/NOTICE.txt new file mode 100644 index 00000000000..0310c897cd7 --- /dev/null +++ b/NOTICE.txt @@ -0,0 +1,9 
@@ +Apache Arrow +Copyright 2016 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +This product includes software from the SFrame project (BSD, 3-clause). +* Copyright (C) 2015 Dato, Inc. +* Copyright (c) 2009 Carnegie Mellon University. diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 9060cc9b5ef..08551f3b009 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -23,12 +23,21 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" +CMAKE_COMMON_FLAGS="\ +-DARROW_BUILD_BENCHMARKS=ON \ +-DARROW_PARQUET=ON \ +-DARROW_HDFS=on \ +-DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then - cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR + cmake -DARROW_TEST_MEMCHECK=on \ + $CMAKE_COMMON_FLAGS \ + -DCMAKE_CXX_FLAGS="-Werror" \ + $CPP_DIR else - cmake $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR + cmake $CMAKE_COMMON_FLAGS \ + -DCMAKE_CXX_FLAGS="-Werror" \ + $CPP_DIR fi make -j4 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index bdf757238cc..18b47599b93 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -62,6 +62,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow IPC extensions" ON) + option(ARROW_HDFS + "Build the Arrow IO extensions for the Hadoop file system" + OFF) + option(ARROW_SSE3 "Build Arrow with SSE3" ON) @@ -454,6 +458,47 @@ if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) endif() +# ---------------------------------------------------------------------- +# Add Boost dependencies (code adapted from Apache Kudu (incubating)) + +# find boost headers and libs +set(Boost_DEBUG TRUE) +set(Boost_USE_MULTITHREADED ON) +set(Boost_USE_STATIC_LIBS ON) +find_package(Boost COMPONENTS system filesystem REQUIRED) +include_directories(SYSTEM ${Boost_INCLUDE_DIRS}) +set(BOOST_STATIC_LIBS ${Boost_LIBRARIES}) +list(LENGTH BOOST_STATIC_LIBS BOOST_STATIC_LIBS_LEN) + +# Find Boost shared libraries. +set(Boost_USE_STATIC_LIBS OFF) +find_package(Boost COMPONENTS system filesystem REQUIRED) +set(BOOST_SHARED_LIBS ${Boost_LIBRARIES}) +list(LENGTH BOOST_SHARED_LIBS BOOST_SHARED_LIBS_LEN) +list(SORT BOOST_SHARED_LIBS) + +message(STATUS "Boost include dir: " ${Boost_INCLUDE_DIRS}) +message(STATUS "Boost libraries: " ${Boost_LIBRARIES}) + +math(EXPR LAST_IDX "${BOOST_STATIC_LIBS_LEN} - 1") +foreach(IDX RANGE ${LAST_IDX}) + list(GET BOOST_STATIC_LIBS ${IDX} BOOST_STATIC_LIB) + list(GET BOOST_SHARED_LIBS ${IDX} BOOST_SHARED_LIB) + + # Remove the prefix/suffix from the library name. + # + # e.g. libboost_system-mt --> boost_system + get_filename_component(LIB_NAME ${BOOST_STATIC_LIB} NAME_WE) + string(REGEX REPLACE "lib([^-]*)(-mt)?" 
"\\1" LIB_NAME_NO_PREFIX_SUFFIX ${LIB_NAME}) + ADD_THIRDPARTY_LIB(${LIB_NAME_NO_PREFIX_SUFFIX} + STATIC_LIB "${BOOST_STATIC_LIB}" + SHARED_LIB "${BOOST_SHARED_LIB}") + list(APPEND ARROW_BOOST_LIBS ${LIB_NAME_NO_PREFIX_SUFFIX}) +endforeach() +include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + +# ---------------------------------------------------------------------- +# Enable / disable tests and benchmarks if(ARROW_BUILD_TESTS) add_custom_target(unittest ctest -L unittest) @@ -529,12 +574,24 @@ endif (UNIX) # "make lint" target ############################################################ if (UNIX) + + file(GLOB_RECURSE LINT_FILES + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc" + ) + + FOREACH(item ${LINT_FILES}) + IF(NOT (item MATCHES "_generated.h")) + LIST(APPEND FILTERED_LINT_FILES ${item}) + ENDIF() + ENDFOREACH(item ${LINT_FILES}) + # Full lint add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py --verbose=2 --linelength=90 --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references - `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g'`) + ${FILTERED_LINT_FILES}) endif (UNIX) @@ -624,6 +681,7 @@ set_target_properties(arrow target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) add_subdirectory(src/arrow) +add_subdirectory(src/arrow/io) add_subdirectory(src/arrow/util) add_subdirectory(src/arrow/types) diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md new file mode 100644 index 00000000000..e0d5dfda21d --- /dev/null +++ b/cpp/doc/HDFS.md @@ -0,0 +1,39 @@ +## Using Arrow's HDFS (Apache Hadoop Distributed File System) interface + +### Build requirements + +To build the integration, pass the following option to CMake + +```shell +-DARROW_HDFS=on +``` + +For convenience, we have bundled `hdfs.h` for libhdfs from Apache Hadoop in +Arrow's thirdparty. If you wish to build against the `hdfs.h` in your installed +Hadoop distribution, set the `$HADOOP_HOME` environment variable. + +### Runtime requirements + +By default, the HDFS client C++ class in `libarrow_io` uses the libhdfs JNI +interface to the Java Hadoop client. This library is loaded **at runtime** +(rather than at link / library load time, since the library may not be in your +LD_LIBRARY_PATH), and relies on some environment variables. + +* `HADOOP_HOME`: the root of your installed Hadoop distribution. Check in the + `lib/native` directory to look for `libhdfs.so` if you have any questions + about which directory you're after. +* `JAVA_HOME`: the location of your Java SDK installation +* `CLASSPATH`: must contain the Hadoop jars. You can set these using: + +```shell +export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` +``` + +#### Setting $JAVA_HOME automatically on OS X + +The installed location of Java on OS X can vary, however the following snippet +will set it automatically for you: + +```shell +export JAVA_HOME=$(/usr/libexec/java_home) +``` \ No newline at end of file diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt new file mode 100644 index 00000000000..33b654f8190 --- /dev/null +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# ---------------------------------------------------------------------- +# arrow_io : Arrow IO interfaces + +set(ARROW_IO_LINK_LIBS + arrow +) + +set(ARROW_IO_PRIVATE_LINK_LIBS + boost_system + boost_filesystem +) + +set(ARROW_IO_TEST_LINK_LIBS + arrow_io + ${ARROW_IO_PRIVATE_LINK_LIBS}) + +set(ARROW_IO_SRCS +) + +if(ARROW_HDFS) + if(NOT THIRDPARTY_DIR) + message(FATAL_ERROR "THIRDPARTY_DIR not set") + endif() + + if (DEFINED ENV{HADOOP_HOME}) + set(HADOOP_HOME $ENV{HADOOP_HOME}) + else() + set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop") + endif() + + set(HDFS_H_PATH "${HADOOP_HOME}/include/hdfs.h") + if (NOT EXISTS ${HDFS_H_PATH}) + message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}") + endif() + message(STATUS "Found hdfs.h at: " ${HDFS_H_PATH}) + message(STATUS "Building libhdfs shim component") + + include_directories(SYSTEM "${HADOOP_HOME}/include") + + set(ARROW_HDFS_SRCS + hdfs.cc + libhdfs_shim.cc) + + set_property(SOURCE ${ARROW_HDFS_SRCS} + APPEND_STRING PROPERTY + COMPILE_FLAGS "-DHAS_HADOOP") + + set(ARROW_IO_SRCS + ${ARROW_HDFS_SRCS} + ${ARROW_IO_SRCS}) + + ADD_ARROW_TEST(hdfs-io-test) + ARROW_TEST_LINK_LIBRARIES(hdfs-io-test + ${ARROW_IO_TEST_LINK_LIBS}) +endif() + +add_library(arrow_io SHARED + ${ARROW_IO_SRCS} +) +target_link_libraries(arrow_io LINK_PUBLIC ${ARROW_IO_LINK_LIBS}) +target_link_libraries(arrow_io LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS}) + +SET_TARGET_PROPERTIES(arrow_io PROPERTIES LINKER_LANGUAGE CXX) + +if (APPLE) + set_target_properties(arrow_io + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + +# Headers: top level +install(FILES + hdfs.h + interfaces.h + DESTINATION include/arrow/io) + +install(TARGETS arrow_io + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/cpp/src/arrow/io/hdfs-io-test.cc b/cpp/src/arrow/io/hdfs-io-test.cc new file mode 100644 index 00000000000..11d67aeba20 --- /dev/null +++ b/cpp/src/arrow/io/hdfs-io-test.cc @@ -0,0 +1,315 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
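+ +// These tests run against a live cluster described by the ARROW_HDFS_TEST_HOST, +// ARROW_HDFS_TEST_PORT, and ARROW_HDFS_TEST_USER environment variables (see +// SetUpTestCase below), and skip gracefully when libhdfs cannot be loaded.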
+ +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include // NOLINT + +#include "arrow/io/hdfs.h" +#include "arrow/test-util.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace io { + +std::vector RandomData(int64_t size) { + std::vector buffer(size); + test::random_bytes(size, 0, buffer.data()); + return buffer; +} + +class TestHdfsClient : public ::testing::Test { + public: + Status MakeScratchDir() { + if (client_->Exists(scratch_dir_)) { + RETURN_NOT_OK((client_->Delete(scratch_dir_, true))); + } + return client_->CreateDirectory(scratch_dir_); + } + + Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, + bool append = false, int buffer_size = 0, int replication = 0, + int default_block_size = 0) { + std::shared_ptr file; + RETURN_NOT_OK(client_->OpenWriteable( + path, append, buffer_size, replication, default_block_size, &file)); + + RETURN_NOT_OK(file->Write(buffer, size)); + RETURN_NOT_OK(file->Close()); + + return Status::OK(); + } + + std::string ScratchPath(const std::string& name) { + std::stringstream ss; + ss << scratch_dir_ << "/" << name; + return ss.str(); + } + + std::string HdfsAbsPath(const std::string& relpath) { + std::stringstream ss; + ss << "hdfs://" << conf_.host << ":" << conf_.port << relpath; + return ss.str(); + } + + protected: + // Set up shared state between unit tests + static void SetUpTestCase() { + if (!ConnectLibHdfs().ok()) { + std::cout << "Loading libhdfs failed, skipping tests gracefully" << std::endl; + return; + } + + loaded_libhdfs_ = true; + + const char* host = std::getenv("ARROW_HDFS_TEST_HOST"); + const char* port = std::getenv("ARROW_HDFS_TEST_PORT"); + const char* user = std::getenv("ARROW_HDFS_TEST_USER"); + + ASSERT_TRUE(user) << "Set ARROW_HDFS_TEST_USER"; + + conf_.host = host == nullptr ? "localhost" : host; + conf_.user = user; + conf_.port = port == nullptr ? 
20500 : atoi(port); + + ASSERT_OK(HdfsClient::Connect(&conf_, &client_)); + } + + static void TearDownTestCase() { + if (client_) { + EXPECT_OK(client_->Delete(scratch_dir_, true)); + EXPECT_OK(client_->Disconnect()); + } + } + + static bool loaded_libhdfs_; + + // Resources shared amongst unit tests + static HdfsConnectionConfig conf_; + static std::string scratch_dir_; + static std::shared_ptr client_; +}; + +bool TestHdfsClient::loaded_libhdfs_ = false; +HdfsConnectionConfig TestHdfsClient::conf_ = HdfsConnectionConfig(); + +std::string TestHdfsClient::scratch_dir_ = + boost::filesystem::unique_path("/tmp/arrow-hdfs/scratch-%%%%").native(); + +std::shared_ptr TestHdfsClient::client_ = nullptr; + +#define SKIP_IF_NO_LIBHDFS() \ + if (!loaded_libhdfs_) { \ + std::cout << "No libhdfs, skipping" << std::endl; \ + return; \ + } + +TEST_F(TestHdfsClient, ConnectsAgain) { + SKIP_IF_NO_LIBHDFS(); + + std::shared_ptr client; + ASSERT_OK(HdfsClient::Connect(&conf_, &client)); + ASSERT_OK(client->Disconnect()); +} + +TEST_F(TestHdfsClient, CreateDirectory) { + SKIP_IF_NO_LIBHDFS(); + + std::string path = ScratchPath("create-directory"); + + if (client_->Exists(path)) { ASSERT_OK(client_->Delete(path, true)); } + + ASSERT_OK(client_->CreateDirectory(path)); + ASSERT_TRUE(client_->Exists(path)); + EXPECT_OK(client_->Delete(path, true)); + ASSERT_FALSE(client_->Exists(path)); +} + +TEST_F(TestHdfsClient, GetCapacityUsed) { + SKIP_IF_NO_LIBHDFS(); + + // Who knows what is actually in your DFS cluster, but expect it to have + // positive used bytes and capacity + int64_t nbytes = 0; + ASSERT_OK(client_->GetCapacity(&nbytes)); + ASSERT_LT(0, nbytes); + + ASSERT_OK(client_->GetUsed(&nbytes)); + ASSERT_LT(0, nbytes); +} + +TEST_F(TestHdfsClient, GetPathInfo) { + SKIP_IF_NO_LIBHDFS(); + + HdfsPathInfo info; + + ASSERT_OK(MakeScratchDir()); + + // Directory info + ASSERT_OK(client_->GetPathInfo(scratch_dir_, &info)); + ASSERT_EQ(ObjectType::DIRECTORY, info.kind); + ASSERT_EQ(HdfsAbsPath(scratch_dir_), info.name); + ASSERT_EQ(conf_.user, info.owner); + + // TODO(wesm): test group, other attrs + + auto path = ScratchPath("test-file"); + + const int size = 100; + + std::vector buffer = RandomData(size); + + ASSERT_OK(WriteDummyFile(path, buffer.data(), size)); + ASSERT_OK(client_->GetPathInfo(path, &info)); + + ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(HdfsAbsPath(path), info.name); + ASSERT_EQ(conf_.user, info.owner); + ASSERT_EQ(size, info.size); +} + +TEST_F(TestHdfsClient, AppendToFile) { + SKIP_IF_NO_LIBHDFS(); + + ASSERT_OK(MakeScratchDir()); + + auto path = ScratchPath("test-file"); + const int size = 100; + + std::vector buffer = RandomData(size); + ASSERT_OK(WriteDummyFile(path, buffer.data(), size)); + + // now append + ASSERT_OK(WriteDummyFile(path, buffer.data(), size, true)); + + HdfsPathInfo info; + ASSERT_OK(client_->GetPathInfo(path, &info)); + ASSERT_EQ(size * 2, info.size); +} + +TEST_F(TestHdfsClient, ListDirectory) { + SKIP_IF_NO_LIBHDFS(); + + const int size = 100; + std::vector data = RandomData(size); + + auto p1 = ScratchPath("test-file-1"); + auto p2 = ScratchPath("test-file-2"); + auto d1 = ScratchPath("test-dir-1"); + + ASSERT_OK(MakeScratchDir()); + ASSERT_OK(WriteDummyFile(p1, data.data(), size)); + ASSERT_OK(WriteDummyFile(p2, data.data(), size / 2)); + ASSERT_OK(client_->CreateDirectory(d1)); + + std::vector listing; + ASSERT_OK(client_->ListDirectory(scratch_dir_, &listing)); + + // Do it again, appends! 
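+ // (ListDirectory appends to the caller's vector rather than clearing it, so + // the second call below doubles the three scratch entries to the six asserted.)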
+ ASSERT_OK(client_->ListDirectory(scratch_dir_, &listing)); + + ASSERT_EQ(6, listing.size()); + + // Argh, well, shouldn't expect the listing to be in any particular order + for (size_t i = 0; i < listing.size(); ++i) { + const HdfsPathInfo& info = listing[i]; + if (info.name == HdfsAbsPath(p1)) { + ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(size, info.size); + } else if (info.name == HdfsAbsPath(p2)) { + ASSERT_EQ(ObjectType::FILE, info.kind); + ASSERT_EQ(size / 2, info.size); + } else if (info.name == HdfsAbsPath(d1)) { + ASSERT_EQ(ObjectType::DIRECTORY, info.kind); + } else { + FAIL() << "Unexpected path: " << info.name; + } + } +} + +TEST_F(TestHdfsClient, ReadableMethods) { + SKIP_IF_NO_LIBHDFS(); + + ASSERT_OK(MakeScratchDir()); + + auto path = ScratchPath("test-file"); + const int size = 100; + + std::vector data = RandomData(size); + ASSERT_OK(WriteDummyFile(path, data.data(), size)); + + std::shared_ptr file; + ASSERT_OK(client_->OpenReadable(path, &file)); + + // Test GetSize -- move this into its own unit test if ever needed + int64_t file_size; + ASSERT_OK(file->GetSize(&file_size)); + ASSERT_EQ(size, file_size); + + uint8_t buffer[50]; + int32_t bytes_read = 0; + + ASSERT_OK(file->Read(50, &bytes_read, buffer)); + ASSERT_EQ(0, std::memcmp(buffer, data.data(), 50)); + ASSERT_EQ(50, bytes_read); + + ASSERT_OK(file->Read(50, &bytes_read, buffer)); + ASSERT_EQ(0, std::memcmp(buffer, data.data() + 50, 50)); + ASSERT_EQ(50, bytes_read); + + // EOF + ASSERT_OK(file->Read(1, &bytes_read, buffer)); + ASSERT_EQ(0, bytes_read); + + // ReadAt to EOF + ASSERT_OK(file->ReadAt(60, 100, &bytes_read, buffer)); + ASSERT_EQ(40, bytes_read); + ASSERT_EQ(0, std::memcmp(buffer, data.data() + 60, bytes_read)); + + // Seek, Tell + ASSERT_OK(file->Seek(60)); + + int64_t position; + ASSERT_OK(file->Tell(&position)); + ASSERT_EQ(60, position); +} + +TEST_F(TestHdfsClient, RenameFile) { + SKIP_IF_NO_LIBHDFS(); + + ASSERT_OK(MakeScratchDir()); + + auto src_path = ScratchPath("src-file"); + auto dst_path = ScratchPath("dst-file"); + const int size = 100; + + std::vector data = RandomData(size); + ASSERT_OK(WriteDummyFile(src_path, data.data(), size)); + + ASSERT_OK(client_->Rename(src_path, dst_path)); + + ASSERT_FALSE(client_->Exists(src_path)); + ASSERT_TRUE(client_->Exists(dst_path)); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc new file mode 100644 index 00000000000..6da6ea4e71b --- /dev/null +++ b/cpp/src/arrow/io/hdfs.cc @@ -0,0 +1,458 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
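+ +// This file implements the classes declared in arrow/io/hdfs.h on top of the +// libhdfs C API. Each public class delegates to a private Impl, so the Hadoop +// headers never leak into the public interface.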
+ +#include + +#include +#include +#include + +#include "arrow/io/hdfs.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace io { + +#define CHECK_FAILURE(RETURN_VALUE, WHAT) \ + do { \ + if (RETURN_VALUE == -1) { \ + std::stringstream ss; \ + ss << "HDFS: " << WHAT << " failed"; \ + return Status::IOError(ss.str()); \ + } \ + } while (0) + +static Status CheckReadResult(int ret) { + // Check for error on -1 (possibly errno set) + + // ret == 0 at end of file, which is OK + if (ret == -1) { + // Read error; errno may indicate the cause + std::stringstream ss; + ss << "HDFS read failed, errno: " << errno; + return Status::IOError(ss.str()); + } + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// File reading + +class HdfsAnyFileImpl { + public: + void set_members(const std::string& path, hdfsFS fs, hdfsFile handle) { + path_ = path; + fs_ = fs; + file_ = handle; + is_open_ = true; + } + + Status Seek(int64_t position) { + int ret = hdfsSeek(fs_, file_, position); + CHECK_FAILURE(ret, "seek"); + return Status::OK(); + } + + Status Tell(int64_t* offset) { + int64_t ret = hdfsTell(fs_, file_); + CHECK_FAILURE(ret, "tell"); + *offset = ret; + return Status::OK(); + } + + bool is_open() const { return is_open_; } + + protected: + std::string path_; + + // These are pointers in libhdfs, so OK to copy + hdfsFS fs_; + hdfsFile file_; + + bool is_open_; +}; + +// Private implementation for read-only files +class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { + public: + HdfsReadableFileImpl() {} + + Status Close() { + if (is_open_) { + int ret = hdfsCloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + is_open_ = false; + } + return Status::OK(); + } + + Status ReadAt(int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + tSize ret = hdfsPread(fs_, file_, static_cast(position), + reinterpret_cast(buffer), nbytes); + RETURN_NOT_OK(CheckReadResult(ret)); + *bytes_read = ret; + return Status::OK(); + } + + Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + tSize ret = hdfsRead(fs_, file_, reinterpret_cast(buffer), nbytes); + RETURN_NOT_OK(CheckReadResult(ret)); + *bytes_read = ret; + return Status::OK(); + } + + Status GetSize(int64_t* size) { + hdfsFileInfo* entry = hdfsGetPathInfo(fs_, path_.c_str()); + if (entry == nullptr) { return Status::IOError("HDFS: GetPathInfo failed"); } + + *size = entry->mSize; + hdfsFreeFileInfo(entry, 1); + return Status::OK(); + } +}; + +HdfsReadableFile::HdfsReadableFile() { + impl_.reset(new HdfsReadableFileImpl()); +} + +HdfsReadableFile::~HdfsReadableFile() { + impl_->Close(); +} + +Status HdfsReadableFile::Close() { + return impl_->Close(); +} + +Status HdfsReadableFile::ReadAt( + int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + return impl_->ReadAt(position, nbytes, bytes_read, buffer); +} + +Status HdfsReadableFile::Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + return impl_->Read(nbytes, bytes_read, buffer); +} + +Status HdfsReadableFile::GetSize(int64_t* size) { + return impl_->GetSize(size); +} + +Status HdfsReadableFile::Seek(int64_t position) { + return impl_->Seek(position); +} + +Status HdfsReadableFile::Tell(int64_t* position) { + return impl_->Tell(position); +} + +// ---------------------------------------------------------------------- +// File writing + +// Private implementation for writeable-only files +class HdfsWriteableFile::HdfsWriteableFileImpl : public HdfsAnyFileImpl { + public: + HdfsWriteableFileImpl()
{} + + Status Close() { + if (is_open_) { + int ret = hdfsFlush(fs_, file_); + CHECK_FAILURE(ret, "Flush"); + ret = hdfsCloseFile(fs_, file_); + CHECK_FAILURE(ret, "CloseFile"); + is_open_ = false; + } + return Status::OK(); + } + + Status Write(const uint8_t* buffer, int32_t nbytes, int32_t* bytes_written) { + tSize ret = hdfsWrite(fs_, file_, reinterpret_cast(buffer), nbytes); + CHECK_FAILURE(ret, "Write"); + *bytes_written = ret; + return Status::OK(); + } +}; + +HdfsWriteableFile::HdfsWriteableFile() { + impl_.reset(new HdfsWriteableFileImpl()); +} + +HdfsWriteableFile::~HdfsWriteableFile() { + impl_->Close(); +} + +Status HdfsWriteableFile::Close() { + return impl_->Close(); +} + +Status HdfsWriteableFile::Write( + const uint8_t* buffer, int32_t nbytes, int32_t* bytes_read) { + return impl_->Write(buffer, nbytes, bytes_read); +} + +Status HdfsWriteableFile::Write(const uint8_t* buffer, int32_t nbytes) { + int32_t bytes_written_dummy = 0; + return Write(buffer, nbytes, &bytes_written_dummy); +} + +Status HdfsWriteableFile::Tell(int64_t* position) { + return impl_->Tell(position); +} + +// ---------------------------------------------------------------------- +// HDFS client + +// TODO(wesm): this could throw std::bad_alloc in the course of copying strings +// into the path info object +static void SetPathInfo(const hdfsFileInfo* input, HdfsPathInfo* out) { + out->kind = input->mKind == kObjectKindFile ? ObjectType::FILE : ObjectType::DIRECTORY; + out->name = std::string(input->mName); + out->owner = std::string(input->mOwner); + out->group = std::string(input->mGroup); + + out->last_access_time = static_cast(input->mLastAccess); + out->last_modified_time = static_cast(input->mLastMod); + out->size = static_cast(input->mSize); + + out->replication = input->mReplication; + out->block_size = input->mBlockSize; + + out->permissions = input->mPermissions; +} + +// Private implementation +class HdfsClient::HdfsClientImpl { + public: + HdfsClientImpl() {} + + Status Connect(const HdfsConnectionConfig* config) { + RETURN_NOT_OK(ConnectLibHdfs()); + + fs_ = hdfsConnectAsUser(config->host.c_str(), config->port, config->user.c_str()); + + if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); } + namenode_host_ = config->host; + port_ = config->port; + user_ = config->user; + + return Status::OK(); + } + + Status CreateDirectory(const std::string& path) { + int ret = hdfsCreateDirectory(fs_, path.c_str()); + CHECK_FAILURE(ret, "create directory"); + return Status::OK(); + } + + Status Delete(const std::string& path, bool recursive) { + int ret = hdfsDelete(fs_, path.c_str(), static_cast(recursive)); + CHECK_FAILURE(ret, "delete"); + return Status::OK(); + } + + Status Disconnect() { + int ret = hdfsDisconnect(fs_); + CHECK_FAILURE(ret, "hdfsFS::Disconnect"); + return Status::OK(); + } + + bool Exists(const std::string& path) { + // hdfsExists does not distinguish between RPC failure and the file not + // existing + int ret = hdfsExists(fs_, path.c_str()); + return ret == 0; + } + + Status GetCapacity(int64_t* nbytes) { + tOffset ret = hdfsGetCapacity(fs_); + CHECK_FAILURE(ret, "GetCapacity"); + *nbytes = ret; + return Status::OK(); + } + + Status GetUsed(int64_t* nbytes) { + tOffset ret = hdfsGetUsed(fs_); + CHECK_FAILURE(ret, "GetUsed"); + *nbytes = ret; + return Status::OK(); + } + + Status GetPathInfo(const std::string& path, HdfsPathInfo* info) { + hdfsFileInfo* entry = hdfsGetPathInfo(fs_, path.c_str()); + + if (entry == nullptr) { return Status::IOError("HDFS: GetPathInfo 
failed"); } + + SetPathInfo(entry, info); + hdfsFreeFileInfo(entry, 1); + + return Status::OK(); + } + + Status ListDirectory(const std::string& path, std::vector* listing) { + int num_entries = 0; + hdfsFileInfo* entries = hdfsListDirectory(fs_, path.c_str(), &num_entries); + + if (entries == nullptr) { + // If the directory is empty, entries is NULL but errno is 0. Non-zero + // errno indicates error + // + // Note: errno is thread-locala + if (errno == 0) { num_entries = 0; } + { return Status::IOError("HDFS: list directory failed"); } + } + + // Allocate additional space for elements + + int vec_offset = listing->size(); + listing->resize(vec_offset + num_entries); + + for (int i = 0; i < num_entries; ++i) { + SetPathInfo(entries + i, &(*listing)[vec_offset + i]); + } + + // Free libhdfs file info + hdfsFreeFileInfo(entries, num_entries); + + return Status::OK(); + } + + Status OpenReadable(const std::string& path, std::shared_ptr* file) { + hdfsFile handle = hdfsOpenFile(fs_, path.c_str(), O_RDONLY, 0, 0, 0); + + if (handle == nullptr) { + // TODO(wesm): determine cause of failure + std::stringstream ss; + ss << "Unable to open file " << path; + return Status::IOError(ss.str()); + } + + // std::make_shared does not work with private ctors + *file = std::shared_ptr(new HdfsReadableFile()); + (*file)->impl_->set_members(path, fs_, handle); + + return Status::OK(); + } + + Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, + int16_t replication, int64_t default_block_size, + std::shared_ptr* file) { + int flags = O_WRONLY; + if (append) flags |= O_APPEND; + + hdfsFile handle = hdfsOpenFile( + fs_, path.c_str(), flags, buffer_size, replication, default_block_size); + + if (handle == nullptr) { + // TODO(wesm): determine cause of failure + std::stringstream ss; + ss << "Unable to open file " << path; + return Status::IOError(ss.str()); + } + + // std::make_shared does not work with private ctors + *file = std::shared_ptr(new HdfsWriteableFile()); + (*file)->impl_->set_members(path, fs_, handle); + + return Status::OK(); + } + + Status Rename(const std::string& src, const std::string& dst) { + int ret = hdfsRename(fs_, src.c_str(), dst.c_str()); + CHECK_FAILURE(ret, "Rename"); + return Status::OK(); + } + + private: + std::string namenode_host_; + std::string user_; + int port_; + + hdfsFS fs_; +}; + +// ---------------------------------------------------------------------- +// Public API for HDFSClient + +HdfsClient::HdfsClient() { + impl_.reset(new HdfsClientImpl()); +} + +HdfsClient::~HdfsClient() {} + +Status HdfsClient::Connect( + const HdfsConnectionConfig* config, std::shared_ptr* fs) { + // ctor is private, make_shared will not work + *fs = std::shared_ptr(new HdfsClient()); + + RETURN_NOT_OK((*fs)->impl_->Connect(config)); + return Status::OK(); +} + +Status HdfsClient::CreateDirectory(const std::string& path) { + return impl_->CreateDirectory(path); +} + +Status HdfsClient::Delete(const std::string& path, bool recursive) { + return impl_->Delete(path, recursive); +} + +Status HdfsClient::Disconnect() { + return impl_->Disconnect(); +} + +bool HdfsClient::Exists(const std::string& path) { + return impl_->Exists(path); +} + +Status HdfsClient::GetPathInfo(const std::string& path, HdfsPathInfo* info) { + return impl_->GetPathInfo(path, info); +} + +Status HdfsClient::GetCapacity(int64_t* nbytes) { + return impl_->GetCapacity(nbytes); +} + +Status HdfsClient::GetUsed(int64_t* nbytes) { + return impl_->GetUsed(nbytes); +} + +Status 
+Status HdfsClient::ListDirectory(
+    const std::string& path, std::vector<HdfsPathInfo>* listing) {
+  return impl_->ListDirectory(path, listing);
+}
+
+Status HdfsClient::OpenReadable(
+    const std::string& path, std::shared_ptr<HdfsReadableFile>* file) {
+  return impl_->OpenReadable(path, file);
+}
+
+Status HdfsClient::OpenWriteable(const std::string& path, bool append,
+    int32_t buffer_size, int16_t replication, int64_t default_block_size,
+    std::shared_ptr<HdfsWriteableFile>* file) {
+  return impl_->OpenWriteable(
+      path, append, buffer_size, replication, default_block_size, file);
+}
+
+Status HdfsClient::OpenWriteable(
+    const std::string& path, bool append, std::shared_ptr<HdfsWriteableFile>* file) {
+  return OpenWriteable(path, append, 0, 0, 0, file);
+}
+
+Status HdfsClient::Rename(const std::string& src, const std::string& dst) {
+  return impl_->Rename(src, dst);
+}
+
+}  // namespace io
+}  // namespace arrow
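[Editor's note: for orientation, here is a minimal usage sketch of the client API added in hdfs.cc above. It is not part of the patch; the namenode host, port, user name, and file paths are placeholder assumptions, and error handling is reduced to early returns.]

// Hedged usage sketch of the HdfsClient API (illustrative, not in the patch).
#include <cstdint>
#include <memory>

#include "arrow/io/hdfs.h"
#include "arrow/util/status.h"

int main() {
  arrow::io::HdfsConnectionConfig config;
  config.host = "localhost";  // assumption: single-node test cluster
  config.port = 8020;         // assumption: default namenode RPC port
  config.user = "hadoop";     // assumption: illustrative user name

  std::shared_ptr<arrow::io::HdfsClient> client;
  if (!arrow::io::HdfsClient::Connect(&config, &client).ok()) { return 1; }

  // Write a small file; the 3-argument OpenWriteable overload picks libhdfs
  // defaults (buffer size, replication, block size all 0)
  std::shared_ptr<arrow::io::HdfsWriteableFile> out_file;
  if (!client->OpenWriteable("/tmp/arrow-example", false, &out_file).ok()) { return 1; }
  const uint8_t data[] = "hello arrow";
  out_file->Write(data, static_cast<int32_t>(sizeof(data)));
  out_file->Close();

  // Read it back with the positional ReadAt API
  std::shared_ptr<arrow::io::HdfsReadableFile> in_file;
  if (!client->OpenReadable("/tmp/arrow-example", &in_file).ok()) { return 1; }
  uint8_t buffer[32];
  int32_t bytes_read = 0;
  in_file->ReadAt(0, static_cast<int32_t>(sizeof(data)), &bytes_read, buffer);
  in_file->Close();

  client->Disconnect();
  return 0;
}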
diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h
new file mode 100644
index 00000000000..a1972db9615
--- /dev/null
+++ b/cpp/src/arrow/io/hdfs.h
@@ -0,0 +1,213 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_IO_HDFS
+#define ARROW_IO_HDFS
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "arrow/io/interfaces.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+
+class Status;
+
+namespace io {
+
+Status ConnectLibHdfs();
+
+class HdfsClient;
+class HdfsReadableFile;
+class HdfsWriteableFile;
+
+struct HdfsPathInfo {
+  ObjectType::type kind;
+
+  std::string name;
+  std::string owner;
+  std::string group;
+
+  int64_t size;
+  int64_t block_size;
+
+  // Access times in UNIX timestamps (seconds)
+  int32_t last_modified_time;
+  int32_t last_access_time;
+
+  int16_t replication;
+  int16_t permissions;
+};
+
+struct HdfsConnectionConfig {
+  std::string host;
+  int port;
+  std::string user;
+
+  // TODO: Kerberos, etc.
+};
+
+class HdfsClient : public FileSystemClient {
+ public:
+  ~HdfsClient();
+
+  // Connect to an HDFS cluster at the host, port, and user given in config
+  //
+  // @param config (in): connection parameters (host, port, user)
+  // @param fs (out): the created client
+  // @returns Status
+  static Status Connect(
+      const HdfsConnectionConfig* config, std::shared_ptr<HdfsClient>* fs);
+
+  // Create directory and all parents
+  //
+  // @param path (in): absolute HDFS path
+  // @returns Status
+  Status CreateDirectory(const std::string& path);
+
+  // Delete file or directory
+  // @param path: absolute path to data
+  // @param recursive: if path is a directory, delete contents as well
+  // @returns error status on failure
+  Status Delete(const std::string& path, bool recursive = false);
+
+  // Disconnect from cluster
+  //
+  // @returns Status
+  Status Disconnect();
+
+  // @param path (in): absolute HDFS path
+  // @returns bool, true if the path exists, false if not (or on error)
+  bool Exists(const std::string& path);
+
+  // @param path (in): absolute HDFS path
+  // @param info (out)
+  // @returns Status
+  Status GetPathInfo(const std::string& path, HdfsPathInfo* info);
+
+  // @param nbytes (out): total capacity of the filesystem
+  // @returns Status
+  Status GetCapacity(int64_t* nbytes);
+
+  // @param nbytes (out): total bytes used of the filesystem
+  // @returns Status
+  Status GetUsed(int64_t* nbytes);
+
+  Status ListDirectory(const std::string& path, std::vector<HdfsPathInfo>* listing);
+
+  // Change the owner and/or group of a file or directory
+  //
+  // @param path file path to change
+  // @param owner pass nullptr for no change
+  // @param group pass nullptr for no change
+  Status Chown(const std::string& path, const char* owner, const char* group);
+
+  Status Chmod(const std::string& path, int mode);
+
+  // Move file or directory from source path to destination path within the
+  // current filesystem
+  Status Rename(const std::string& src, const std::string& dst);
+
+  // TODO(wesm): GetWorkingDirectory, SetWorkingDirectory
+
+  // Open an HDFS file in READ mode. Returns error
+  // status if the file is not found.
+  //
+  // @param path complete file path
+  Status OpenReadable(const std::string& path, std::shared_ptr<HdfsReadableFile>* file);
+
+  // FileMode::WRITE options
+  // @param path complete file path
+  // @param buffer_size 0 for default
+  // @param replication 0 for default
+  // @param default_block_size 0 for default
+  Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size,
+      int16_t replication, int64_t default_block_size,
+      std::shared_ptr<HdfsWriteableFile>* file);
+
+  Status OpenWriteable(
+      const std::string& path, bool append, std::shared_ptr<HdfsWriteableFile>* file);
+
+ private:
+  friend class HdfsReadableFile;
+  friend class HdfsWriteableFile;
+
+  class HdfsClientImpl;
+  std::unique_ptr<HdfsClientImpl> impl_;
+
+  HdfsClient();
+  DISALLOW_COPY_AND_ASSIGN(HdfsClient);
+};
+
+class HdfsReadableFile : public RandomAccessFile {
+ public:
+  ~HdfsReadableFile();
+
+  Status Close() override;
+
+  Status GetSize(int64_t* size) override;
+
+  Status ReadAt(
+      int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override;
+
+  Status Seek(int64_t position) override;
+  Status Tell(int64_t* position) override;
+
+  // NOTE: If you wish to read a particular range of a file in a multithreaded
+  // context, you may prefer to use ReadAt to avoid locking issues
+  Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override;
+
+ private:
+  class HdfsReadableFileImpl;
+  std::unique_ptr<HdfsReadableFileImpl> impl_;
+
+  friend class HdfsClient::HdfsClientImpl;
+
+  HdfsReadableFile();
+  DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile);
+};
+
+class HdfsWriteableFile : public WriteableFile {
+ public:
+  ~HdfsWriteableFile();
+
+  Status Close() override;
+
+  Status Write(const uint8_t* buffer, int32_t nbytes) override;
+
+  Status Write(const uint8_t* buffer, int32_t nbytes, int32_t* bytes_written);
+
+  Status Tell(int64_t* position) override;
+
+ private:
+  class HdfsWriteableFileImpl;
+  std::unique_ptr<HdfsWriteableFileImpl> impl_;
+
+  friend class HdfsClient::HdfsClientImpl;
+
+  HdfsWriteableFile();
+
+  DISALLOW_COPY_AND_ASSIGN(HdfsWriteableFile);
+};
+
+}  // namespace io
+}  // namespace arrow
+
+#endif  // ARROW_IO_HDFS
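[Editor's note: the classes above derive from abstract interfaces declared in interfaces.h, which follows next: FileBase at the root, ReadableFile/RandomAccessFile for input, WriteableFile for output. To illustrate the contract those interfaces impose, here is a toy in-memory RandomAccessFile; it is a sketch only, not part of the patch, and it assumes the declarations below plus the Status/RETURN_NOT_OK utilities in arrow/util/status.h.]

// Illustrative only: a minimal in-memory RandomAccessFile.
#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

#include "arrow/io/interfaces.h"
#include "arrow/util/status.h"

namespace arrow {
namespace io {

class InMemoryFile : public RandomAccessFile {
 public:
  explicit InMemoryFile(std::vector<uint8_t> data) : data_(std::move(data)), pos_(0) {}

  Status Close() override { return Status::OK(); }
  Status Tell(int64_t* position) override { *position = pos_; return Status::OK(); }
  Status Seek(int64_t position) override { pos_ = position; return Status::OK(); }
  Status GetSize(int64_t* size) override {
    *size = static_cast<int64_t>(data_.size());
    return Status::OK();
  }

  Status ReadAt(int64_t position, int32_t nbytes, int32_t* bytes_read,
      uint8_t* buffer) override {
    // Clamp the request to the available range, mirroring read semantics
    int64_t available = static_cast<int64_t>(data_.size()) - position;
    int64_t n = std::max<int64_t>(0, std::min<int64_t>(nbytes, available));
    std::memcpy(buffer, data_.data() + position, static_cast<size_t>(n));
    *bytes_read = static_cast<int32_t>(n);
    return Status::OK();
  }

  Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override {
    // Sequential read: positional read at the cursor, then advance it
    RETURN_NOT_OK(ReadAt(pos_, nbytes, bytes_read, buffer));
    pos_ += *bytes_read;
    return Status::OK();
  }

 private:
  std::vector<uint8_t> data_;
  int64_t pos_;
};

}  // namespace io
}  // namespace arrow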
diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h
new file mode 100644
index 00000000000..4bd8a8ffc2f
--- /dev/null
+++ b/cpp/src/arrow/io/interfaces.h
@@ -0,0 +1,71 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_IO_INTERFACES
+#define ARROW_IO_INTERFACES
+
+#include <cstdint>
+
+namespace arrow {
+
+class Status;
+
+namespace io {
+
+struct FileMode {
+  enum type { READ, WRITE, READWRITE };
+};
+
+struct ObjectType {
+  enum type { FILE, DIRECTORY };
+};
+
+class FileSystemClient {
+ public:
+  virtual ~FileSystemClient() {}
+};
+
+class FileBase {
+  virtual Status Close() = 0;
+
+  virtual Status Tell(int64_t* position) = 0;
+};
+
+class ReadableFile : public FileBase {
+ public:
+  virtual Status ReadAt(
+      int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) = 0;
+
+  virtual Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) = 0;
+
+  virtual Status GetSize(int64_t* size) = 0;
+};
+
+class RandomAccessFile : public ReadableFile {
+ public:
+  virtual Status Seek(int64_t position) = 0;
+};
+
+class WriteableFile : public FileBase {
+ public:
+  virtual Status Write(const uint8_t* buffer, int32_t nbytes) = 0;
+};
+
+}  // namespace io
+}  // namespace arrow
+
+#endif  // ARROW_IO_INTERFACES
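[Editor's note: next comes the libhdfs shim. As its header comment states, it locates libjvm and libhdfs at runtime, typically via the JAVA_HOME and HADOOP_HOME environment variables, with common install paths probed as fallbacks. A small, hypothetical pre-flight check in the same spirit; the function name is illustrative and not part of the patch.]

// Hedged pre-flight sketch: warn early when the variables the shim searches
// are missing. The shim also probes default locations, so this is a
// convenience check rather than a hard requirement.
#include <cstdlib>
#include <iostream>

static bool HdfsEnvironmentLooksSane() {
  const char* java_home = std::getenv("JAVA_HOME");      // used to locate libjvm
  const char* hadoop_home = std::getenv("HADOOP_HOME");  // used to locate libhdfs.so
  if (java_home == nullptr) { std::cerr << "JAVA_HOME is not set" << std::endl; }
  if (hadoop_home == nullptr) { std::cerr << "HADOOP_HOME is not set" << std::endl; }
  return java_home != nullptr && hadoop_home != nullptr;
}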
diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc
new file mode 100644
index 00000000000..f75266536e5
--- /dev/null
+++ b/cpp/src/arrow/io/libhdfs_shim.cc
@@ -0,0 +1,544 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This shim interface to libhdfs (for runtime shared library loading) has been
+// adapted from the SFrame project, released under the ASF-compatible 3-clause
+// BSD license
+//
+// Using this requires having the $JAVA_HOME and $HADOOP_HOME environment
+// variables set, so that libjvm and libhdfs can be located easily
+
+// Copyright (C) 2015 Dato, Inc.
+// All rights reserved.
+//
+// This software may be modified and distributed under the terms
+// of the BSD license. See the LICENSE file for details.
+
+#ifdef HAS_HADOOP
+
+#ifndef _WIN32
+#include <dlfcn.h>
+#else
+#include <windows.h>
+#include <winbase.h>
+
+// TODO(wesm): address when/if we add windows support
+// #include <util/syserr_reporting.hpp>
+#endif
+
+extern "C" {
+#include <hdfs.h>
+}
+
+#include <cstdint>
+#include <cstdlib>
+#include <mutex>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include <boost/filesystem.hpp>  // NOLINT
+
+#include "arrow/util/status.h"
+
+namespace fs = boost::filesystem;
+
+extern "C" {
+
+#ifndef _WIN32
+static void* libhdfs_handle = NULL;
+static void* libjvm_handle = NULL;
+#else
+static HINSTANCE libhdfs_handle = NULL;
+static HINSTANCE libjvm_handle = NULL;
+#endif
+/*
+ * All the shim pointers
+ */
+
+// NOTE(wesm): cpplint does not like use of short and other imprecise C types
+
+static hdfsFS (*ptr_hdfsConnectAsUser)(
+    const char* host, tPort port, const char* user) = NULL;
+static hdfsFS (*ptr_hdfsConnect)(const char* host, tPort port) = NULL;
+static int (*ptr_hdfsDisconnect)(hdfsFS fs) = NULL;
+
+static hdfsFile (*ptr_hdfsOpenFile)(hdfsFS fs, const char* path, int flags,
+    int bufferSize, short replication, tSize blocksize) = NULL;  // NOLINT
+
+static int (*ptr_hdfsCloseFile)(hdfsFS fs, hdfsFile file) = NULL;
+static int (*ptr_hdfsExists)(hdfsFS fs, const char* path) = NULL;
+static int (*ptr_hdfsSeek)(hdfsFS fs, hdfsFile file, tOffset desiredPos) = NULL;
+static tOffset (*ptr_hdfsTell)(hdfsFS fs, hdfsFile file) = NULL;
+static tSize (*ptr_hdfsRead)(hdfsFS fs, hdfsFile file, void* buffer, tSize length) = NULL;
+static tSize (*ptr_hdfsPread)(
+    hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length) = NULL;
+static tSize (*ptr_hdfsWrite)(
+    hdfsFS fs, hdfsFile file, const void* buffer, tSize length) = NULL;
+static int (*ptr_hdfsFlush)(hdfsFS fs, hdfsFile file) = NULL;
+static int (*ptr_hdfsAvailable)(hdfsFS fs, hdfsFile file) = NULL;
+static int (*ptr_hdfsCopy)(
+    hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) = NULL;
+static int (*ptr_hdfsMove)(
+    hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) = NULL;
+static int (*ptr_hdfsDelete)(hdfsFS fs, const char* path, int recursive) = NULL;
+static int (*ptr_hdfsRename)(hdfsFS fs, const char* oldPath, const char* newPath) = NULL;
+static char* (*ptr_hdfsGetWorkingDirectory)(
+    hdfsFS fs, char* buffer, size_t bufferSize) = NULL;
+static int (*ptr_hdfsSetWorkingDirectory)(hdfsFS fs, const char* path) = NULL;
+static int (*ptr_hdfsCreateDirectory)(hdfsFS fs, const char* path) = NULL;
+static int (*ptr_hdfsSetReplication)(
+    hdfsFS fs, const char* path, int16_t replication) = NULL;
+static hdfsFileInfo* (*ptr_hdfsListDirectory)(
+    hdfsFS fs, const char* path, int* numEntries) = NULL;
+static hdfsFileInfo* (*ptr_hdfsGetPathInfo)(hdfsFS fs, const char* path) = NULL;
+static void (*ptr_hdfsFreeFileInfo)(hdfsFileInfo* hdfsFileInfo, int numEntries) = NULL;
+static char*** (*ptr_hdfsGetHosts)(
+    hdfsFS fs, const char* path, tOffset start, tOffset length) = NULL;
+static void (*ptr_hdfsFreeHosts)(char*** blockHosts) = NULL;
+static tOffset (*ptr_hdfsGetDefaultBlockSize)(hdfsFS fs) = NULL;
+static tOffset (*ptr_hdfsGetCapacity)(hdfsFS fs) = NULL;
+static tOffset (*ptr_hdfsGetUsed)(hdfsFS fs) = NULL;
+static int (*ptr_hdfsChown)(
+    hdfsFS fs, const char* path, const char* owner, const char* group) = NULL;
+static int (*ptr_hdfsChmod)(hdfsFS fs, const char* path, short mode) = NULL;  // NOLINT
+static int (*ptr_hdfsUtime)(hdfsFS fs, const char* path, tTime mtime, tTime atime) = NULL;
+
+// Helper functions for dlopens
+static std::vector<fs::path> get_potential_libjvm_paths();
+static std::vector<fs::path> get_potential_libhdfs_paths();
+static arrow::Status try_dlopen(std::vector<fs::path> potential_paths, const char* name,
+#ifndef _WIN32
+    void*& out_handle);
+#else
+    HINSTANCE& out_handle);
+#endif
+
+#define GET_SYMBOL(SYMBOL_NAME)                                                   \
+  if (!ptr_##SYMBOL_NAME) {                                                       \
+    *reinterpret_cast<void**>(&ptr_##SYMBOL_NAME) = get_symbol("" #SYMBOL_NAME);  \
+  }
+
+static void* get_symbol(const char* symbol) {
+  if (libhdfs_handle == NULL) return NULL;
+#ifndef _WIN32
+  return dlsym(libhdfs_handle, symbol);
+#else
+
+  void* ret = reinterpret_cast<void*>(GetProcAddress(libhdfs_handle, symbol));
+  if (ret == NULL) {
+    // logstream(LOG_INFO) << "GetProcAddress error: "
+    //     << get_last_err_str(GetLastError()) << std::endl;
+  }
+  return ret;
+#endif
+}
+
+hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char* user) {
+  return ptr_hdfsConnectAsUser(host, port, user);
+}
+
+// Returns NULL on failure
+hdfsFS hdfsConnect(const char* host, tPort port) {
+  if (ptr_hdfsConnect) {
+    return ptr_hdfsConnect(host, port);
+  } else {
+    // TODO: error reporting when shim setup fails
+    return NULL;
+  }
+}
+
+int hdfsDisconnect(hdfsFS fs) {
+  return ptr_hdfsDisconnect(fs);
+}
+
+hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags, int bufferSize,
+    short replication, tSize blocksize) {  // NOLINT
+  return ptr_hdfsOpenFile(fs, path, flags, bufferSize, replication, blocksize);
+}
+
+int hdfsCloseFile(hdfsFS fs, hdfsFile file) {
+  return ptr_hdfsCloseFile(fs, file);
+}
+
+int hdfsExists(hdfsFS fs, const char* path) {
+  return ptr_hdfsExists(fs, path);
+}
+
+int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos) {
+  return ptr_hdfsSeek(fs, file, desiredPos);
+}
+
+tOffset hdfsTell(hdfsFS fs, hdfsFile file) {
+  return ptr_hdfsTell(fs, file);
+}
+
+tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length) {
+  return ptr_hdfsRead(fs, file, buffer, length);
+}
+
+tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position, void* buffer, tSize length) {
+  return ptr_hdfsPread(fs, file, position, buffer, length);
+}
+
+tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer, tSize length) {
+  return ptr_hdfsWrite(fs, file, buffer, length);
+}
+
+int hdfsFlush(hdfsFS fs, hdfsFile file) {
+  return ptr_hdfsFlush(fs, file);
+}
+
+int hdfsAvailable(hdfsFS fs, hdfsFile file) {
+  GET_SYMBOL(hdfsAvailable);
+  if (ptr_hdfsAvailable)
+    return ptr_hdfsAvailable(fs, file);
+  else
+    return 0;
+}
+
+int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) {
+  GET_SYMBOL(hdfsCopy);
+  if (ptr_hdfsCopy)
+    return ptr_hdfsCopy(srcFS, src, dstFS, dst);
+  else
+    return 0;
+}
+
+int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst) {
+  GET_SYMBOL(hdfsMove);
+  if (ptr_hdfsMove)
+    return ptr_hdfsMove(srcFS, src, dstFS, dst);
+  else
+    return 0;
+}
+
+int hdfsDelete(hdfsFS fs, const char* path, int recursive) {
+  return ptr_hdfsDelete(fs, path, recursive);
+}
+
+int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath) {
+  GET_SYMBOL(hdfsRename);
+  if (ptr_hdfsRename)
+    return ptr_hdfsRename(fs, oldPath, newPath);
+  else
+    return 0;
+}
+
+char* hdfsGetWorkingDirectory(hdfsFS fs, char* buffer, size_t bufferSize) {
+  GET_SYMBOL(hdfsGetWorkingDirectory);
+  if (ptr_hdfsGetWorkingDirectory) {
+    return ptr_hdfsGetWorkingDirectory(fs, buffer, bufferSize);
+  } else {
+    return NULL;
+  }
+}
+
+int hdfsSetWorkingDirectory(hdfsFS fs, const char* path) {
+  GET_SYMBOL(hdfsSetWorkingDirectory);
+  if (ptr_hdfsSetWorkingDirectory) {
+    return ptr_hdfsSetWorkingDirectory(fs, path);
+  } else {
+    return 0;
+  }
+}
+
+int hdfsCreateDirectory(hdfsFS fs, const char* path) {
+  return ptr_hdfsCreateDirectory(fs, path);
+}
+
+int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication) {
+  GET_SYMBOL(hdfsSetReplication);
+  if (ptr_hdfsSetReplication) {
+    return ptr_hdfsSetReplication(fs, path, replication);
+  } else {
+    return 0;
+  }
+}
+
+hdfsFileInfo* hdfsListDirectory(hdfsFS fs, const char* path, int* numEntries) {
+  return ptr_hdfsListDirectory(fs, path, numEntries);
+}
+
+hdfsFileInfo* hdfsGetPathInfo(hdfsFS fs, const char* path) {
+  return ptr_hdfsGetPathInfo(fs, path);
+}
+
+void hdfsFreeFileInfo(hdfsFileInfo* hdfsFileInfo, int numEntries) {
+  ptr_hdfsFreeFileInfo(hdfsFileInfo, numEntries);
+}
+
+char*** hdfsGetHosts(hdfsFS fs, const char* path, tOffset start, tOffset length) {
+  GET_SYMBOL(hdfsGetHosts);
+  if (ptr_hdfsGetHosts) {
+    return ptr_hdfsGetHosts(fs, path, start, length);
+  } else {
+    return NULL;
+  }
+}
+
+void hdfsFreeHosts(char*** blockHosts) {
+  GET_SYMBOL(hdfsFreeHosts);
+  if (ptr_hdfsFreeHosts) { ptr_hdfsFreeHosts(blockHosts); }
+}
+
+tOffset hdfsGetDefaultBlockSize(hdfsFS fs) {
+  GET_SYMBOL(hdfsGetDefaultBlockSize);
+  if (ptr_hdfsGetDefaultBlockSize) {
+    return ptr_hdfsGetDefaultBlockSize(fs);
+  } else {
+    return 0;
+  }
+}
+
+tOffset hdfsGetCapacity(hdfsFS fs) {
+  return ptr_hdfsGetCapacity(fs);
+}
+
+tOffset hdfsGetUsed(hdfsFS fs) {
+  return ptr_hdfsGetUsed(fs);
+}
+
+int hdfsChown(hdfsFS fs, const char* path, const char* owner, const char* group) {
+  GET_SYMBOL(hdfsChown);
+  if (ptr_hdfsChown) {
+    return ptr_hdfsChown(fs, path, owner, group);
+  } else {
+    return 0;
+  }
+}
+
+int hdfsChmod(hdfsFS fs, const char* path, short mode) {  // NOLINT
+  GET_SYMBOL(hdfsChmod);
+  if (ptr_hdfsChmod) {
+    return ptr_hdfsChmod(fs, path, mode);
+  } else {
+    return 0;
+  }
+}
+
+int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime) {
+  GET_SYMBOL(hdfsUtime);
+  if (ptr_hdfsUtime) {
+    return ptr_hdfsUtime(fs, path, mtime, atime);
+  } else {
+    return 0;
+  }
+}
+
+static std::vector<fs::path> get_potential_libhdfs_paths() {
+  std::vector<fs::path> libhdfs_potential_paths = {
+      // find one in the local directory
+      fs::path("./libhdfs.so"), fs::path("./hdfs.dll"),
+      // find a global libhdfs.so
+      fs::path("libhdfs.so"), fs::path("hdfs.dll"),
+  };
+
+  const char* hadoop_home = std::getenv("HADOOP_HOME");
+  if (hadoop_home != nullptr) {
+    auto path = fs::path(hadoop_home) / "lib/native/libhdfs.so";
+    libhdfs_potential_paths.push_back(path);
+  }
+  return libhdfs_potential_paths;
+}
+
+static std::vector<fs::path> get_potential_libjvm_paths() {
+  std::vector<fs::path> libjvm_potential_paths;
+
+  std::vector<std::string> search_prefixes;
+  std::vector<std::string> search_suffixes;
+  std::string file_name;
+
+// From heuristics
+#ifdef _WIN32
+  search_prefixes = {""};
+  search_suffixes = {"/jre/bin/server", "/bin/server"};
+  file_name = "jvm.dll";
+#elif __APPLE__
+  search_prefixes = {""};
+  search_suffixes = {""};
+  file_name = "libjvm.dylib";
+
+// SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are
+// expecting users to set an environment variable
+#else
+  search_prefixes = {
+      "/usr/lib/jvm/default-java",                // ubuntu / debian distros
+      "/usr/lib/jvm/java",                        // rhel6
+      "/usr/lib/jvm",                             // centos6
+      "/usr/lib64/jvm",                           // opensuse 13
+      "/usr/local/lib/jvm/default-java",          // alt ubuntu / debian distros
+      "/usr/local/lib/jvm/java",                  // alt rhel6
+      "/usr/local/lib/jvm",                       // alt centos6
+      "/usr/local/lib64/jvm",                     // alt opensuse 13
+      "/usr/local/lib/jvm/java-7-openjdk-amd64",  // alt ubuntu / debian distros
+      "/usr/lib/jvm/java-7-openjdk-amd64",        // alt ubuntu / debian distros
+      "/usr/local/lib/jvm/java-6-openjdk-amd64",  // alt ubuntu / debian distros
+      "/usr/lib/jvm/java-6-openjdk-amd64",        // alt ubuntu / debian distros
+      "/usr/lib/jvm/java-7-oracle",               // alt ubuntu
+      "/usr/lib/jvm/java-8-oracle",               // alt ubuntu
+      "/usr/lib/jvm/java-6-oracle",               // alt ubuntu
+      "/usr/local/lib/jvm/java-7-oracle",         // alt ubuntu
+      "/usr/local/lib/jvm/java-8-oracle",         // alt ubuntu
+      "/usr/local/lib/jvm/java-6-oracle",         // alt ubuntu
+      "/usr/lib/jvm/default",                     // alt centos
+      "/usr/java/latest",                         // alt centos
+  };
+  search_suffixes = {"/jre/lib/amd64/server"};
+  file_name = "libjvm.so";
+#endif
+  // From direct environment variable
+  char* env_value = NULL;
+  if ((env_value = getenv("JAVA_HOME")) != NULL) {
+    // logstream(LOG_INFO) << "Found environment variable JAVA_HOME: "
+    //     << env_value << std::endl;
+    search_prefixes.insert(search_prefixes.begin(), env_value);
+  }
+
+  // Generate cross product between search_prefixes, search_suffixes, and file_name
+  for (auto& prefix : search_prefixes) {
+    for (auto& suffix : search_suffixes) {
+      auto path = (fs::path(prefix) / fs::path(suffix) / fs::path(file_name));
+      libjvm_potential_paths.push_back(path);
+    }
+  }
+
+  return libjvm_potential_paths;
+}
+
+#ifndef _WIN32
+static arrow::Status try_dlopen(
+    std::vector<fs::path> potential_paths, const char* name, void*& out_handle) {
+  std::vector<std::string> error_messages;
+
+  for (auto& i : potential_paths) {
+    i.make_preferred();
+    // logstream(LOG_INFO) << "Trying " << i.string().c_str() << std::endl;
+    out_handle = dlopen(i.native().c_str(), RTLD_NOW | RTLD_LOCAL);
+
+    if (out_handle != NULL) {
+      // logstream(LOG_INFO) << "Success!" << std::endl;
+      break;
+    } else {
+      const char* err_msg = dlerror();
+      if (err_msg != NULL) {
+        error_messages.push_back(std::string(err_msg));
+      } else {
+        error_messages.push_back(std::string(" returned NULL"));
+      }
+    }
+  }
+
+  if (out_handle == NULL) {
+    std::stringstream ss;
+    ss << "Unable to load " << name;
+    return arrow::Status::IOError(ss.str());
+  }
+
+  return arrow::Status::OK();
+}
+
+#else
+static arrow::Status try_dlopen(
+    std::vector<fs::path> potential_paths, const char* name, HINSTANCE& out_handle) {
+  std::vector<std::string> error_messages;
+
+  for (auto& i : potential_paths) {
+    i.make_preferred();
+    // logstream(LOG_INFO) << "Trying " << i.string().c_str() << std::endl;
+
+    out_handle = LoadLibrary(i.string().c_str());
+
+    if (out_handle != NULL) {
+      // logstream(LOG_INFO) << "Success!" << std::endl;
+      break;
+    } else {
+      // error_messages.push_back(get_last_err_str(GetLastError()));
+    }
+  }
+
+  if (out_handle == NULL) {
+    std::stringstream ss;
+    ss << "Unable to load " << name;
+    return arrow::Status::IOError(ss.str());
+  }
+
+  return arrow::Status::OK();
+}
+#endif  // _WIN32
+
+}  // extern "C"
+
+#define GET_SYMBOL_REQUIRED(SYMBOL_NAME)                                            \
+  do {                                                                              \
+    if (!ptr_##SYMBOL_NAME) {                                                       \
+      *reinterpret_cast<void**>(&ptr_##SYMBOL_NAME) = get_symbol("" #SYMBOL_NAME);  \
+    }                                                                               \
+    if (!ptr_##SYMBOL_NAME)                                                         \
+      return Status::IOError("Getting symbol " #SYMBOL_NAME " failed");             \
+  } while (0)
+
+namespace arrow {
+namespace io {
+
+Status ConnectLibHdfs() {
+  static std::mutex lock;
+  std::lock_guard<std::mutex> guard(lock);
+
+  static bool shim_attempted = false;
+  if (!shim_attempted) {
+    shim_attempted = true;
+
+    std::vector<fs::path> libjvm_potential_paths = get_potential_libjvm_paths();
+    RETURN_NOT_OK(try_dlopen(libjvm_potential_paths, "libjvm", libjvm_handle));
+
+    std::vector<fs::path> libhdfs_potential_paths = get_potential_libhdfs_paths();
+    RETURN_NOT_OK(try_dlopen(libhdfs_potential_paths, "libhdfs", libhdfs_handle));
+  } else if (libhdfs_handle == nullptr) {
+    return Status::IOError("Prior attempt to load libhdfs failed");
+  }
+
+  GET_SYMBOL_REQUIRED(hdfsConnect);
+  GET_SYMBOL_REQUIRED(hdfsConnectAsUser);
+  GET_SYMBOL_REQUIRED(hdfsCreateDirectory);
+  GET_SYMBOL_REQUIRED(hdfsDelete);
+  GET_SYMBOL_REQUIRED(hdfsDisconnect);
+  GET_SYMBOL_REQUIRED(hdfsExists);
+  GET_SYMBOL_REQUIRED(hdfsFreeFileInfo);
+  GET_SYMBOL_REQUIRED(hdfsGetCapacity);
+  GET_SYMBOL_REQUIRED(hdfsGetUsed);
+  GET_SYMBOL_REQUIRED(hdfsGetPathInfo);
+  GET_SYMBOL_REQUIRED(hdfsListDirectory);
+
+  // File methods
+  GET_SYMBOL_REQUIRED(hdfsCloseFile);
+  GET_SYMBOL_REQUIRED(hdfsFlush);
+  GET_SYMBOL_REQUIRED(hdfsOpenFile);
+  GET_SYMBOL_REQUIRED(hdfsRead);
+  GET_SYMBOL_REQUIRED(hdfsPread);
+  GET_SYMBOL_REQUIRED(hdfsSeek);
+  GET_SYMBOL_REQUIRED(hdfsTell);
+  GET_SYMBOL_REQUIRED(hdfsWrite);
+
+  return Status::OK();
+}
+
+}  // namespace io
+}  // namespace arrow
+
+#endif  // HAS_HADOOP
diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc
index db779d8309c..edcac887056 100644
--- a/cpp/src/arrow/parquet/parquet-io-test.cc
+++ b/cpp/src/arrow/parquet/parquet-io-test.cc
@@ -126,8 +126,8 @@ class TestParquetIO : public ::testing::Test {
     size_t chunk_size = values.size() / num_chunks;
     for (int i = 0; i < num_chunks; i++) {
       auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
-      auto column_writer = static_cast*>(
-          row_group_writer->NextColumn());
+      auto column_writer =
+          static_cast*>(row_group_writer->NextColumn());
       T* data = values.data() + i * chunk_size;
       column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
       column_writer->Close();
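[Editor's note: the shim above follows the standard runtime-linking recipe: dlopen a candidate shared library, dlsym each entry point into a function pointer, and surface a Status error when a required symbol is missing. A condensed, self-contained sketch of that same pattern follows; it is POSIX-only and the library and symbol names are placeholders, not real Arrow or libhdfs symbols.]

// Minimal sketch of the dlopen/dlsym pattern used by the shim.
#include <dlfcn.h>
#include <iostream>

typedef int (*example_fn_t)(int);  // placeholder signature

int main() {
  // Load the library at runtime instead of linking against it
  void* handle = dlopen("libexample.so", RTLD_NOW | RTLD_LOCAL);
  if (handle == nullptr) {
    std::cerr << "dlopen failed: " << dlerror() << std::endl;
    return 1;
  }
  // Resolve one symbol into a typed function pointer, as GET_SYMBOL does
  example_fn_t fn = reinterpret_cast<example_fn_t>(dlsym(handle, "example_fn"));
  if (fn == nullptr) {
    std::cerr << "dlsym failed: " << dlerror() << std::endl;
    dlclose(handle);
    return 1;
  }
  std::cout << "example_fn(42) = " << fn(42) << std::endl;
  dlclose(handle);
  return 0;
}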
diff --git a/cpp/thirdparty/hadoop/include/hdfs.h b/cpp/thirdparty/hadoop/include/hdfs.h
new file mode 100644
index 00000000000..a4df6ae3b2b
--- /dev/null
+++ b/cpp/thirdparty/hadoop/include/hdfs.h
@@ -0,0 +1,1024 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LIBHDFS_HDFS_H
+#define LIBHDFS_HDFS_H
+
+#include <errno.h>  /* for EINTERNAL, etc. */
+#include <fcntl.h>  /* for O_RDONLY, O_WRONLY */
+#include <stdint.h> /* for uint64_t, etc. */
+#include <time.h>   /* for time_t */
+
+/*
+ * Support export of DLL symbols during libhdfs build, and import of DLL symbols
+ * during client application build. A client application may optionally define
+ * symbol LIBHDFS_DLL_IMPORT in its build. This is not strictly required, but
+ * the compiler can produce more efficient code with it.
+ */
+#ifdef WIN32
+  #ifdef LIBHDFS_DLL_EXPORT
+    #define LIBHDFS_EXTERNAL __declspec(dllexport)
+  #elif LIBHDFS_DLL_IMPORT
+    #define LIBHDFS_EXTERNAL __declspec(dllimport)
+  #else
+    #define LIBHDFS_EXTERNAL
+  #endif
+#else
+  #ifdef LIBHDFS_DLL_EXPORT
+    #define LIBHDFS_EXTERNAL __attribute__((visibility("default")))
+  #elif LIBHDFS_DLL_IMPORT
+    #define LIBHDFS_EXTERNAL __attribute__((visibility("default")))
+  #else
+    #define LIBHDFS_EXTERNAL
+  #endif
+#endif
+
+#ifndef O_RDONLY
+#define O_RDONLY 1
+#endif
+
+#ifndef O_WRONLY
+#define O_WRONLY 2
+#endif
+
+#ifndef EINTERNAL
+#define EINTERNAL 255
+#endif
+
+#define ELASTIC_BYTE_BUFFER_POOL_CLASS \
+  "org/apache/hadoop/io/ElasticByteBufferPool"
+
+/** All APIs set errno to meaningful values */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  /**
+   * Some utility decls used in libhdfs.
+   */
+  struct hdfsBuilder;
+  typedef int32_t tSize;   /// size of data for read/write io ops
+  typedef time_t tTime;    /// time type in seconds
+  typedef int64_t tOffset; /// offset within the file
+  typedef uint16_t tPort;  /// port
+  typedef enum tObjectKind {
+    kObjectKindFile = 'F',
+    kObjectKindDirectory = 'D',
+  } tObjectKind;
+  struct hdfsStreamBuilder;
+
+
+  /**
+   * The C reflection of org.apache.hadoop.fs.FileSystem.
+   */
+  struct hdfs_internal;
+  typedef struct hdfs_internal* hdfsFS;
+
+  struct hdfsFile_internal;
+  typedef struct hdfsFile_internal* hdfsFile;
+
+  struct hadoopRzOptions;
+
+  struct hadoopRzBuffer;
+
+  /**
+   * Determine if a file is open for read.
+   *
+   * @param file The HDFS file
+   * @return 1 if the file is open for read; 0 otherwise
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFileIsOpenForRead(hdfsFile file);
+
+  /**
+   * Determine if a file is open for write.
+   *
+   * @param file The HDFS file
+   * @return 1 if the file is open for write; 0 otherwise
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFileIsOpenForWrite(hdfsFile file);
+
+  struct hdfsReadStatistics {
+    uint64_t totalBytesRead;
+    uint64_t totalLocalBytesRead;
+    uint64_t totalShortCircuitBytesRead;
+    uint64_t totalZeroCopyBytesRead;
+  };
+
+  /**
+   * Get read statistics about a file. This is only applicable to files
+   * opened for reading.
+   *
+   * @param file The HDFS file
+   * @param stats (out parameter) on a successful return, the read
+   * statistics. Unchanged otherwise. You must free the
+   * returned statistics with hdfsFileFreeReadStatistics.
+   * @return 0 if the statistics were successfully returned,
+   * -1 otherwise. On a failure, please check errno against
+   * ENOTSUP. webhdfs, LocalFilesystem, and so forth may
+   * not support read statistics.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFileGetReadStatistics(hdfsFile file,
+      struct hdfsReadStatistics **stats);
+
+  /**
+   * @param stats HDFS read statistics for a file.
+   *
+   * @return the number of remote bytes read.
+   */
+  LIBHDFS_EXTERNAL
+  int64_t hdfsReadStatisticsGetRemoteBytesRead(
+      const struct hdfsReadStatistics *stats);
+
+  /**
+   * Clear the read statistics for a file.
+   *
+   * @param file The file to clear the read statistics of.
+   *
+   * @return 0 on success; the error code otherwise.
+   * EINVAL: the file is not open for reading.
+   * ENOTSUP: the file does not support clearing the read
+   * statistics.
+   * Errno will also be set to this code on failure.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFileClearReadStatistics(hdfsFile file);
+
+  /**
+   * Free some HDFS read statistics.
+   *
+   * @param stats The HDFS read statistics to free.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsFileFreeReadStatistics(struct hdfsReadStatistics *stats);
+
+  /**
+   * hdfsConnectAsUser - Connect to an hdfs file system as a specific user
+   * Connect to the hdfs.
+   * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+   * @param port The port on which the server is listening.
+   * @param user the user name (this is hadoop domain user). Or NULL is equivalent to hdfsConnect(host, port)
+   * @return Returns a handle to the filesystem or NULL on error.
+   * @deprecated Use hdfsBuilderConnect instead.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFS hdfsConnectAsUser(const char* nn, tPort port, const char *user);
+
+  /**
+   * hdfsConnect - Connect to an hdfs file system.
+   * Connect to the hdfs.
+   * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+   * @param port The port on which the server is listening.
+   * @return Returns a handle to the filesystem or NULL on error.
+   * @deprecated Use hdfsBuilderConnect instead.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFS hdfsConnect(const char* nn, tPort port);
+
+  /**
+   * hdfsConnectAsUserNewInstance - Connect to an hdfs file system.
+   *
+   * Forces a new instance to be created
+   *
+   * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+   * @param port The port on which the server is listening.
+   * @param user The user name to use when connecting
+   * @return Returns a handle to the filesystem or NULL on error.
+   * @deprecated Use hdfsBuilderConnect instead.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFS hdfsConnectAsUserNewInstance(const char* nn, tPort port, const char *user);
+
+  /**
+   * hdfsConnectNewInstance - Connect to an hdfs file system.
+   *
+   * Forces a new instance to be created
+   *
+   * @param nn The NameNode. See hdfsBuilderSetNameNode for details.
+   * @param port The port on which the server is listening.
+   * @return Returns a handle to the filesystem or NULL on error.
+   * @deprecated Use hdfsBuilderConnect instead.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFS hdfsConnectNewInstance(const char* nn, tPort port);
+
+  /**
+   * Connect to HDFS using the parameters defined by the builder.
+   *
+   * The HDFS builder will be freed, whether or not the connection was
+   * successful.
+   *
+   * Every successful call to hdfsBuilderConnect should be matched with a call
+   * to hdfsDisconnect, when the hdfsFS is no longer needed.
+   *
+   * @param bld The HDFS builder
+   * @return Returns a handle to the filesystem, or NULL on error.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFS hdfsBuilderConnect(struct hdfsBuilder *bld);
+
+  /**
+   * Create an HDFS builder.
+   *
+   * @return The HDFS builder, or NULL on error.
+   */
+  LIBHDFS_EXTERNAL
+  struct hdfsBuilder *hdfsNewBuilder(void);
+
+  /**
+   * Force the builder to always create a new instance of the FileSystem,
+   * rather than possibly finding one in the cache.
+   *
+   * @param bld The HDFS builder
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsBuilderSetForceNewInstance(struct hdfsBuilder *bld);
+
+  /**
+   * Set the HDFS NameNode to connect to.
+   *
+   * @param bld The HDFS builder
+   * @param nn The NameNode to use.
+   *
+   * If the string given is 'default', the default NameNode
+   * configuration will be used (from the XML configuration files)
+   *
+   * If NULL is given, a LocalFileSystem will be created.
+   *
+   * If the string starts with a protocol type such as file:// or
+   * hdfs://, this protocol type will be used. If not, the
+   * hdfs:// protocol type will be used.
+   *
+   * You may specify a NameNode port in the usual way by
+   * passing a string of the format hdfs://<hostname>:<port>.
+   * Alternately, you may set the port with
+   * hdfsBuilderSetNameNodePort. However, you must not pass the
+   * port in two different ways.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsBuilderSetNameNode(struct hdfsBuilder *bld, const char *nn);
+
+  /**
+   * Set the port of the HDFS NameNode to connect to.
+   *
+   * @param bld The HDFS builder
+   * @param port The port.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsBuilderSetNameNodePort(struct hdfsBuilder *bld, tPort port);
+
+  /**
+   * Set the username to use when connecting to the HDFS cluster.
+   *
+   * @param bld The HDFS builder
+   * @param userName The user name. The string will be shallow-copied.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsBuilderSetUserName(struct hdfsBuilder *bld, const char *userName);
+
+  /**
+   * Set the path to the Kerberos ticket cache to use when connecting to
+   * the HDFS cluster.
+   *
+   * @param bld The HDFS builder
+   * @param kerbTicketCachePath The Kerberos ticket cache path. The string
+   * will be shallow-copied.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsBuilderSetKerbTicketCachePath(struct hdfsBuilder *bld,
+      const char *kerbTicketCachePath);
+
+  /**
+   * Free an HDFS builder.
+   *
+   * It is normally not necessary to call this function since
+   * hdfsBuilderConnect frees the builder.
+   *
+   * @param bld The HDFS builder
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsFreeBuilder(struct hdfsBuilder *bld);
+
+  /**
+   * Set a configuration string for an HdfsBuilder.
+   *
+   * @param key The key to set.
+   * @param val The value, or NULL to set no value.
+   * This will be shallow-copied. You are responsible for
+   * ensuring that it remains valid until the builder is
+   * freed.
+   *
+   * @return 0 on success; nonzero error code otherwise.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsBuilderConfSetStr(struct hdfsBuilder *bld, const char *key,
+      const char *val);
+
+  /**
+   * Get a configuration string.
+   *
+   * @param key The key to find
+   * @param val (out param) The value. This will be set to NULL if the
+   * key isn't found. You must free this string with
+   * hdfsConfStrFree.
+   *
+   * @return 0 on success; nonzero error code otherwise.
+   * Failure to find the key is not an error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsConfGetStr(const char *key, char **val);
+
+  /**
+   * Get a configuration integer.
+   *
+   * @param key The key to find
+   * @param val (out param) The value. This will NOT be changed if the
+   * key isn't found.
+   *
+   * @return 0 on success; nonzero error code otherwise.
+   * Failure to find the key is not an error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsConfGetInt(const char *key, int32_t *val);
+
+  /**
+   * Free a configuration string found with hdfsConfGetStr.
+   *
+   * @param val A configuration string obtained from hdfsConfGetStr
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsConfStrFree(char *val);
+
+  /**
+   * hdfsDisconnect - Disconnect from the hdfs file system.
+   * Disconnect from hdfs.
+   * @param fs The configured filesystem handle.
+   * @return Returns 0 on success, -1 on error.
+   * Even if there is an error, the resources associated with the
+   * hdfsFS will be freed.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsDisconnect(hdfsFS fs);
+
+  /**
+   * hdfsOpenFile - Open an hdfs file in given mode.
+   * @deprecated Use the hdfsStreamBuilder functions instead.
+   * This function does not support setting block sizes bigger than 2 GB.
+   *
+   * @param fs The configured filesystem handle.
+   * @param path The full path to the file.
+   * @param flags - an | of bits/fcntl.h file flags - supported flags are O_RDONLY, O_WRONLY (meaning create or overwrite i.e., implies O_TRUNC),
+   * O_WRONLY|O_APPEND. Other flags are generally ignored other than (O_RDWR || (O_EXCL & O_CREAT)) which return NULL and set errno to ENOTSUP.
+   * @param bufferSize Size of buffer for read/write - pass 0 if you want
+   * to use the default configured values.
+   * @param replication Block replication - pass 0 if you want to use
+   * the default configured values.
+   * @param blocksize Size of block - pass 0 if you want to use the
+   * default configured values. Note that if you want a block size bigger
+   * than 2 GB, you must use the hdfsStreamBuilder API rather than this
+   * deprecated function.
+   * @return Returns the handle to the open file or NULL on error.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFile hdfsOpenFile(hdfsFS fs, const char* path, int flags,
+      int bufferSize, short replication, tSize blocksize);
+
+  /**
+   * hdfsStreamBuilderAlloc - Allocate an HDFS stream builder.
+   *
+   * @param fs The configured filesystem handle.
+   * @param path The full path to the file. Will be deep-copied.
+   * @param flags The open flags, as in hdfsOpenFile.
+   * @return Returns the hdfsStreamBuilder, or NULL on error.
+   */
+  LIBHDFS_EXTERNAL
+  struct hdfsStreamBuilder *hdfsStreamBuilderAlloc(hdfsFS fs,
+      const char *path, int flags);
+
+  /**
+   * hdfsStreamBuilderFree - Free an HDFS file builder.
+   *
+   * It is normally not necessary to call this function since
+   * hdfsStreamBuilderBuild frees the builder.
+   *
+   * @param bld The hdfsStreamBuilder to free.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsStreamBuilderFree(struct hdfsStreamBuilder *bld);
+
+  /**
+   * hdfsStreamBuilderSetBufferSize - Set the stream buffer size.
+   *
+   * @param bld The hdfs stream builder.
+   * @param bufferSize The buffer size to set.
+   *
+   * @return 0 on success, or -1 on error. Errno will be set on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsStreamBuilderSetBufferSize(struct hdfsStreamBuilder *bld,
+      int32_t bufferSize);
+
+  /**
+   * hdfsStreamBuilderSetReplication - Set the replication for the stream.
+   * This is only relevant for output streams, which will create new blocks.
+   *
+   * @param bld The hdfs stream builder.
+   * @param replication The replication to set.
+   *
+   * @return 0 on success, or -1 on error. Errno will be set on error.
+   * If you call this on an input stream builder, you will get
+   * EINVAL, because this configuration is not relevant to input
+   * streams.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsStreamBuilderSetReplication(struct hdfsStreamBuilder *bld,
+      int16_t replication);
+
+  /**
+   * hdfsStreamBuilderSetDefaultBlockSize - Set the default block size for
+   * the stream. This is only relevant for output streams, which will create
+   * new blocks.
+   *
+   * @param bld The hdfs stream builder.
+   * @param defaultBlockSize The default block size to set.
+   *
+   * @return 0 on success, or -1 on error. Errno will be set on error.
+   * If you call this on an input stream builder, you will get
+   * EINVAL, because this configuration is not relevant to input
+   * streams.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsStreamBuilderSetDefaultBlockSize(struct hdfsStreamBuilder *bld,
+      int64_t defaultBlockSize);
+
+  /**
+   * hdfsStreamBuilderBuild - Build the stream by calling open or create.
+   *
+   * @param bld The hdfs stream builder. This pointer will be freed, whether
+   * or not the open succeeds.
+   *
+   * @return the stream pointer on success, or NULL on error. Errno will be
+   * set on error.
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFile hdfsStreamBuilderBuild(struct hdfsStreamBuilder *bld);
+
+  /**
+   * hdfsTruncateFile - Truncate an hdfs file to given length.
+   * @param fs The configured filesystem handle.
+   * @param path The full path to the file.
+   * @param newlength The size the file is to be truncated to
+   * @return 1 if the file has been truncated to the desired newlength
+   * and is immediately available to be reused for write operations
+   * such as append.
+   * 0 if a background process of adjusting the length of the last
+   * block has been started, and clients should wait for it to
+   * complete before proceeding with further file updates.
+   * -1 on error.
+   */
+  int hdfsTruncateFile(hdfsFS fs, const char* path, tOffset newlength);
+
+  /**
+   * hdfsUnbufferFile - Reduce the buffering done on a file.
+   *
+   * @param file The file to unbuffer.
+   * @return 0 on success
+   * ENOTSUP if the file does not support unbuffering
+   * Errno will also be set to this value.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsUnbufferFile(hdfsFile file);
+
+  /**
+   * hdfsCloseFile - Close an open file.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @return Returns 0 on success, -1 on error.
+   * On error, errno will be set appropriately.
+   * If the hdfs file was valid, the memory associated with it will
+   * be freed at the end of this call, even if there was an I/O
+   * error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsCloseFile(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsExists - Checks if a given path exists on the filesystem
+   * @param fs The configured filesystem handle.
+   * @param path The path to look for
+   * @return Returns 0 on success, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsExists(hdfsFS fs, const char *path);
+
+
+  /**
+   * hdfsSeek - Seek to given offset in file.
+   * This works only for files opened in read-only mode.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @param desiredPos Offset into the file to seek into.
+   * @return Returns 0 on success, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsSeek(hdfsFS fs, hdfsFile file, tOffset desiredPos);
+
+
+  /**
+   * hdfsTell - Get the current offset in the file, in bytes.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @return Current offset, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tOffset hdfsTell(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsRead - Read data from an open file.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @param buffer The buffer to copy read bytes into.
+   * @param length The length of the buffer.
+   * @return On success, a positive number indicating how many bytes
+   * were read.
+   * On end-of-file, 0.
+   * On error, -1. Errno will be set to the error code.
+   * Just like the POSIX read function, hdfsRead will return -1
+   * and set errno to EINTR if data is temporarily unavailable,
+   * but we are not yet at the end of the file.
+   */
+  LIBHDFS_EXTERNAL
+  tSize hdfsRead(hdfsFS fs, hdfsFile file, void* buffer, tSize length);
+
+  /**
+   * hdfsPread - Positional read of data from an open file.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @param position Position from which to read
+   * @param buffer The buffer to copy read bytes into.
+   * @param length The length of the buffer.
+   * @return See hdfsRead
+   */
+  LIBHDFS_EXTERNAL
+  tSize hdfsPread(hdfsFS fs, hdfsFile file, tOffset position,
+      void* buffer, tSize length);
+
+
+  /**
+   * hdfsWrite - Write data into an open file.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @param buffer The data.
+   * @param length The no. of bytes to write.
+   * @return Returns the number of bytes written, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tSize hdfsWrite(hdfsFS fs, hdfsFile file, const void* buffer,
+      tSize length);
+
+
+  /**
+   * hdfsFlush - Flush the data.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @return Returns 0 on success, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFlush(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsHFlush - Flush out the data in client's user buffer. After the
+   * return of this call, new readers will see the data.
+   * @param fs configured filesystem handle
+   * @param file file handle
+   * @return 0 on success, -1 on error and sets errno
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsHFlush(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsHSync - Similar to posix fsync, flush out the data in client's
+   * user buffer all the way to the disk device (but the disk may have
+   * it in its cache).
+   * @param fs configured filesystem handle
+   * @param file file handle
+   * @return 0 on success, -1 on error and sets errno
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsHSync(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsAvailable - Number of bytes that can be read from this
+   * input stream without blocking.
+   * @param fs The configured filesystem handle.
+   * @param file The file handle.
+   * @return Returns available bytes; -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsAvailable(hdfsFS fs, hdfsFile file);
+
+
+  /**
+   * hdfsCopy - Copy file from one filesystem to another.
+   * @param srcFS The handle to source filesystem.
+   * @param src The path of source file.
+   * @param dstFS The handle to destination filesystem.
+   * @param dst The path of destination file.
+   * @return Returns 0 on success, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsCopy(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+  /**
+   * hdfsMove - Move file from one filesystem to another.
+   * @param srcFS The handle to source filesystem.
+   * @param src The path of source file.
+   * @param dstFS The handle to destination filesystem.
+   * @param dst The path of destination file.
+   * @return Returns 0 on success, -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsMove(hdfsFS srcFS, const char* src, hdfsFS dstFS, const char* dst);
+
+
+  /**
+   * hdfsDelete - Delete file.
+   * @param fs The configured filesystem handle.
+   * @param path The path of the file.
+   * @param recursive if path is a directory and set to
+   * non-zero, the directory is deleted else throws an exception. In
+   * case of a file the recursive argument is irrelevant.
+   * @return Returns 0 on success, -1 on error.
+ */ + LIBHDFS_EXTERNAL + int hdfsDelete(hdfsFS fs, const char* path, int recursive); + + /** + * hdfsRename - Rename file. + * @param fs The configured filesystem handle. + * @param oldPath The path of the source file. + * @param newPath The path of the destination file. + * @return Returns 0 on success, -1 on error. + */ + LIBHDFS_EXTERNAL + int hdfsRename(hdfsFS fs, const char* oldPath, const char* newPath); + + + /** + * hdfsGetWorkingDirectory - Get the current working directory for + * the given filesystem. + * @param fs The configured filesystem handle. + * @param buffer The user-buffer to copy path of cwd into. + * @param bufferSize The length of user-buffer. + * @return Returns buffer, NULL on error. + */ + LIBHDFS_EXTERNAL + char* hdfsGetWorkingDirectory(hdfsFS fs, char *buffer, size_t bufferSize); + + + /** + * hdfsSetWorkingDirectory - Set the working directory. All relative + * paths will be resolved relative to it. + * @param fs The configured filesystem handle. + * @param path The path of the new 'cwd'. + * @return Returns 0 on success, -1 on error. + */ + LIBHDFS_EXTERNAL + int hdfsSetWorkingDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsCreateDirectory - Make the given file and all non-existent + * parents into directories. + * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @return Returns 0 on success, -1 on error. + */ + LIBHDFS_EXTERNAL + int hdfsCreateDirectory(hdfsFS fs, const char* path); + + + /** + * hdfsSetReplication - Set the replication of the specified + * file to the supplied value + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns 0 on success, -1 on error. + */ + LIBHDFS_EXTERNAL + int hdfsSetReplication(hdfsFS fs, const char* path, int16_t replication); + + + /** + * hdfsFileInfo - Information about a file/directory. + */ + typedef struct { + tObjectKind mKind; /* file or directory */ + char *mName; /* the name of the file */ + tTime mLastMod; /* the last modification time for the file in seconds */ + tOffset mSize; /* the size of the file in bytes */ + short mReplication; /* the count of replicas */ + tOffset mBlockSize; /* the block size for the file */ + char *mOwner; /* the owner of the file */ + char *mGroup; /* the group associated with the file */ + short mPermissions; /* the permissions associated with the file */ + tTime mLastAccess; /* the last access time for the file in seconds */ + } hdfsFileInfo; + + + /** + * hdfsListDirectory - Get list of files/directories for a given + * directory-path. hdfsFreeFileInfo should be called to deallocate memory. + * @param fs The configured filesystem handle. + * @param path The path of the directory. + * @param numEntries Set to the number of files/directories in path. + * @return Returns a dynamically-allocated array of hdfsFileInfo + * objects; NULL on error or empty directory. + * errno is set to non-zero on error or zero on success. + */ + LIBHDFS_EXTERNAL + hdfsFileInfo *hdfsListDirectory(hdfsFS fs, const char* path, + int *numEntries); + + + /** + * hdfsGetPathInfo - Get information about a path as a (dynamically + * allocated) single hdfsFileInfo struct. hdfsFreeFileInfo should be + * called when the pointer is no longer needed. + * @param fs The configured filesystem handle. + * @param path The path of the file. + * @return Returns a dynamically-allocated hdfsFileInfo object; + * NULL on error. 
+   */
+  LIBHDFS_EXTERNAL
+  hdfsFileInfo *hdfsGetPathInfo(hdfsFS fs, const char* path);
+
+
+  /**
+   * hdfsFreeFileInfo - Free up the hdfsFileInfo array (including fields)
+   * @param hdfsFileInfo The array of dynamically-allocated hdfsFileInfo
+   * objects.
+   * @param numEntries The size of the array.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsFreeFileInfo(hdfsFileInfo *hdfsFileInfo, int numEntries);
+
+  /**
+   * hdfsFileIsEncrypted: determine if a file is encrypted based on its
+   * hdfsFileInfo.
+   * @return -1 if there was an error (errno will be set), 0 if the file is
+   * not encrypted, 1 if the file is encrypted.
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsFileIsEncrypted(hdfsFileInfo *hdfsFileInfo);
+
+
+  /**
+   * hdfsGetHosts - Get hostnames where a particular block (determined by
+   * pos & blocksize) of a file is stored. The last element in the array
+   * is NULL. Due to replication, a single block could be present on
+   * multiple hosts.
+   * @param fs The configured filesystem handle.
+   * @param path The path of the file.
+   * @param start The start of the block.
+   * @param length The length of the block.
+   * @return Returns a dynamically-allocated 2-d array of blocks-hosts;
+   * NULL on error.
+   */
+  LIBHDFS_EXTERNAL
+  char*** hdfsGetHosts(hdfsFS fs, const char* path,
+      tOffset start, tOffset length);
+
+
+  /**
+   * hdfsFreeHosts - Free up the structure returned by hdfsGetHosts
+   * @param blockHosts The dynamically-allocated array of blocks-hosts
+   * returned by hdfsGetHosts.
+   */
+  LIBHDFS_EXTERNAL
+  void hdfsFreeHosts(char ***blockHosts);
+
+
+  /**
+   * hdfsGetDefaultBlockSize - Get the default blocksize.
+   *
+   * @param fs The configured filesystem handle.
+   * @deprecated Use hdfsGetDefaultBlockSizeAtPath instead.
+   *
+   * @return Returns the default blocksize, or -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tOffset hdfsGetDefaultBlockSize(hdfsFS fs);
+
+
+  /**
+   * hdfsGetDefaultBlockSizeAtPath - Get the default blocksize at the
+   * filesystem indicated by a given path.
+   *
+   * @param fs The configured filesystem handle.
+   * @param path The given path will be used to locate the actual
+   * filesystem. The full path does not have to exist.
+   *
+   * @return Returns the default blocksize, or -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tOffset hdfsGetDefaultBlockSizeAtPath(hdfsFS fs, const char *path);
+
+
+  /**
+   * hdfsGetCapacity - Return the raw capacity of the filesystem.
+   * @param fs The configured filesystem handle.
+   * @return Returns the raw-capacity; -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tOffset hdfsGetCapacity(hdfsFS fs);
+
+
+  /**
+   * hdfsGetUsed - Return the total raw size of all files in the filesystem.
+   * @param fs The configured filesystem handle.
+   * @return Returns the total-size; -1 on error.
+   */
+  LIBHDFS_EXTERNAL
+  tOffset hdfsGetUsed(hdfsFS fs);
+
+  /**
+   * Change the user and/or group of a file or directory.
+   *
+   * @param fs The configured filesystem handle.
+   * @param path the path to the file or directory
+   * @param owner User string. Set to NULL for 'no change'
+   * @param group Group string. Set to NULL for 'no change'
+   * @return 0 on success else -1
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsChown(hdfsFS fs, const char* path, const char *owner,
+      const char *group);
+
+  /**
+   * hdfsChmod
+   * @param fs The configured filesystem handle.
+   * @param path the path to the file or directory
+   * @param mode the bitmask to set it to
+   * @return 0 on success else -1
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsChmod(hdfsFS fs, const char* path, short mode);
+
+  /**
+   * hdfsUtime
+   * @param fs The configured filesystem handle.
+   * @param path the path to the file or directory
+   * @param mtime new modification time or -1 for no change
+   * @param atime new access time or -1 for no change
+   * @return 0 on success else -1
+   */
+  LIBHDFS_EXTERNAL
+  int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime);
+
+  /**
+   * Allocate a zero-copy options structure.
+   *
+   * You must free all options structures allocated with this function using
+   * hadoopRzOptionsFree.
+   *
+   * @return A zero-copy options structure, or NULL if one could
+   * not be allocated. If NULL is returned, errno will
+   * contain the error number.
+   */
+  LIBHDFS_EXTERNAL
+  struct hadoopRzOptions *hadoopRzOptionsAlloc(void);
+
+  /**
+   * Determine whether we should skip checksums in read0.
+   *
+   * @param opts The options structure.
+   * @param skip Nonzero to skip checksums sometimes; zero to always
+   * check them.
+   *
+   * @return 0 on success; -1 plus errno on failure.
+   */
+  LIBHDFS_EXTERNAL
+  int hadoopRzOptionsSetSkipChecksum(
+      struct hadoopRzOptions *opts, int skip);
+
+  /**
+   * Set the ByteBufferPool to use with read0.
+   *
+   * @param opts The options structure.
+   * @param className If this is NULL, we will not use any
+   * ByteBufferPool. If this is non-NULL, it will be
+   * treated as the name of the pool class to use.
+   * For example, you can use
+   * ELASTIC_BYTE_BUFFER_POOL_CLASS.
+   *
+   * @return 0 if the ByteBufferPool class was found and
+   * instantiated;
+   * -1 plus errno otherwise.
+   */
+  LIBHDFS_EXTERNAL
+  int hadoopRzOptionsSetByteBufferPool(
+      struct hadoopRzOptions *opts, const char *className);
+
+  /**
+   * Free a hadoopRzOptions structure.
+   *
+   * @param opts The options structure to free.
+   * Any associated ByteBufferPool will also be freed.
+   */
+  LIBHDFS_EXTERNAL
+  void hadoopRzOptionsFree(struct hadoopRzOptions *opts);
+
+  /**
+   * Perform a byte buffer read.
+   * If possible, this will be a zero-copy (mmap) read.
+   *
+   * @param file The file to read from.
+   * @param opts An options structure created by hadoopRzOptionsAlloc.
+   * @param maxLength The maximum length to read. We may read fewer bytes
+   * than this length.
+   *
+   * @return On success, we will return a new hadoopRzBuffer.
+   * This buffer will continue to be valid and readable
+   * until it is released by hadoopRzBufferFree. Failure to
+   * release a buffer will lead to a memory leak.
+   * You can access the data within the hadoopRzBuffer with
+   * hadoopRzBufferGet. If you have reached EOF, the data
+   * within the hadoopRzBuffer will be NULL. You must still
+   * free hadoopRzBuffer instances containing NULL.
+   *
+   * On failure, we will return NULL plus an errno code.
+   * errno = EOPNOTSUPP indicates that we could not do a
+   * zero-copy read, and there was no ByteBufferPool
+   * supplied.
+   */
+  LIBHDFS_EXTERNAL
+  struct hadoopRzBuffer* hadoopReadZero(hdfsFile file,
+      struct hadoopRzOptions *opts, int32_t maxLength);
+
+  /**
+   * Determine the length of the buffer returned from readZero.
+   *
+   * @param buffer a buffer returned from readZero.
+   * @return the length of the buffer.
+   */
+  LIBHDFS_EXTERNAL
+  int32_t hadoopRzBufferLength(const struct hadoopRzBuffer *buffer);
+
+  /**
+   * Get a pointer to the raw buffer returned from readZero.
+ * + * To find out how many bytes this buffer contains, call + * hadoopRzBufferLength. + * + * @param buffer a buffer returned from readZero. + * @return a pointer to the start of the buffer. This will be + * NULL when end-of-file has been reached. + */ + LIBHDFS_EXTERNAL + const void *hadoopRzBufferGet(const struct hadoopRzBuffer *buffer); + + /** + * Release a buffer obtained through readZero. + * + * @param file The hdfs stream that created this buffer. This must be + * the same stream you called hadoopReadZero on. + * @param buffer The buffer to release. + */ + LIBHDFS_EXTERNAL + void hadoopRzBufferFree(hdfsFile file, struct hadoopRzBuffer *buffer); + +#ifdef __cplusplus +} +#endif + +#undef LIBHDFS_EXTERNAL +#endif /*LIBHDFS_HDFS_H*/ + +/** + * vim: ts=4: sw=4: et + */ diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 981779ffb4c..8f47f93b26d 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -173,7 +173,10 @@ def merge_pr(pr_num, target_ref): for c in commits: merge_message_flags += ["-m", c] - run_cmd(['git', 'commit', '--author="%s"' % primary_author] + merge_message_flags) + run_cmd(['git', 'commit', + '--no-verify', # do not run commit hooks + '--author="%s"' % primary_author] + + merge_message_flags) continue_maybe("Merge complete (local ref %s). Push to %s?" % ( target_branch_name, PUSH_REMOTE_NAME)) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index f1becfcf449..fdbfce99656 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -348,8 +348,10 @@ find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) +ADD_THIRDPARTY_LIB(arrow_io + SHARED_LIB ${ARROW_IO_SHARED_LIB}) ADD_THIRDPARTY_LIB(arrow_parquet - SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -428,6 +430,7 @@ set(PYARROW_SRCS set(LINK_LIBS arrow + arrow_io arrow_parquet ) @@ -449,6 +452,7 @@ set(CYTHON_EXTENSIONS array config error + io parquet scalar schema diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index f0b258ed027..6bd305615fc 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -47,13 +47,24 @@ find_library(ARROW_PARQUET_LIB_PATH NAMES arrow_parquet ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) +find_library(ARROW_IO_LIB_PATH NAMES arrow_io + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) + set(ARROW_IO_LIB_NAME libarrow_io) set(ARROW_PARQUET_LIB_NAME libarrow_parquet) + set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) set(ARROW_SHARED_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + + set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a) + set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) else () @@ -62,7 +73,9 @@ endif () if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) - message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}, ${ARROW_PARQUET_LIB_PATH}") + message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}") + message(STATUS "Found the 
Arrow IO library: ${ARROW_IO_LIB_PATH}") + message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -82,6 +95,8 @@ mark_as_advanced( ARROW_LIBS ARROW_STATIC_LIB ARROW_SHARED_LIB + ARROW_IO_STATIC_LIB + ARROW_IO_SHARED_LIB ARROW_PARQUET_STATIC_LIB ARROW_PARQUET_SHARED_LIB ) diff --git a/python/conda.recipe/meta.yaml b/python/conda.recipe/meta.yaml index 85d24b6bc32..98ae4141e3b 100644 --- a/python/conda.recipe/meta.yaml +++ b/python/conda.recipe/meta.yaml @@ -26,6 +26,7 @@ requirements: run: - arrow-cpp + - parquet-cpp - python - numpy - pandas diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index 97ba0ef2e9f..1fb6fad396a 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -18,5 +18,5 @@ from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.pyarrow cimport * -cdef check_cstatus(const CStatus& status) -cdef check_status(const Status& status) +cdef int check_cstatus(const CStatus& status) nogil except -1 +cdef int check_status(const Status& status) nogil except -1 diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 5a6a038a92e..244019321a7 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -22,16 +22,18 @@ from pyarrow.compat import frombytes class ArrowException(Exception): pass -cdef check_cstatus(const CStatus& status): +cdef int check_cstatus(const CStatus& status) nogil except -1: if status.ok(): - return + return 0 cdef c_string c_message = status.ToString() - raise ArrowException(frombytes(c_message)) + with gil: + raise ArrowException(frombytes(c_message)) -cdef check_status(const Status& status): +cdef int check_status(const Status& status) nogil except -1: if status.ok(): - return + return 0 cdef c_string c_message = status.ToString() - raise ArrowException(frombytes(c_message)) + with gil: + raise ArrowException(frombytes(c_message)) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 1f6ecee5105..133797bc37b 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -33,3 +33,21 @@ cdef extern from "": cdef extern from "": void Py_XDECREF(PyObject* o) +cdef extern from "arrow/api.h" namespace "arrow" nogil: + # We can later add more of the common status factory methods as needed + cdef CStatus CStatus_OK "Status::OK"() + + cdef cppclass CStatus "arrow::Status": + CStatus() + + c_string ToString() + + c_bool ok() + c_bool IsOutOfMemory() + c_bool IsKeyError() + c_bool IsNotImplemented() + c_bool IsInvalid() + + cdef cppclass Buffer: + uint8_t* data() + int64_t size() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 90414e3d542..91ce069df8f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -19,25 +19,6 @@ from pyarrow.includes.common cimport * -cdef extern from "arrow/api.h" namespace "arrow" nogil: - # We can later add more of the common status factory methods as needed - cdef CStatus CStatus_OK "Status::OK"() - - cdef cppclass CStatus "arrow::Status": - CStatus() - - c_string ToString() - - c_bool ok() - c_bool IsOutOfMemory() - c_bool IsKeyError() - c_bool IsNotImplemented() - c_bool IsInvalid() - - cdef cppclass Buffer: - uint8_t* data() - int64_t size() - cdef extern from "arrow/api.h" namespace "arrow" nogil: enum Type" arrow::Type::type": diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd new file mode 100644 index 
00000000000..d874ba30912 --- /dev/null +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from pyarrow.includes.common cimport * + +cdef extern from "arrow/io/interfaces.h" nogil: + enum ObjectType" arrow::io::ObjectType::type": + ObjectType_FILE" arrow::io::ObjectType::FILE" + ObjectType_DIRECTORY" arrow::io::ObjectType::DIRECTORY" + +cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: + CStatus ConnectLibHdfs() + + cdef cppclass HdfsConnectionConfig: + c_string host + int port + c_string user + + cdef cppclass HdfsPathInfo: + ObjectType kind; + c_string name + c_string owner + c_string group + int32_t last_modified_time + int32_t last_access_time + int64_t size + int16_t replication + int64_t block_size + int16_t permissions + + cdef cppclass CHdfsFile: + CStatus Close() + CStatus Seek(int64_t position) + CStatus Tell(int64_t* position) + + cdef cppclass HdfsReadableFile(CHdfsFile): + CStatus GetSize(int64_t* size) + CStatus Read(int32_t nbytes, int32_t* bytes_read, + uint8_t* buffer) + + CStatus ReadAt(int64_t position, int32_t nbytes, + int32_t* bytes_read, uint8_t* buffer) + + cdef cppclass HdfsWriteableFile(CHdfsFile): + CStatus Write(const uint8_t* buffer, int32_t nbytes) + + CStatus Write(const uint8_t* buffer, int32_t nbytes, + int32_t* bytes_written) + + cdef cppclass CHdfsClient" arrow::io::HdfsClient": + @staticmethod + CStatus Connect(const HdfsConnectionConfig* config, + shared_ptr[CHdfsClient]* client) + + CStatus CreateDirectory(const c_string& path) + + CStatus Delete(const c_string& path, c_bool recursive) + + CStatus Disconnect() + + c_bool Exists(const c_string& path) + + CStatus GetCapacity(int64_t* nbytes) + CStatus GetUsed(int64_t* nbytes) + + CStatus ListDirectory(const c_string& path, + vector[HdfsPathInfo]* listing) + + CStatus Rename(const c_string& src, const c_string& dst) + + CStatus OpenReadable(const c_string& path, + shared_ptr[HdfsReadableFile]* handle) + + CStatus OpenWriteable(const c_string& path, c_bool append, + int32_t buffer_size, int16_t replication, + int64_t default_block_size, + shared_ptr[HdfsWriteableFile]* handle) diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx new file mode 100644 index 00000000000..8b97671e453 --- /dev/null +++ b/python/pyarrow/io.pyx @@ -0,0 +1,504 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Cython wrappers for IO interfaces defined in arrow/io
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from libc.stdlib cimport malloc, free
+
+from pyarrow.includes.libarrow cimport *
+cimport pyarrow.includes.pyarrow as pyarrow
+from pyarrow.includes.libarrow_io cimport *
+
+from pyarrow.compat import frombytes, tobytes
+from pyarrow.error cimport check_cstatus
+
+cimport cpython as cp
+
+import re
+import sys
+import threading
+
+_HDFS_PATH_RE = re.compile(r'hdfs://(.*):(\d+)(.*)')
+
+try:
+    # Python 3
+    from queue import Queue, Empty as QueueEmpty, Full as QueueFull
+except ImportError:
+    from Queue import Queue, Empty as QueueEmpty, Full as QueueFull
+
+
+def have_libhdfs():
+    try:
+        check_cstatus(ConnectLibHdfs())
+        return True
+    except Exception:
+        return False
+
+
+def strip_hdfs_abspath(path):
+    m = _HDFS_PATH_RE.match(path)
+    if m:
+        return m.group(3)
+    else:
+        return path
+
+
+cdef class HdfsClient:
+    cdef:
+        shared_ptr[CHdfsClient] client
+
+    cdef readonly:
+        object host
+        int port
+        object user
+        bint is_open
+
+    def __cinit__(self):
+        self.is_open = False
+
+    def __dealloc__(self):
+        if self.is_open:
+            self.close()
+
+    def close(self):
+        self._ensure_client()
+        with nogil:
+            check_cstatus(self.client.get().Disconnect())
+        self.is_open = False
+
+    cdef _ensure_client(self):
+        if self.client.get() == NULL:
+            raise IOError('HDFS client improperly initialized')
+        elif not self.is_open:
+            raise IOError('HDFS client is closed')
+
+    @classmethod
+    def connect(cls, host, port, user):
+        """
+        Connect to an HDFS cluster.
+
+        Parameters
+        ----------
+        host : string
+            HDFS NameNode host name or IP address
+        port : int
+            HDFS NameNode port
+        user : string
+            Username to connect as
+
+        Notes
+        -----
+        The first time you call this method, it will take longer than usual
+        due to JNI spin-up time.
+
+        Returns
+        -------
+        client : HdfsClient
+        """
+        cdef:
+            HdfsClient out = HdfsClient()
+            HdfsConnectionConfig conf
+
+        conf.host = tobytes(host)
+        conf.port = port
+        conf.user = tobytes(user)
+
+        with nogil:
+            check_cstatus(
+                CHdfsClient.Connect(&conf, &out.client))
+        out.is_open = True
+
+        return out
+
+    def exists(self, path):
+        """
+        Returns True if the path is known to the cluster, False if it is not
+        (or if there is an RPC error)
+        """
+        self._ensure_client()
+
+        cdef c_string c_path = tobytes(path)
+        cdef c_bool result
+        with nogil:
+            result = self.client.get().Exists(c_path)
+        return result
+
+    def ls(self, path, bint full_info=True):
+        """
+        Retrieve directory contents and metadata, if requested.
+
+        Parameters
+        ----------
+        path : HDFS path
+        full_info : boolean, default True
+            If False, only return list of paths
+
+        Returns
+        -------
+        result : list of dicts (full_info=True) or strings (full_info=False)
+        """
+        cdef:
+            c_string c_path = tobytes(path)
+            vector[HdfsPathInfo] listing
+            list results = []
+            int i
+
+        self._ensure_client()
+
+        with nogil:
+            check_cstatus(self.client.get()
+                          .ListDirectory(c_path, &listing))
+
+        cdef const HdfsPathInfo* info
+        for i in range(listing.size()):
+            info = &listing[i]
+
+            # Try to trim off the hdfs://HOST:PORT piece
+            name = strip_hdfs_abspath(frombytes(info.name))
+
+            if full_info:
+                kind = ('file' if info.kind == ObjectType_FILE
+                        else 'directory')
+
+                results.append({
+                    'kind': kind,
+                    'name': name,
+                    'owner': frombytes(info.owner),
+                    'group': frombytes(info.group),
+                    'last_modified_time': info.last_modified_time,
+                    'last_access_time': info.last_access_time,
+                    'size': info.size,
+                    'replication': info.replication,
+                    'block_size': info.block_size,
+                    'permissions': info.permissions
+                })
+            else:
+                results.append(name)
+
+        return results
+
+    def mkdir(self, path):
+        """
+        Create indicated directory and any necessary parent directories
+        """
+        self._ensure_client()
+
+        cdef c_string c_path = tobytes(path)
+        with nogil:
+            check_cstatus(self.client.get()
+                          .CreateDirectory(c_path))
+
+    def delete(self, path, bint recursive=False):
+        """
+        Delete the indicated file or directory
+
+        Parameters
+        ----------
+        path : string
+        recursive : boolean, default False
+            If True, also delete child paths for directories
+        """
+        self._ensure_client()
+
+        cdef c_string c_path = tobytes(path)
+        with nogil:
+            check_cstatus(self.client.get()
+                          .Delete(c_path, recursive))
+
+    def open(self, path, mode='rb', buffer_size=None, replication=None,
+             default_block_size=None):
+        """
+        Parameters
+        ----------
+        mode : string
+            Must be 'rb' (read), 'wb' (write, new file), or 'ab' (append)
+        """
+        self._ensure_client()
+
+        cdef HdfsFile out = HdfsFile()
+
+        if mode not in ('rb', 'wb', 'ab'):
+            raise ValueError("Mode must be 'rb' (read), "
+                             "'wb' (write, new file), or 'ab' (append)")
+
+        cdef c_string c_path = tobytes(path)
+        cdef c_bool append = False
+
+        # 0 in libhdfs means "use the default"
+        cdef int32_t c_buffer_size = buffer_size or 0
+        cdef int16_t c_replication = replication or 0
+        cdef int64_t c_default_block_size = default_block_size or 0
+
+        if mode in ('wb', 'ab'):
+            if mode == 'ab':
+                append = True
+
+            with nogil:
+                check_cstatus(
+                    self.client.get()
+                    .OpenWriteable(c_path, append, c_buffer_size,
+                                   c_replication, c_default_block_size,
+                                   &out.wr_file))
+
+            out.is_readonly = False
+        else:
+            with nogil:
+                check_cstatus(self.client.get()
+                              .OpenReadable(c_path, &out.rd_file))
+            out.is_readonly = True
+
+        if c_buffer_size == 0:
+            c_buffer_size = 2 ** 16
+
+        out.mode = mode
+        out.buffer_size = c_buffer_size
+        out.parent = self
+        out.is_open = True
+
+        return out
+
+    def upload(self, path, stream, buffer_size=2**16):
+        """
+        Upload file-like object to HDFS path
+        """
+        write_queue = Queue(50)
+
+        f = self.open(path, 'wb')
+
+        done = False
+        # A one-element container lets the writer thread hand its exception
+        # back to this thread; rebinding a plain name inside bg_write would
+        # only create a local variable there.
+        exc_holder = []
+
+        def bg_write():
+            try:
+                while not done or write_queue.qsize() > 0:
+                    try:
+                        buf = write_queue.get(timeout=0.01)
+                    except QueueEmpty:
+                        continue
+
+                    f.write(buf)
+            except Exception as e:
+                exc_holder.append(e)
+
+        writer_thread = threading.Thread(target=bg_write)
+        writer_thread.start()
+
+        try:
+            while True:
+                buf = stream.read(buffer_size)
+                if not buf:
+                    break
+
+                write_queue.put_nowait(buf)
+        finally:
+            done = True
+
+        writer_thread.join()
+        if exc_holder:
+            # Re-raising the exception instance works on both Python 2 and 3,
+            # unlike the Python 2-only three-expression raise statement.
+            raise exc_holder[0]
+
+    def download(self, path, stream, buffer_size=None):
+        f = self.open(path, 'rb', buffer_size=buffer_size)
+        f.download(stream)
+
+
+cdef class HdfsFile:
+    cdef:
+        shared_ptr[HdfsReadableFile] rd_file
+        shared_ptr[HdfsWriteableFile] wr_file
+        bint is_readonly
+        bint is_open
+        object parent
+
+    cdef readonly:
+        int32_t buffer_size
+        object mode
+
+    def __cinit__(self):
+        self.is_open = False
+
+    def __dealloc__(self):
+        if self.is_open:
+            self.close()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, tb):
+        self.close()
+
+    def close(self):
+        if self.is_open:
+            with nogil:
+                if self.is_readonly:
+                    check_cstatus(self.rd_file.get().Close())
+                else:
+                    check_cstatus(self.wr_file.get().Close())
+        self.is_open = False
+
+    cdef _assert_readable(self):
+        if not self.is_readonly:
+            raise IOError("only valid on read-only files")
+
+    cdef _assert_writeable(self):
+        if self.is_readonly:
+            raise IOError("only valid on write-only files")
+
+    def size(self):
+        cdef int64_t size
+        self._assert_readable()
+        with nogil:
+            check_cstatus(self.rd_file.get().GetSize(&size))
+        return size
+
+    def tell(self):
+        cdef int64_t position
+        with nogil:
+            if self.is_readonly:
+                check_cstatus(self.rd_file.get().Tell(&position))
+            else:
+                check_cstatus(self.wr_file.get().Tell(&position))
+        return position
+
+    def seek(self, int64_t position):
+        self._assert_readable()
+        with nogil:
+            check_cstatus(self.rd_file.get().Seek(position))
+
+    def read(self, int nbytes):
+        """
+        Read indicated number of bytes from the file, up to EOF
+        """
+        cdef:
+            int32_t bytes_read = 0
+            uint8_t* buf
+
+        self._assert_readable()
+
+        # This isn't ideal -- PyBytes_FromStringAndSize copies the data from
+        # the passed buffer, so it's hard for us to avoid doubling the memory
+        buf = <uint8_t*> malloc(nbytes)
+        if buf == NULL:
+            raise MemoryError("Failed to allocate {0} bytes".format(nbytes))
+
+        cdef int32_t total_bytes = 0
+
+        cdef int rpc_chunksize = min(self.buffer_size, nbytes)
+
+        try:
+            with nogil:
+                while total_bytes < nbytes:
+                    check_cstatus(self.rd_file.get()
+                                  .Read(rpc_chunksize, &bytes_read,
+                                        buf + total_bytes))
+
+                    total_bytes += bytes_read
+
+                    # EOF
+                    if bytes_read == 0:
+                        break
+
+            result = cp.PyBytes_FromStringAndSize(<char*> buf,
+                                                  total_bytes)
+        finally:
+            free(buf)
+
+        return result
+
+    def download(self, stream_or_path):
+        """
+        Read file completely to local path (rather than reading completely
+        into memory). First seeks to the beginning of the file.
+ """ + cdef: + int32_t bytes_read = 0 + uint8_t* buf + self._assert_readable() + + write_queue = Queue(50) + + if not hasattr(stream_or_path, 'read'): + stream = open(stream_or_path, 'wb') + cleanup = lambda: stream.close() + else: + stream = stream_or_path + cleanup = lambda: None + + done = False + exc_info = None + def bg_write(): + try: + while not done or write_queue.qsize() > 0: + try: + buf = write_queue.get(timeout=0.01) + except QueueEmpty: + continue + stream.write(buf) + except Exception as e: + exc_info = sys.exc_info() + finally: + cleanup() + + self.seek(0) + + writer_thread = threading.Thread(target=bg_write) + + # This isn't ideal -- PyBytes_FromStringAndSize copies the data from + # the passed buffer, so it's hard for us to avoid doubling the memory + buf = malloc(self.buffer_size) + if buf == NULL: + raise MemoryError("Failed to allocate {0} bytes" + .format(self.buffer_size)) + + writer_thread.start() + + cdef int64_t total_bytes = 0 + + try: + while True: + with nogil: + check_cstatus(self.rd_file.get() + .Read(self.buffer_size, &bytes_read, buf)) + + total_bytes += bytes_read + + # EOF + if bytes_read == 0: + break + + pybuf = cp.PyBytes_FromStringAndSize(buf, + bytes_read) + + write_queue.put_nowait(pybuf) + finally: + free(buf) + done = True + + writer_thread.join() + if exc_info is not None: + raise exc_info[0], exc_info[1], exc_info[2] + + def write(self, data): + """ + Write bytes-like (unicode, encoded to UTF-8) to file + """ + self._assert_writeable() + + data = tobytes(data) + + cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) + cdef int32_t bufsize = len(data) + with nogil: + check_cstatus(self.wr_file.get().Write(buf, bufsize)) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index bf5a22089cd..86147f8df5a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -15,25 +15,24 @@ # specific language governing permissions and limitations # under the License. 
-from pyarrow.compat import unittest import pyarrow import pyarrow.formatting as fmt -class TestArrayAPI(unittest.TestCase): +def test_repr_on_pre_init_array(): + arr = pyarrow.array.Array() + assert len(repr(arr)) > 0 - def test_repr_on_pre_init_array(self): - arr = pyarrow.array.Array() - assert len(repr(arr)) > 0 - def test_getitem_NA(self): - arr = pyarrow.from_pylist([1, None, 2]) - assert arr[1] is pyarrow.NA +def test_getitem_NA(): + arr = pyarrow.from_pylist([1, None, 2]) + assert arr[1] is pyarrow.NA - def test_list_format(self): - arr = pyarrow.from_pylist([[1], None, [2, 3, None]]) - result = fmt.array_format(arr) - expected = """\ + +def test_list_format(): + arr = pyarrow.from_pylist([[1], None, [2, 3, None]]) + result = fmt.array_format(arr) + expected = """\ [ [1], NA, @@ -41,23 +40,25 @@ def test_list_format(self): 3, NA] ]""" - assert result == expected + assert result == expected + - def test_string_format(self): - arr = pyarrow.from_pylist(['', None, 'foo']) - result = fmt.array_format(arr) - expected = """\ +def test_string_format(): + arr = pyarrow.from_pylist(['', None, 'foo']) + result = fmt.array_format(arr) + expected = """\ [ '', NA, 'foo' ]""" - assert result == expected + assert result == expected + - def test_long_array_format(self): - arr = pyarrow.from_pylist(range(100)) - result = fmt.array_format(arr, window=2) - expected = """\ +def test_long_array_format(): + arr = pyarrow.from_pylist(range(100)) + result = fmt.array_format(arr, window=2) + expected = """\ [ 0, 1, @@ -65,4 +66,4 @@ def test_long_array_format(self): 98, 99 ]""" - assert result == expected + assert result == expected diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py new file mode 100644 index 00000000000..328e923b941 --- /dev/null +++ b/python/pyarrow/tests/test_io.py @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
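The tests that follow exercise the HdfsClient and HdfsFile wrappers added in
io.pyx above. As a quick orientation, here is a minimal editorial usage
sketch; the host, port, and user values are placeholders, and a reachable
HDFS cluster with a working libhdfs is assumed:

    import pyarrow.io as io

    # Placeholder connection parameters; the test suite below derives the
    # real ones from the ARROW_HDFS_TEST_* environment variables instead.
    client = io.HdfsClient.connect('namenode', 8020, 'hadoop')

    client.mkdir('/tmp/pyarrow-demo')
    with client.open('/tmp/pyarrow-demo/data', 'wb') as f:
        f.write(b'hello, hdfs')
    with client.open('/tmp/pyarrow-demo/data', 'rb') as f:
        assert f.read(100) == b'hello, hdfs'

    print(client.ls('/tmp/pyarrow-demo', full_info=False))
    client.delete('/tmp/pyarrow-demo', recursive=True)
    client.close()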
+ +from io import BytesIO +from os.path import join as pjoin +import os +import random + +import pytest + +import pyarrow.io as io + +#---------------------------------------------------------------------- +# HDFS tests + + +def hdfs_test_client(): + host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') + user = os.environ['ARROW_HDFS_TEST_USER'] + try: + port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500)) + except ValueError: + raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' + 'an integer') + + return io.HdfsClient.connect(host, port, user) + + +libhdfs = pytest.mark.skipif(not io.have_libhdfs(), + reason='No libhdfs available on system') + + +HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + +@pytest.fixture(scope='session') +def hdfs(request): + fixture = hdfs_test_client() + def teardown(): + fixture.delete(HDFS_TMP_PATH, recursive=True) + fixture.close() + request.addfinalizer(teardown) + return fixture + + +@libhdfs +def test_hdfs_close(): + client = hdfs_test_client() + assert client.is_open + client.close() + assert not client.is_open + + with pytest.raises(Exception): + client.ls('/') + + +@libhdfs +def test_hdfs_mkdir(hdfs): + path = pjoin(HDFS_TMP_PATH, 'test-dir/test-dir') + parent_path = pjoin(HDFS_TMP_PATH, 'test-dir') + + hdfs.mkdir(path) + assert hdfs.exists(path) + + hdfs.delete(parent_path, recursive=True) + assert not hdfs.exists(path) + + +@libhdfs +def test_hdfs_ls(hdfs): + base_path = pjoin(HDFS_TMP_PATH, 'ls-test') + hdfs.mkdir(base_path) + + dir_path = pjoin(base_path, 'a-dir') + f1_path = pjoin(base_path, 'a-file-1') + + hdfs.mkdir(dir_path) + + f = hdfs.open(f1_path, 'wb') + f.write('a' * 10) + + contents = sorted(hdfs.ls(base_path, False)) + assert contents == [dir_path, f1_path] + + +@libhdfs +def test_hdfs_download_upload(hdfs): + base_path = pjoin(HDFS_TMP_PATH, 'upload-test') + + data = b'foobarbaz' + buf = BytesIO(data) + buf.seek(0) + + hdfs.upload(base_path, buf) + + out_buf = BytesIO() + hdfs.download(base_path, out_buf) + out_buf.seek(0) + assert out_buf.getvalue() == data + + +@libhdfs +def test_hdfs_file_context_manager(hdfs): + path = pjoin(HDFS_TMP_PATH, 'ctx-manager') + + data = b'foo' + with hdfs.open(path, 'wb') as f: + f.write(data) + + with hdfs.open(path, 'rb') as f: + assert f.size() == 3 + result = f.read(10) + assert result == data diff --git a/python/setup.py b/python/setup.py index 7edeb914331..59410d75a61 100644 --- a/python/setup.py +++ b/python/setup.py @@ -214,7 +214,14 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'parquet', 'scalar', 'schema', 'table'] + return ['array', + 'config', + 'error', + 'io', + 'parquet', + 'scalar', + 'schema', + 'table'] def get_names(self): return self._found_names From 2f52cf4eed1033d1bf1f043d9063e462e60d6605 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sun, 12 Jun 2016 11:48:10 +0200 Subject: [PATCH 093/210] ARROW-215: Support other integer types and strings in Parquet I/O Change-Id: I72c6c82bc38c895a04172531bebbc78d4fb08732 --- cpp/src/arrow/parquet/parquet-io-test.cc | 461 ++++++++++++------- cpp/src/arrow/parquet/parquet-schema-test.cc | 4 +- cpp/src/arrow/parquet/reader.cc | 160 ++++++- cpp/src/arrow/parquet/schema.cc | 47 +- cpp/src/arrow/parquet/schema.h | 9 +- cpp/src/arrow/parquet/test-util.h | 136 +++++- cpp/src/arrow/parquet/writer.cc | 234 ++++++++-- cpp/src/arrow/parquet/writer.h | 9 +- cpp/src/arrow/test-util.h | 2 + cpp/src/arrow/types/primitive.cc | 5 + python/pyarrow/includes/parquet.pxd | 13 +- python/pyarrow/parquet.pyx | 22 +- python/pyarrow/tests/test_parquet.py | 43 +- 13 files changed, 901 insertions(+), 244 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index edcac887056..572cae16e58 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -21,7 +21,9 @@ #include "arrow/parquet/test-util.h" #include "arrow/parquet/reader.h" #include "arrow/parquet/writer.h" +#include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -30,12 +32,15 @@ using ParquetBuffer = parquet::Buffer; using parquet::BufferReader; +using parquet::default_writer_properties; using parquet::InMemoryOutputStream; +using parquet::LogicalType; using parquet::ParquetFileReader; using parquet::ParquetFileWriter; using parquet::RandomAccessSource; using parquet::Repetition; using parquet::SchemaDescriptor; +using parquet::ParquetVersion; using ParquetType = parquet::Type; using parquet::schema::GroupNode; using parquet::schema::NodePtr; @@ -51,26 +56,114 @@ const int LARGE_SIZE = 10000; template struct test_traits {}; +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::BOOLEAN; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static uint8_t const value; +}; + +const uint8_t test_traits::value(1); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_8; + static uint8_t const value; +}; + +const uint8_t test_traits::value(64); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::INT_8; + static int8_t const value; +}; + +const int8_t test_traits::value(-64); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_16; + static uint16_t const value; +}; + +const uint16_t test_traits::value(1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::INT_16; + static int16_t const value; +}; + +const int16_t test_traits::value(-1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_32; + static uint32_t const value; +}; + +const uint32_t test_traits::value(1024); + template <> struct test_traits { static constexpr ParquetType::type parquet_enum = 
ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static int32_t const value; +}; + +const int32_t test_traits::value(-1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_64; + static uint64_t const value; }; +const uint64_t test_traits::value(1024); + template <> struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::INT64; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static int64_t const value; }; +const int64_t test_traits::value(-1024); + template <> struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static float const value; }; +const float test_traits::value(2.1f); + template <> struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static double const value; +}; + +const double test_traits::value(4.2); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY; + static constexpr LogicalType::type logical_enum = LogicalType::UTF8; + static std::string const value; }; +const std::string test_traits::value("Test"); + template using ParquetDataType = ::parquet::DataType::parquet_enum>; @@ -80,18 +173,18 @@ using ParquetWriter = ::parquet::TypedColumnWriter>; template class TestParquetIO : public ::testing::Test { public: - typedef typename TestType::c_type T; virtual void SetUp() {} - std::shared_ptr MakeSchema( - ParquetType::type parquet_type, Repetition::type repetition) { - auto pnode = PrimitiveNode::Make("column1", repetition, parquet_type); + std::shared_ptr MakeSchema(Repetition::type repetition) { + auto pnode = PrimitiveNode::Make("column1", repetition, + test_traits::parquet_enum, test_traits::logical_enum); NodePtr node_ = GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); return std::static_pointer_cast(node_); } - std::unique_ptr MakeWriter(std::shared_ptr& schema) { + std::unique_ptr MakeWriter( + const std::shared_ptr& schema) { sink_ = std::make_shared(); return ParquetFileWriter::Open(sink_, schema); } @@ -106,113 +199,74 @@ class TestParquetIO : public ::testing::Test { std::unique_ptr file_reader, std::shared_ptr* out) { arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); std::unique_ptr column_reader; - ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); + ASSERT_OK_NO_THROW(reader.GetFlatColumn(0, &column_reader)); ASSERT_NE(nullptr, column_reader.get()); + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); ASSERT_NE(nullptr, out->get()); } + void ReadAndCheckSingleColumnFile(Array* values) { + std::shared_ptr out; + ReadSingleColumnFile(ReaderFromSink(), &out); + ASSERT_TRUE(values->Equals(out)); + } + void ReadTableFromFile( std::unique_ptr file_reader, std::shared_ptr
* out) { arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); - ASSERT_NO_THROW(ASSERT_OK(reader.ReadFlatTable(out))); + ASSERT_OK_NO_THROW(reader.ReadFlatTable(out)); ASSERT_NE(nullptr, out->get()); } - std::unique_ptr TestFile(std::vector& values, int num_chunks) { - std::shared_ptr schema = - MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); - std::unique_ptr file_writer = MakeWriter(schema); - size_t chunk_size = values.size() / num_chunks; - for (int i = 0; i < num_chunks; i++) { - auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = - static_cast*>(row_group_writer->NextColumn()); - T* data = values.data() + i * chunk_size; - column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); - column_writer->Close(); - row_group_writer->Close(); - } - file_writer->Close(); - return ReaderFromSink(); + void ReadAndCheckSingleColumnTable(const std::shared_ptr& values) { + std::shared_ptr
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(values->length(), out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); + } + + template + void WriteFlatColumn(const std::shared_ptr& schema, + const std::shared_ptr& values) { + FileWriter writer(default_memory_pool(), MakeWriter(schema)); + ASSERT_OK_NO_THROW(writer.NewRowGroup(values->length())); + ASSERT_OK_NO_THROW(writer.WriteFlatColumnChunk(values.get())); + ASSERT_OK_NO_THROW(writer.Close()); } std::shared_ptr sink_; }; -typedef ::testing::Types TestTypes; - -TYPED_TEST_CASE(TestParquetIO, TestTypes); - -TYPED_TEST(TestParquetIO, SingleColumnRequiredRead) { - std::vector values(SMALL_SIZE, 128); - std::unique_ptr file_reader = this->TestFile(values, 1); - - std::shared_ptr out; - this->ReadSingleColumnFile(std::move(file_reader), &out); - - ExpectArray(values.data(), out.get()); -} - -TYPED_TEST(TestParquetIO, SingleColumnRequiredTableRead) { - std::vector values(SMALL_SIZE, 128); - std::unique_ptr file_reader = this->TestFile(values, 1); - - std::shared_ptr
out;
-  this->ReadTableFromFile(std::move(file_reader), &out);
-  ASSERT_EQ(1, out->num_columns());
-  ASSERT_EQ(SMALL_SIZE, out->num_rows());
-
-  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-  ASSERT_EQ(1, chunked_array->num_chunks());
-  ExpectArray<TypeParam>(values.data(), chunked_array->chunk(0).get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedRead) {
-  std::vector<T> values(SMALL_SIZE, 128);
-  std::unique_ptr<ParquetFileReader> file_reader = this->TestFile(values, 4);
-
-  std::shared_ptr<Array> out;
-  this->ReadSingleColumnFile(std::move(file_reader), &out);
+// We have separate tests for UInt32Type, as it is currently the only type
+// where a roundtrip does not yield an identical Array structure: with
+// Parquet version 1.0 we write a UInt32 Array but receive an Int64 Array
+// back, since that format version has no unsigned 32-bit logical type.
-  ExpectArray<TypeParam>(values.data(), out.get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedTableRead) {
-  std::vector<T> values(SMALL_SIZE, 128);
-  std::unique_ptr<ParquetFileReader> file_reader = this->TestFile(values, 4);
-
-  std::shared_ptr<Table>
out; - this->ReadTableFromFile(std::move(file_reader), &out); - ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(SMALL_SIZE, out->num_rows()); +typedef ::testing::Types TestTypes; - std::shared_ptr chunked_array = out->column(0)->data(); - ASSERT_EQ(1, chunked_array->num_chunks()); - ExpectArray(values.data(), chunked_array->chunk(0).get()); -} +TYPED_TEST_CASE(TestParquetIO, TestTypes); TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) { - std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + auto values = NonNullArray(SMALL_SIZE); - std::shared_ptr schema = - this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); - ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); - ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); - ASSERT_NO_THROW(ASSERT_OK(writer.Close())); + std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); + this->WriteFlatColumn(schema, values); - std::shared_ptr out; - this->ReadSingleColumnFile(this->ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); + this->ReadAndCheckSingleColumnFile(values.get()); } TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { - std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + auto values = NonNullArray(SMALL_SIZE); std::shared_ptr
table = MakeSimpleTable(values, false); this->sink_ = std::make_shared(); - ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); + ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, + values->length(), default_writer_properties())); std::shared_ptr
out; this->ReadTableFromFile(this->ReaderFromSink(), &out); @@ -226,113 +280,208 @@ TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); + auto values = NullableArray(SMALL_SIZE, 10); - std::shared_ptr schema = - this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); - ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); - ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); - ASSERT_NO_THROW(ASSERT_OK(writer.Close())); + std::shared_ptr schema = this->MakeSchema(Repetition::OPTIONAL); + this->WriteFlatColumn(schema, values); - std::shared_ptr out; - this->ReadSingleColumnFile(this->ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); + this->ReadAndCheckSingleColumnFile(values.get()); } TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 10); std::shared_ptr
table = MakeSimpleTable(values, true); this->sink_ = std::make_shared(); - ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); - - std::shared_ptr
out; - this->ReadTableFromFile(this->ReaderFromSink(), &out); - ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(SMALL_SIZE, out->num_rows()); + ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, + values->length(), default_writer_properties())); - std::shared_ptr chunked_array = out->column(0)->data(); - ASSERT_EQ(1, chunked_array->num_chunks()); - ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); + this->ReadAndCheckSingleColumnTable(values); } -TYPED_TEST(TestParquetIO, SingleColumnIntRequiredChunkedWrite) { - std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); - std::shared_ptr values_chunk = - NonNullArray(SMALL_SIZE / 4, 128); +TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { + auto values = NonNullArray(SMALL_SIZE); + int64_t chunk_size = values->length() / 4; - std::shared_ptr schema = - this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); for (int i = 0; i < 4; i++) { - ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); - ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get()))); + ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size)); + ASSERT_OK_NO_THROW( + writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size)); } - ASSERT_NO_THROW(ASSERT_OK(writer.Close())); + ASSERT_OK_NO_THROW(writer.Close()); - std::shared_ptr out; - this->ReadSingleColumnFile(this->ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); + this->ReadAndCheckSingleColumnFile(values.get()); } TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { - std::shared_ptr values = NonNullArray(LARGE_SIZE, 128); + auto values = NonNullArray(LARGE_SIZE); std::shared_ptr
table = MakeSimpleTable(values, false); this->sink_ = std::make_shared(); - ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); - - std::shared_ptr
out; - this->ReadTableFromFile(this->ReaderFromSink(), &out); - ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(LARGE_SIZE, out->num_rows()); + ASSERT_OK_NO_THROW(WriteFlatTable( + table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties())); - std::shared_ptr chunked_array = out->column(0)->data(); - ASSERT_EQ(1, chunked_array->num_chunks()); - ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); + this->ReadAndCheckSingleColumnTable(values); } TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { - std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); - std::shared_ptr values_chunk_nulls = - NullableArray(SMALL_SIZE / 4, 128, 10); - std::shared_ptr values_chunk = - NullableArray(SMALL_SIZE / 4, 128, 0); - - std::shared_ptr schema = - this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + int64_t chunk_size = SMALL_SIZE / 4; + auto values = NullableArray(SMALL_SIZE, 10); + + std::shared_ptr schema = this->MakeSchema(Repetition::OPTIONAL); FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); - ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); - ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); - for (int i = 0; i < 3; i++) { - ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); - ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get()))); + for (int i = 0; i < 4; i++) { + ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size)); + ASSERT_OK_NO_THROW( + writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size)); } - ASSERT_NO_THROW(ASSERT_OK(writer.Close())); + ASSERT_OK_NO_THROW(writer.Close()); - std::shared_ptr out; - this->ReadSingleColumnFile(this->ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); + this->ReadAndCheckSingleColumnFile(values.get()); } TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(LARGE_SIZE, 128, 100); + auto values = NullableArray(LARGE_SIZE, 100); std::shared_ptr
table = MakeSimpleTable(values, true); this->sink_ = std::make_shared(); - ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); + ASSERT_OK_NO_THROW(WriteFlatTable( + table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties())); - std::shared_ptr
out; - this->ReadTableFromFile(this->ReaderFromSink(), &out); - ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(LARGE_SIZE, out->num_rows()); + this->ReadAndCheckSingleColumnTable(values); +} - std::shared_ptr chunked_array = out->column(0)->data(); - ASSERT_EQ(1, chunked_array->num_chunks()); - ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +using TestUInt32ParquetIO = TestParquetIO; + +TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compability) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(LARGE_SIZE, 100); + std::shared_ptr
table = MakeSimpleTable(values, true); + + // Parquet 2.0 roundtrip should yield an uint32_t column again + this->sink_ = std::make_shared(); + std::shared_ptr<::parquet::WriterProperties> properties = + ::parquet::WriterProperties::Builder() + .version(ParquetVersion::PARQUET_2_0) + ->build(); + ASSERT_OK_NO_THROW( + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties)); + this->ReadAndCheckSingleColumnTable(values); +} + +TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(LARGE_SIZE, 100); + std::shared_ptr
table = MakeSimpleTable(values, true); + + // Parquet 1.0 returns an int64_t column as there is no way to tell a Parquet 1.0 + // reader that a column is unsigned. + this->sink_ = std::make_shared(); + std::shared_ptr<::parquet::WriterProperties> properties = + ::parquet::WriterProperties::Builder() + .version(ParquetVersion::PARQUET_1_0) + ->build(); + ASSERT_OK_NO_THROW( + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties)); + + std::shared_ptr expected_values; + std::shared_ptr int64_data = + std::make_shared(default_memory_pool()); + { + ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length())); + int64_t* int64_data_ptr = reinterpret_cast(int64_data->mutable_data()); + const uint32_t* uint32_data_ptr = + reinterpret_cast(values->data()->data()); + // std::copy might be faster but this is explicit on the casts) + for (int64_t i = 0; i < values->length(); i++) { + int64_data_ptr[i] = static_cast(uint32_data_ptr[i]); + } + } + ASSERT_OK(MakePrimitiveArray(std::make_shared(), values->length(), + int64_data, values->null_count(), values->null_bitmap(), &expected_values)); + this->ReadAndCheckSingleColumnTable(expected_values); +} + +template +using ParquetCDataType = typename ParquetDataType::c_type; + +template +class TestPrimitiveParquetIO : public TestParquetIO { + public: + typedef typename TestType::c_type T; + + void TestFile(std::vector& values, int num_chunks, + std::unique_ptr* file_reader) { + std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); + std::unique_ptr file_writer = this->MakeWriter(schema); + size_t chunk_size = values.size() / num_chunks; + // Convert to Parquet's expected physical type + std::vector values_buffer( + sizeof(ParquetCDataType) * values.size()); + auto values_parquet = + reinterpret_cast*>(values_buffer.data()); + std::copy(values.cbegin(), values.cend(), values_parquet); + for (int i = 0; i < num_chunks; i++) { + auto row_group_writer = file_writer->AppendRowGroup(chunk_size); + auto column_writer = + static_cast*>(row_group_writer->NextColumn()); + ParquetCDataType* data = values_parquet + i * chunk_size; + column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); + column_writer->Close(); + row_group_writer->Close(); + } + file_writer->Close(); + *file_reader = this->ReaderFromSink(); + } + + void TestSingleColumnRequiredTableRead(int num_chunks) { + std::vector values(SMALL_SIZE, test_traits::value); + std::unique_ptr file_reader; + ASSERT_NO_THROW(TestFile(values, num_chunks, &file_reader)); + + std::shared_ptr
out; + this->ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ExpectArray(values.data(), chunked_array->chunk(0).get()); + } + + void TestSingleColumnRequiredRead(int num_chunks) { + std::vector values(SMALL_SIZE, test_traits::value); + std::unique_ptr file_reader; + ASSERT_NO_THROW(TestFile(values, num_chunks, &file_reader)); + + std::shared_ptr out; + this->ReadSingleColumnFile(std::move(file_reader), &out); + + ExpectArray(values.data(), out.get()); + } +}; + +typedef ::testing::Types PrimitiveTestTypes; + +TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes); + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredRead) { + this->TestSingleColumnRequiredRead(1); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredTableRead) { + this->TestSingleColumnRequiredTableRead(1); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedRead) { + this->TestSingleColumnRequiredRead(4); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { + this->TestSingleColumnRequiredTableRead(4); } } // namespace parquet diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index 8de739491b5..819cdd3ec43 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -183,7 +183,9 @@ class TestConvertArrowSchema : public ::testing::Test { Status ConvertSchema(const std::vector>& fields) { arrow_schema_ = std::make_shared(fields); - return ToParquetSchema(arrow_schema_.get(), &result_schema_); + std::shared_ptr<::parquet::WriterProperties> properties = + ::parquet::default_writer_properties(); + return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_); } protected: diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 3b4882d4439..7b05665b230 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -17,6 +17,7 @@ #include "arrow/parquet/reader.h" +#include #include #include #include @@ -27,6 +28,7 @@ #include "arrow/schema.h" #include "arrow/table.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" #include "arrow/util/status.h" using parquet::ColumnReader; @@ -36,6 +38,19 @@ using parquet::TypedColumnReader; namespace arrow { namespace parquet { +template +struct ArrowTypeTraits { + typedef NumericBuilder builder_type; +}; + +template <> +struct ArrowTypeTraits { + typedef BooleanBuilder builder_type; +}; + +template +using BuilderType = typename ArrowTypeTraits::builder_type; + class FileReader::Impl { public: Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader); @@ -61,9 +76,45 @@ class FlatColumnReader::Impl { template Status TypedReadBatch(int batch_size, std::shared_ptr* out); + template + Status ReadNullableFlatBatch(const int16_t* def_levels, + typename ParquetType::c_type* values, int64_t values_read, int64_t levels_read, + BuilderType* builder); + template + Status ReadNonNullableBatch(typename ParquetType::c_type* values, int64_t values_read, + BuilderType* builder); + private: void NextRowGroup(); + template + struct can_copy_ptr { + static constexpr bool value = + std::is_same::value || + (std::is_integral{} && std::is_integral{} && + (sizeof(InType) == sizeof(OutType))); + }; + + template ::value>::type* = nullptr> + Status 
ConvertPhysicalType( + const InType* in_ptr, int64_t length, const OutType** out_ptr) { + *out_ptr = reinterpret_cast(in_ptr); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status ConvertPhysicalType( + const InType* in_ptr, int64_t length, const OutType** out_ptr) { + RETURN_NOT_OK(values_builder_buffer_.Resize(length * sizeof(OutType))); + OutType* mutable_out_ptr = + reinterpret_cast(values_builder_buffer_.mutable_data()); + std::copy(in_ptr, in_ptr + length, mutable_out_ptr); + *out_ptr = mutable_out_ptr; + return Status::OK(); + } + MemoryPool* pool_; const ::parquet::ColumnDescriptor* descr_; ::parquet::ParquetFileReader* reader_; @@ -155,13 +206,53 @@ FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor NextRowGroup(); } +template +Status FlatColumnReader::Impl::ReadNonNullableBatch(typename ParquetType::c_type* values, + int64_t values_read, BuilderType* builder) { + using ArrowCType = typename ArrowType::c_type; + using ParquetCType = typename ParquetType::c_type; + + DCHECK(builder); + const ArrowCType* values_ptr; + RETURN_NOT_OK( + (ConvertPhysicalType(values, values_read, &values_ptr))); + RETURN_NOT_OK(builder->Append(values_ptr, values_read)); + return Status::OK(); +} + +template +Status FlatColumnReader::Impl::ReadNullableFlatBatch(const int16_t* def_levels, + typename ParquetType::c_type* values, int64_t values_read, int64_t levels_read, + BuilderType* builder) { + using ArrowCType = typename ArrowType::c_type; + + DCHECK(builder); + RETURN_NOT_OK(values_builder_buffer_.Resize(levels_read * sizeof(ArrowCType))); + RETURN_NOT_OK(valid_bytes_buffer_.Resize(levels_read * sizeof(uint8_t))); + auto values_ptr = reinterpret_cast(values_builder_buffer_.mutable_data()); + uint8_t* valid_bytes = valid_bytes_buffer_.mutable_data(); + int values_idx = 0; + for (int64_t i = 0; i < levels_read; i++) { + if (def_levels[i] < descr_->max_definition_level()) { + valid_bytes[i] = 0; + } else { + valid_bytes[i] = 1; + values_ptr[i] = values[values_idx++]; + } + } + RETURN_NOT_OK(builder->Append(values_ptr, levels_read, valid_bytes)); + return Status::OK(); +} + template Status FlatColumnReader::Impl::TypedReadBatch( int batch_size, std::shared_ptr* out) { + using ParquetCType = typename ParquetType::c_type; + int values_to_read = batch_size; - NumericBuilder builder(pool_, field_->type); + BuilderType builder(pool_, field_->type); while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(typename ParquetType::c_type)); + values_buffer_.Resize(values_to_read * sizeof(ParquetCType)); if (descr_->max_definition_level() > 0) { def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); } @@ -169,31 +260,62 @@ Status FlatColumnReader::Impl::TypedReadBatch( int64_t values_read; int64_t levels_read; int16_t* def_levels = reinterpret_cast(def_levels_buffer_.mutable_data()); - auto values = - reinterpret_cast(values_buffer_.mutable_data()); + auto values = reinterpret_cast(values_buffer_.mutable_data()); PARQUET_CATCH_NOT_OK(levels_read = reader->ReadBatch( values_to_read, def_levels, nullptr, values, &values_read)); values_to_read -= levels_read; if (descr_->max_definition_level() == 0) { - RETURN_NOT_OK(builder.Append(values, values_read)); + RETURN_NOT_OK( + (ReadNonNullableBatch(values, values_read, &builder))); + } else { + // As per the defintion and checks for flat columns: + // descr_->max_definition_level() == 1 + RETURN_NOT_OK((ReadNullableFlatBatch( + def_levels, values, values_read, levels_read, 
&builder))); + } + if (!column_reader_->HasNext()) { NextRowGroup(); } + } + *out = builder.Finish(); + return Status::OK(); +} + +template <> +Status FlatColumnReader::Impl::TypedReadBatch( + int batch_size, std::shared_ptr* out) { + int values_to_read = batch_size; + StringBuilder builder(pool_, field_->type); + while ((values_to_read > 0) && column_reader_) { + values_buffer_.Resize(values_to_read * sizeof(::parquet::ByteArray)); + if (descr_->max_definition_level() > 0) { + def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); + } + auto reader = + dynamic_cast*>(column_reader_.get()); + int64_t values_read; + int64_t levels_read; + int16_t* def_levels = reinterpret_cast(def_levels_buffer_.mutable_data()); + auto values = reinterpret_cast<::parquet::ByteArray*>(values_buffer_.mutable_data()); + PARQUET_CATCH_NOT_OK(levels_read = reader->ReadBatch( + values_to_read, def_levels, nullptr, values, &values_read)); + values_to_read -= levels_read; + if (descr_->max_definition_level() == 0) { + for (int64_t i = 0; i < levels_read; i++) { + RETURN_NOT_OK( + builder.Append(reinterpret_cast(values[i].ptr), values[i].len)); + } } else { // descr_->max_definition_level() == 1 - RETURN_NOT_OK(values_builder_buffer_.Resize( - levels_read * sizeof(typename ParquetType::c_type))); - RETURN_NOT_OK(valid_bytes_buffer_.Resize(levels_read * sizeof(uint8_t))); - auto values_ptr = reinterpret_cast( - values_builder_buffer_.mutable_data()); - uint8_t* valid_bytes = valid_bytes_buffer_.mutable_data(); int values_idx = 0; for (int64_t i = 0; i < levels_read; i++) { if (def_levels[i] < descr_->max_definition_level()) { - valid_bytes[i] = 0; + RETURN_NOT_OK(builder.AppendNull()); } else { - valid_bytes[i] = 1; - values_ptr[i] = values[values_idx++]; + RETURN_NOT_OK( + builder.Append(reinterpret_cast(values[values_idx].ptr), + values[values_idx].len)); + values_idx++; } } - builder.Append(values_ptr, levels_read, valid_bytes); } if (!column_reader_->HasNext()) { NextRowGroup(); } } @@ -214,10 +336,18 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr* } switch (field_->type->type) { + TYPED_BATCH_CASE(BOOL, BooleanType, ::parquet::BooleanType) + TYPED_BATCH_CASE(UINT8, UInt8Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(INT8, Int8Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(UINT16, UInt16Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(INT16, Int16Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(UINT32, UInt32Type, ::parquet::Int32Type) TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type) TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type) TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) + TYPED_BATCH_CASE(STRING, StringType, ::parquet::ByteArrayType) default: return Status::NotImplemented(field_->type->ToString()); } diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index c7979db3494..a79342afe2f 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -42,7 +42,12 @@ namespace parquet { const auto BOOL = std::make_shared(); const auto UINT8 = std::make_shared(); +const auto INT8 = std::make_shared(); +const auto UINT16 = std::make_shared(); +const auto INT16 = std::make_shared(); +const auto UINT32 = std::make_shared(); const auto INT32 = std::make_shared(); +const auto UINT64 = std::make_shared(); const auto INT64 = std::make_shared(); const auto FLOAT = std::make_shared(); const auto 
DOUBLE = std::make_shared(); @@ -92,6 +97,21 @@ static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { case LogicalType::NONE: *out = INT32; break; + case LogicalType::UINT_8: + *out = UINT8; + break; + case LogicalType::INT_8: + *out = INT8; + break; + case LogicalType::UINT_16: + *out = UINT16; + break; + case LogicalType::INT_16: + *out = INT16; + break; + case LogicalType::UINT_32: + *out = UINT32; + break; case LogicalType::DECIMAL: *out = MakeDecimalType(node); break; @@ -107,6 +127,9 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { case LogicalType::NONE: *out = INT64; break; + case LogicalType::UINT_64: + *out = UINT64; + break; case LogicalType::DECIMAL: *out = MakeDecimalType(node); break; @@ -187,20 +210,21 @@ Status FromParquetSchema( } Status StructToNode(const std::shared_ptr& type, const std::string& name, - bool nullable, NodePtr* out) { + bool nullable, const ::parquet::WriterProperties& properties, NodePtr* out) { Repetition::type repetition = Repetition::REQUIRED; if (nullable) { repetition = Repetition::OPTIONAL; } std::vector children(type->num_children()); for (int i = 0; i < type->num_children(); i++) { - RETURN_NOT_OK(FieldToNode(type->child(i), &children[i])); + RETURN_NOT_OK(FieldToNode(type->child(i), properties, &children[i])); } *out = GroupNode::Make(name, repetition, children); return Status::OK(); } -Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { +Status FieldToNode(const std::shared_ptr& field, + const ::parquet::WriterProperties& properties, NodePtr* out) { LogicalType::type logical_type = LogicalType::NONE; ParquetType::type type; Repetition::type repetition = Repetition::REQUIRED; @@ -231,8 +255,12 @@ Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { logical_type = LogicalType::INT_16; break; case Type::UINT32: - type = ParquetType::INT32; - logical_type = LogicalType::UINT_32; + if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { + type = ParquetType::INT64; + } else { + type = ParquetType::INT32; + logical_type = LogicalType::UINT_32; + } break; case Type::INT32: type = ParquetType::INT32; @@ -277,7 +305,7 @@ Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { break; case Type::STRUCT: { auto struct_type = std::static_pointer_cast(field->type); - return StructToNode(struct_type, field->name, field->nullable, out); + return StructToNode(struct_type, field->name, field->nullable, properties, out); } break; default: // TODO: LIST, DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL, DECIMAL_TEXT, VARCHAR @@ -287,11 +315,12 @@ Status FieldToNode(const std::shared_ptr& field, NodePtr* out) { return Status::OK(); } -Status ToParquetSchema( - const Schema* arrow_schema, std::shared_ptr<::parquet::SchemaDescriptor>* out) { +Status ToParquetSchema(const Schema* arrow_schema, + const ::parquet::WriterProperties& properties, + std::shared_ptr<::parquet::SchemaDescriptor>* out) { std::vector nodes(arrow_schema->num_fields()); for (int i = 0; i < arrow_schema->num_fields(); i++) { - RETURN_NOT_OK(FieldToNode(arrow_schema->field(i), &nodes[i])); + RETURN_NOT_OK(FieldToNode(arrow_schema->field(i), properties, &nodes[i])); } NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index ec5f96062e8..39bee059522 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -21,6 +21,7 @@ #include #include "parquet/api/schema.h" +#include "parquet/api/writer.h" 
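+// Illustrative sketch (hypothetical caller, not part of this patch): with the
+// new WriterProperties parameter, the schema conversion declared below can be
+// driven by the writer configuration, e.g.
+//
+//   ::parquet::WriterProperties::Builder builder;
+//   std::shared_ptr<::parquet::WriterProperties> properties =
+//       builder.version(::parquet::ParquetVersion::PARQUET_1_0)->build();
+//   std::shared_ptr<::parquet::SchemaDescriptor> descriptor;
+//   RETURN_NOT_OK(ToParquetSchema(arrow_schema.get(), *properties, &descriptor));
+//
+// where `arrow_schema` is an assumed std::shared_ptr<Schema>.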
#include "arrow/schema.h" #include "arrow/type.h" @@ -36,10 +37,12 @@ Status NodeToField(const ::parquet::schema::NodePtr& node, std::shared_ptr* out); -Status FieldToNode(const std::shared_ptr& field, ::parquet::schema::NodePtr* out); +Status FieldToNode(const std::shared_ptr& field, + const ::parquet::WriterProperties& properties, ::parquet::schema::NodePtr* out); -Status ToParquetSchema( - const Schema* arrow_schema, std::shared_ptr<::parquet::SchemaDescriptor>* out); +Status ToParquetSchema(const Schema* arrow_schema, + const ::parquet::WriterProperties& properties, + std::shared_ptr<::parquet::SchemaDescriptor>* out); } // namespace parquet diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h index cc8723bf6ec..68a7fb94c2a 100644 --- a/cpp/src/arrow/parquet/test-util.h +++ b/cpp/src/arrow/parquet/test-util.h @@ -18,26 +18,90 @@ #include #include +#include "arrow/test-util.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" namespace arrow { namespace parquet { template -std::shared_ptr NonNullArray( - size_t size, typename ArrowType::c_type value) { - std::vector values(size, value); +using is_arrow_float = std::is_floating_point; + +template +using is_arrow_int = std::is_integral; + +template +using is_arrow_string = std::is_same; + +template +typename std::enable_if::value, + std::shared_ptr>::type +NonNullArray(size_t size) { + std::vector values; + ::arrow::test::random_real(size, 0, 0, 1, &values); NumericBuilder builder(default_memory_pool(), std::make_shared()); builder.Append(values.data(), values.size()); return std::static_pointer_cast(builder.Finish()); } -// This helper function only supports (size/2) nulls yet. +template +typename std::enable_if::value, + std::shared_ptr>::type +NonNullArray(size_t size) { + std::vector values; + ::arrow::test::randint(size, 0, 64, &values); + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size()); + return std::static_pointer_cast(builder.Finish()); +} + +template +typename std::enable_if::value, + std::shared_ptr>::type +NonNullArray(size_t size) { + StringBuilder builder(default_memory_pool(), std::make_shared()); + for (size_t i = 0; i < size; i++) { + builder.Append("test-string"); + } + return std::static_pointer_cast(builder.Finish()); +} + +template <> +std::shared_ptr NonNullArray(size_t size) { + std::vector values; + ::arrow::test::randint(size, 0, 1, &values); + BooleanBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size()); + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls. +template +typename std::enable_if::value, + std::shared_ptr>::type +NullableArray(size_t size, size_t num_nulls) { + std::vector values; + ::arrow::test::random_real(size, 0, 0, 1, &values); + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size(), valid_bytes.data()); + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls. 
template -std::shared_ptr NullableArray( - size_t size, typename ArrowType::c_type value, size_t num_nulls) { - std::vector values(size, value); +typename std::enable_if::value, + std::shared_ptr>::type +NullableArray(size_t size, size_t num_nulls) { + std::vector values; + ::arrow::test::randint(size, 0, 64, &values); std::vector valid_bytes(size, 1); for (size_t i = 0; i < num_nulls; i++) { @@ -49,14 +113,49 @@ std::shared_ptr NullableArray( return std::static_pointer_cast(builder.Finish()); } -std::shared_ptr MakeColumn(const std::string& name, - const std::shared_ptr& array, bool nullable) { +// This helper function only supports (size/2) nulls yet. +template +typename std::enable_if::value, + std::shared_ptr>::type +NullableArray(size_t size, size_t num_nulls) { + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + StringBuilder builder(default_memory_pool(), std::make_shared()); + for (size_t i = 0; i < size; i++) { + builder.Append("test-string"); + } + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls yet. +template <> +std::shared_ptr NullableArray( + size_t size, size_t num_nulls) { + std::vector values; + ::arrow::test::randint(size, 0, 1, &values); + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + BooleanBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size(), valid_bytes.data()); + return std::static_pointer_cast(builder.Finish()); +} + +std::shared_ptr MakeColumn( + const std::string& name, const std::shared_ptr& array, bool nullable) { auto field = std::make_shared(name, array->type(), nullable); return std::make_shared(field, array); } std::shared_ptr
MakeSimpleTable( - const std::shared_ptr& values, bool nullable) { + const std::shared_ptr& values, bool nullable) { std::shared_ptr column = MakeColumn("col", values, nullable); std::vector> columns({column}); std::vector> fields({column->field()}); @@ -72,6 +171,23 @@ void ExpectArray(T* expected, Array* result) { } } +template +void ExpectArray(typename ArrowType::c_type* expected, Array* result) { + PrimitiveArray* p_array = static_cast(result); + for (int64_t i = 0; i < result->length(); i++) { + EXPECT_EQ(expected[i], + reinterpret_cast(p_array->data()->data())[i]); + } +} + +template <> +void ExpectArray(uint8_t* expected, Array* result) { + BooleanBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(expected, result->length()); + std::shared_ptr expected_array = builder.Finish(); + EXPECT_TRUE(result->Equals(expected_array)); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 4005e3b2b0c..63449bb20b1 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -25,11 +25,13 @@ #include "arrow/table.h" #include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" #include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/util/status.h" using parquet::ParquetFileWriter; +using parquet::ParquetVersion; using parquet::schema::GroupNode; namespace arrow { @@ -41,10 +43,40 @@ class FileWriter::Impl { Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); Status NewRowGroup(int64_t chunk_size); - template + template Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data, int64_t offset, int64_t length); + + // TODO(uwe): Same code as in reader.cc the only difference is the name of the temporary + // buffer + template + struct can_copy_ptr { + static constexpr bool value = + std::is_same::value || + (std::is_integral{} && std::is_integral{} && + (sizeof(InType) == sizeof(OutType))); + }; + + template ::value>::type* = nullptr> + Status ConvertPhysicalType(const InType* in_ptr, int64_t, const OutType** out_ptr) { + *out_ptr = reinterpret_cast(in_ptr); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status ConvertPhysicalType( + const InType* in_ptr, int64_t length, const OutType** out_ptr) { + RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(OutType))); + OutType* mutable_out_ptr = reinterpret_cast(data_buffer_.mutable_data()); + std::copy(in_ptr, in_ptr + length, mutable_out_ptr); + *out_ptr = mutable_out_ptr; + return Status::OK(); + } + Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length); + Status WriteFlatColumnChunk(const StringArray* data, int64_t offset, int64_t length); Status Close(); virtual ~Impl() {} @@ -53,6 +85,8 @@ class FileWriter::Impl { friend class FileWriter; MemoryPool* pool_; + // Buffer used for storing the data of an array converted to the physical type + // as expected by parquet-cpp. 
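+  // For example, Arrow uint8 values are widened to int32 here, since Parquet
+  // has no 8-bit physical type.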
PoolBuffer data_buffer_;
   PoolBuffer def_levels_buffer_;
   std::unique_ptr<::parquet::ParquetFileWriter> writer_;
@@ -72,36 +106,95 @@ Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) {
   return Status::OK();
 }

-template 
+template 
 Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer,
     const PrimitiveArray* data, int64_t offset, int64_t length) {
-  // TODO: DCHECK((offset + length) <= data->length());
-  auto data_ptr =
-      reinterpret_cast(data->data()->data()) +
-      offset;
+  using ArrowCType = typename ArrowType::c_type;
+  using ParquetCType = typename ParquetType::c_type;
+
+  DCHECK((offset + length) <= data->length());
+  auto data_ptr = reinterpret_cast(data->data()->data()) + offset;
   auto writer = reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer);
   if (writer->descr()->max_definition_level() == 0) {
     // no nulls, just dump the data
-    PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_ptr));
+    const ParquetCType* data_writer_ptr;
+    RETURN_NOT_OK((ConvertPhysicalType(
+        data_ptr, length, &data_writer_ptr)));
+    PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_writer_ptr));
   } else if (writer->descr()->max_definition_level() == 1) {
     RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t)));
     int16_t* def_levels_ptr =
         reinterpret_cast(def_levels_buffer_.mutable_data());
     if (data->null_count() == 0) {
       std::fill(def_levels_ptr, def_levels_ptr + length, 1);
-      PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, data_ptr));
+      const ParquetCType* data_writer_ptr;
+      RETURN_NOT_OK((ConvertPhysicalType(
+          data_ptr, length, &data_writer_ptr)));
+      PARQUET_CATCH_NOT_OK(
+          writer->WriteBatch(length, def_levels_ptr, nullptr, data_writer_ptr));
     } else {
-      RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(typename ParquetType::c_type)));
-      auto buffer_ptr =
-          reinterpret_cast(data_buffer_.mutable_data());
+      RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(ParquetCType)));
+      auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data());
       int buffer_idx = 0;
       for (int i = 0; i < length; i++) {
         if (data->IsNull(offset + i)) {
           def_levels_ptr[i] = 0;
         } else {
           def_levels_ptr[i] = 1;
-          buffer_ptr[buffer_idx++] = data_ptr[i];
+          buffer_ptr[buffer_idx++] = static_cast(data_ptr[i]);
+        }
+      }
+      PARQUET_CATCH_NOT_OK(
+          writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr));
+    }
+  } else {
+    return Status::NotImplemented("no support for max definition level > 1 yet");
+  }
+  PARQUET_CATCH_NOT_OK(writer->Close());
+  return Status::OK();
+}
+
+// This specialization looks quite similar, but it differs in two significant
+// points:
+// * offset is applied to the pointer as late as possible, since we have
+//   sub-byte access
+// * Arrow data is stored bitwise, thus we cannot use std::copy to transform
+//   from ArrowType::c_type to ParquetType::c_type
+template <>
+Status FileWriter::Impl::TypedWriteBatch<::parquet::BooleanType, BooleanType>(
+    ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data, int64_t offset,
+    int64_t length) {
+  DCHECK((offset + length) <= data->length());
+  RETURN_NOT_OK(data_buffer_.Resize(length));
+  auto data_ptr = reinterpret_cast(data->data()->data());
+  auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data());
+  auto writer = reinterpret_cast<::parquet::TypedColumnWriter<::parquet::BooleanType>*>(
+      column_writer);
+  if (writer->descr()->max_definition_level() == 0) {
+    // no nulls, just dump the data
+    for (int64_t i = 0; i < length; i++) {
+      buffer_ptr[i] =
util::get_bit(data_ptr, offset + i);
+    }
+    PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, buffer_ptr));
+  } else if (writer->descr()->max_definition_level() == 1) {
+    RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t)));
+    int16_t* def_levels_ptr =
+        reinterpret_cast(def_levels_buffer_.mutable_data());
+    if (data->null_count() == 0) {
+      std::fill(def_levels_ptr, def_levels_ptr + length, 1);
+      for (int64_t i = 0; i < length; i++) {
+        buffer_ptr[i] = util::get_bit(data_ptr, offset + i);
+      }
+      // TODO(PARQUET-644): write boolean values as a packed bitmap
+      PARQUET_CATCH_NOT_OK(
+          writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr));
+    } else {
+      int buffer_idx = 0;
+      for (int i = 0; i < length; i++) {
+        if (data->IsNull(offset + i)) {
+          def_levels_ptr[i] = 0;
+        } else {
+          def_levels_ptr[i] = 1;
+          buffer_ptr[buffer_idx++] = util::get_bit(data_ptr, offset + i);
         }
       }
       PARQUET_CATCH_NOT_OK(
@@ -120,9 +213,9 @@ Status FileWriter::Impl::Close() {
   return Status::OK();
 }

-#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \
-  case Type::ENUM: \
-    return TypedWriteBatch(writer, data, offset, length); \
+#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \
+  case Type::ENUM: \
+    return TypedWriteBatch(writer, data, offset, length); \
     break;

 Status FileWriter::Impl::WriteFlatColumnChunk(
@@ -130,15 +223,76 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
   ::parquet::ColumnWriter* writer;
   PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn());
   switch (data->type_enum()) {
-    TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type)
-    TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type)
-    TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType)
-    TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType)
+    TYPED_BATCH_CASE(BOOL, BooleanType, ::parquet::BooleanType)
+    TYPED_BATCH_CASE(UINT8, UInt8Type, ::parquet::Int32Type)
+    TYPED_BATCH_CASE(INT8, Int8Type, ::parquet::Int32Type)
+    TYPED_BATCH_CASE(UINT16, UInt16Type, ::parquet::Int32Type)
+    TYPED_BATCH_CASE(INT16, Int16Type, ::parquet::Int32Type)
+    case Type::UINT32:
+      if (writer_->properties()->version() == ParquetVersion::PARQUET_1_0) {
+        // Parquet 1.0 readers cannot read the UINT_32 logical type. Thus we
+        // need to use the larger Int64Type to store them losslessly.
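+        // (The read side then sees a plain int64 column; the pandas
+        // round-trip test below casts the expected uint32 values to int64
+        // accordingly.)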
+ return TypedWriteBatch<::parquet::Int64Type, UInt32Type>( + writer, data, offset, length); + } else { + return TypedWriteBatch<::parquet::Int32Type, UInt32Type>( + writer, data, offset, length); + } + TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type) + TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type) + TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type) + TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) + TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) default: return Status::NotImplemented(data->type()->ToString()); } } +Status FileWriter::Impl::WriteFlatColumnChunk( + const StringArray* data, int64_t offset, int64_t length) { + ::parquet::ColumnWriter* column_writer; + PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn()); + DCHECK((offset + length) <= data->length()); + RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(::parquet::ByteArray))); + auto buffer_ptr = reinterpret_cast<::parquet::ByteArray*>(data_buffer_.mutable_data()); + auto values = std::dynamic_pointer_cast(data->values()); + auto data_ptr = reinterpret_cast(values->data()->data()); + DCHECK(values != nullptr); + auto writer = reinterpret_cast<::parquet::TypedColumnWriter<::parquet::ByteArrayType>*>( + column_writer); + if (writer->descr()->max_definition_level() > 0) { + RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); + } + int16_t* def_levels_ptr = reinterpret_cast(def_levels_buffer_.mutable_data()); + if (writer->descr()->max_definition_level() == 0 || data->null_count() == 0) { + // no nulls, just dump the data + for (int64_t i = 0; i < length; i++) { + buffer_ptr[i] = ::parquet::ByteArray( + data->value_length(i + offset), data_ptr + data->value_offset(i)); + } + if (writer->descr()->max_definition_level() > 0) { + std::fill(def_levels_ptr, def_levels_ptr + length, 1); + } + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); + } else if (writer->descr()->max_definition_level() == 1) { + int buffer_idx = 0; + for (int64_t i = 0; i < length; i++) { + if (data->IsNull(offset + i)) { + def_levels_ptr[i] = 0; + } else { + def_levels_ptr[i] = 1; + buffer_ptr[buffer_idx++] = ::parquet::ByteArray( + data->value_length(i + offset), data_ptr + data->value_offset(i + offset)); + } + } + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); + } else { + return Status::NotImplemented("no support for max definition level > 1 yet"); + } + PARQUET_CATCH_NOT_OK(writer->Close()); + return Status::OK(); +} + FileWriter::FileWriter( MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer) : impl_(new FileWriter::Impl(pool, std::move(writer))) {} @@ -148,10 +302,20 @@ Status FileWriter::NewRowGroup(int64_t chunk_size) { } Status FileWriter::WriteFlatColumnChunk( - const PrimitiveArray* data, int64_t offset, int64_t length) { + const Array* array, int64_t offset, int64_t length) { int64_t real_length = length; - if (length == -1) { real_length = data->length(); } - return impl_->WriteFlatColumnChunk(data, offset, real_length); + if (length == -1) { real_length = array->length(); } + if (array->type_enum() == Type::STRING) { + auto string_array = dynamic_cast(array); + DCHECK(string_array); + return impl_->WriteFlatColumnChunk(string_array, offset, real_length); + } else { + auto primitive_array = dynamic_cast(array); + if (!primitive_array) { + return Status::NotImplemented("Table must consist of PrimitiveArray instances"); + } + return 
impl_->WriteFlatColumnChunk(primitive_array, offset, real_length); + } } Status FileWriter::Close() { @@ -165,40 +329,30 @@ MemoryPool* FileWriter::memory_pool() const { FileWriter::~FileWriter() {} Status WriteFlatTable(const Table* table, MemoryPool* pool, - std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { + const std::shared_ptr<::parquet::OutputStream>& sink, int64_t chunk_size, + const std::shared_ptr<::parquet::WriterProperties>& properties) { std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; - RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); + RETURN_NOT_OK( + ToParquetSchema(table->schema().get(), *properties.get(), &parquet_schema)); auto schema_node = std::static_pointer_cast(parquet_schema->schema()); std::unique_ptr parquet_writer = - ParquetFileWriter::Open(sink, schema_node); + ParquetFileWriter::Open(sink, schema_node, properties); FileWriter writer(pool, std::move(parquet_writer)); - // TODO: Support writing chunked arrays. + // TODO(ARROW-232) Support writing chunked arrays. for (int i = 0; i < table->num_columns(); i++) { if (table->column(i)->data()->num_chunks() != 1) { return Status::NotImplemented("No support for writing chunked arrays yet."); } } - // Cast to PrimitiveArray instances as we work with them. - std::vector> arrays(table->num_columns()); - for (int i = 0; i < table->num_columns(); i++) { - // num_chunks == 1 as per above loop - std::shared_ptr array = table->column(i)->data()->chunk(0); - auto primitive_array = std::dynamic_pointer_cast(array); - if (!primitive_array) { - PARQUET_IGNORE_NOT_OK(writer.Close()); - return Status::NotImplemented("Table must consist of PrimitiveArray instances"); - } - arrays[i] = primitive_array; - } - for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { int64_t offset = chunk * chunk_size; int64_t size = std::min(chunk_size, table->num_rows() - offset); RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); for (int i = 0; i < table->num_columns(); i++) { - RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), + std::shared_ptr array = table->column(i)->data()->chunk(0); + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(array.get(), offset, size), PARQUET_IGNORE_NOT_OK(writer.Close())); } } diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 93693f51184..cfd80d80b79 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -25,10 +25,12 @@ namespace arrow { +class Array; class MemoryPool; class PrimitiveArray; class RowBatch; class Status; +class StringArray; class Table; namespace parquet { @@ -43,8 +45,7 @@ class FileWriter { FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); Status NewRowGroup(int64_t chunk_size); - Status WriteFlatColumnChunk( - const PrimitiveArray* data, int64_t offset = 0, int64_t length = -1); + Status WriteFlatColumnChunk(const Array* data, int64_t offset = 0, int64_t length = -1); Status Close(); virtual ~FileWriter(); @@ -62,7 +63,9 @@ class FileWriter { * The table shall only consist of nullable, non-repeated columns of primitive type. 
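 * Each chunk of at most chunk_size rows is written out as a separate row
 * group; the optional WriterProperties argument (see below) controls, among
 * other things, the targeted Parquet format version.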
*/
 Status WriteFlatTable(const Table* table, MemoryPool* pool,
-    std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size);
+    const std::shared_ptr<::parquet::OutputStream>& sink, int64_t chunk_size,
+    const std::shared_ptr<::parquet::WriterProperties>& properties =
+        ::parquet::default_writer_properties());

 }  // namespace parquet

diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 2f81161d1d6..055dac74444 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -50,6 +50,8 @@
     if (!s.ok()) { FAIL() << s.ToString(); } \
   } while (0)

+#define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr))
+
 #define EXPECT_OK(expr) \
   do { \
     Status s = (expr); \
diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 08fc8478e6d..f4b47f9d2f5 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -133,6 +133,11 @@ Status PrimitiveBuilder::Append(
   RETURN_NOT_OK(Reserve(length));

   for (int i = 0; i < length; ++i) {
+    // Skip reading from uninitialized memory.
+    // TODO: This is mainly here to keep valgrind happy; it may or may not
+    // have a performance impact.
+    if ((valid_bytes != nullptr) && !valid_bytes[i]) continue;
+
     if (values[i] > 0) {
       util::set_bit(raw_data_, length_ + i);
     } else {
diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd
index 0918344070e..a2f83ea5ea5 100644
--- a/python/pyarrow/includes/parquet.pxd
+++ b/python/pyarrow/includes/parquet.pxd
@@ -32,6 +32,10 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil:
         pass

 cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
+    enum ParquetVersion" parquet::ParquetVersion::type":
+        PARQUET_1_0" parquet::ParquetVersion::PARQUET_1_0"
+        PARQUET_2_0" parquet::ParquetVersion::PARQUET_2_0"
+
     cdef cppclass SchemaDescriptor:
         shared_ptr[Node] schema()
         GroupNode* group()
@@ -80,6 +84,11 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:
         LocalFileOutputStream(const c_string& path)
         void Close()

+    cdef cppclass WriterProperties:
+        cppclass Builder:
+            Builder* version(ParquetVersion version)
+            shared_ptr[WriterProperties] build()
+
 cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil:
     cdef cppclass FileReader:
@@ -93,5 +102,7 @@ cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil:

 cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil:
-    cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool, shared_ptr[OutputStream] sink, int64_t chunk_size)
+    cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool,
+            const shared_ptr[OutputStream]& sink, int64_t chunk_size,
+            const shared_ptr[WriterProperties]& properties)

diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx
index 3d5355ebe43..0b2b2088033 100644
--- a/python/pyarrow/parquet.pyx
+++ b/python/pyarrow/parquet.pyx
@@ -24,6 +24,7 @@ cimport pyarrow.includes.pyarrow as pyarrow
 from pyarrow.includes.parquet cimport *

 from pyarrow.compat import tobytes
+from pyarrow.error import ArrowException
 from pyarrow.error cimport check_cstatus
 from pyarrow.table cimport Table
@@ -42,11 +43,13 @@ def read_table(filename, columns=None):
     # in Cython (due to missing rvalue support)
     reader = unique_ptr[FileReader](new FileReader(default_memory_pool(),
         ParquetFileReader.OpenFile(tobytes(filename))))
-    check_cstatus(reader.get().ReadFlatTable(&ctable))
+    with nogil:
+        check_cstatus(reader.get().ReadFlatTable(&ctable))
+
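+    # Back under the GIL: hand the C++ table over to the Python-level Table.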
table.init(ctable) return table -def write_table(table, filename, chunk_size=None): +def write_table(table, filename, chunk_size=None, version=None): """ Write a Table to Parquet format @@ -56,16 +59,29 @@ def write_table(table, filename, chunk_size=None): filename : string chunk_size : int The maximum number of rows in each Parquet RowGroup + version : {"1.0", "2.0"}, default "1.0" + The Parquet format version, defaults to 1.0 """ cdef Table table_ = table cdef CTable* ctable_ = table_.table cdef shared_ptr[OutputStream] sink + cdef WriterProperties.Builder properties_builder cdef int64_t chunk_size_ = 0 if chunk_size is None: chunk_size_ = min(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size + if version is not None: + if version == "1.0": + properties_builder.version(PARQUET_1_0) + elif version == "2.0": + properties_builder.version(PARQUET_2_0) + else: + raise ArrowException("Unsupported Parquet format version") + sink.reset(new LocalFileOutputStream(tobytes(filename))) - check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) + with nogil: + check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, + chunk_size_, properties_builder.build())) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d92cf4ca656..de9cfbb46e1 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -42,18 +42,55 @@ def test_single_pylist_column_roundtrip(tmpdir): data_read = col_read.data.chunk(0) assert data_written.equals(data_read) -def test_pandas_rountrip(tmpdir): +def test_pandas_parquet_2_0_rountrip(tmpdir): size = 10000 + np.random.seed(0) df = pd.DataFrame({ + 'uint8': np.arange(size, dtype=np.uint8), + 'uint16': np.arange(size, dtype=np.uint16), + 'uint32': np.arange(size, dtype=np.uint32), + 'uint64': np.arange(size, dtype=np.uint64), + 'int8': np.arange(size, dtype=np.int16), + 'int16': np.arange(size, dtype=np.int16), 'int32': np.arange(size, dtype=np.int32), 'int64': np.arange(size, dtype=np.int64), 'float32': np.arange(size, dtype=np.float32), - 'float64': np.arange(size, dtype=np.float64) + 'float64': np.arange(size, dtype=np.float64), + 'bool': np.random.randn(size) > 0, + 'str': [str(x) for x in range(size)], + 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None] }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df) - A.parquet.write_table(arrow_table, filename.strpath) + A.parquet.write_table(arrow_table, filename.strpath, version="2.0") table_read = pyarrow.parquet.read_table(filename.strpath) df_read = table_read.to_pandas() pdt.assert_frame_equal(df, df_read) +def test_pandas_parquet_1_0_rountrip(tmpdir): + size = 10000 + np.random.seed(0) + df = pd.DataFrame({ + 'uint8': np.arange(size, dtype=np.uint8), + 'uint16': np.arange(size, dtype=np.uint16), + 'uint32': np.arange(size, dtype=np.uint32), + 'uint64': np.arange(size, dtype=np.uint64), + 'int8': np.arange(size, dtype=np.int16), + 'int16': np.arange(size, dtype=np.int16), + 'int32': np.arange(size, dtype=np.int32), + 'int64': np.arange(size, dtype=np.int64), + 'float32': np.arange(size, dtype=np.float32), + 'float64': np.arange(size, dtype=np.float64), + 'bool': np.random.randn(size) > 0 + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + A.parquet.write_table(arrow_table, filename.strpath, version="1.0") + table_read = pyarrow.parquet.read_table(filename.strpath) + df_read = table_read.to_pandas() + 
+ # We pass uint32_t as int64_t if we write Parquet version 1.0 + df['uint32'] = df['uint32'].values.astype(np.int64) + + pdt.assert_frame_equal(df, df_read) + From fab4c82d2668e4f8c450053c34dd70ea99365fac Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 1 Jul 2016 14:25:46 -0700 Subject: [PATCH 094/210] ARROW-234: Build libhdfs IO extension in conda artifacts Author: Wes McKinney Closes #97 from wesm/ARROW-234 and squashes the following commits: 3edb8d1 [Wes McKinney] Enable ARROW_HDFS extension in conda artifact --- cpp/conda.recipe/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh index b10dd03349b..7e60ccc911f 100644 --- a/cpp/conda.recipe/build.sh +++ b/cpp/conda.recipe/build.sh @@ -49,6 +49,7 @@ cmake \ -DCMAKE_BUILD_TYPE=release \ -DCMAKE_INSTALL_PREFIX=$PREFIX \ -DCMAKE_SHARED_LINKER_FLAGS=$SHARED_LINKER_FLAGS \ + -DARROW_HDFS=on \ -DARROW_IPC=on \ -DARROW_PARQUET=on \ .. From 77598fa59a92c07dedf7d93307e5c72c5b2724d0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 10 Jul 2016 13:17:50 -0700 Subject: [PATCH 095/210] ARROW-233: Add visibility macros, add static build option This also resolves ARROW-213. Builds off work done in PARQUET-489. I inserted a hack to deal with the fast the boost libs in apt won't statically link properly. We'll deal with that some other time. Author: Wes McKinney Closes #100 from wesm/ARROW-233 and squashes the following commits: 0253827 [Wes McKinney] Remove -Wno-unused-local-typedef 69b03b0 [Wes McKinney] - Add visibility macros. Hide boost symbols in arrow_io - Hack around Travis CI inability to use its boost static libraries - Use parquet_shared name - More informative verbose test logs - Fix some gtest-1.7.0 crankiness - Fix a valgrind shared_ptr possible memory leak stemming from static variable referenced at compile-time in libarrow_parquet - Fix a bunch of compiler warnings in release builds --- ci/travis_install_conda.sh | 1 - ci/travis_script_cpp.sh | 2 +- ci/travis_script_python.sh | 6 +- cpp/CMakeLists.txt | 217 ++++++++++++----------- cpp/build-support/run-test.sh | 10 +- cpp/conda.recipe/build.sh | 13 +- cpp/src/arrow/array.h | 5 +- cpp/src/arrow/builder.h | 3 +- cpp/src/arrow/column.h | 5 +- cpp/src/arrow/io/CMakeLists.txt | 53 ++++-- cpp/src/arrow/io/hdfs-io-test.cc | 2 +- cpp/src/arrow/io/hdfs.h | 17 +- cpp/src/arrow/io/libhdfs_shim.cc | 3 +- cpp/src/arrow/io/symbols.map | 18 ++ cpp/src/arrow/ipc/CMakeLists.txt | 2 +- cpp/src/arrow/parquet/CMakeLists.txt | 4 +- cpp/src/arrow/parquet/parquet-io-test.cc | 18 +- cpp/src/arrow/parquet/reader.cc | 2 +- cpp/src/arrow/parquet/reader.h | 6 +- cpp/src/arrow/parquet/schema.h | 10 +- cpp/src/arrow/parquet/writer.cc | 4 +- cpp/src/arrow/parquet/writer.h | 6 +- cpp/src/arrow/schema.h | 4 +- cpp/src/arrow/symbols.map | 15 ++ cpp/src/arrow/table.h | 6 +- cpp/src/arrow/type.h | 39 ++-- cpp/src/arrow/types/construct.h | 11 +- cpp/src/arrow/types/decimal.h | 3 +- cpp/src/arrow/types/list.h | 7 +- cpp/src/arrow/types/primitive.h | 13 +- cpp/src/arrow/types/string-test.cc | 8 +- cpp/src/arrow/types/string.cc | 11 +- cpp/src/arrow/types/string.h | 16 +- cpp/src/arrow/types/struct-test.cc | 8 +- cpp/src/arrow/types/struct.h | 5 +- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/buffer.h | 12 +- cpp/src/arrow/util/memory-pool-test.cc | 2 +- cpp/src/arrow/util/memory-pool.h | 6 +- cpp/src/arrow/util/status.cc | 3 + cpp/src/arrow/util/status.h | 4 +- cpp/src/arrow/util/visibility.h | 32 ++++ python/conda.recipe/build.sh | 15 +- 
python/src/pyarrow/adapters/builtin.h | 2 + python/src/pyarrow/adapters/pandas.h | 5 + python/src/pyarrow/common.h | 6 +- python/src/pyarrow/config.h | 4 + python/src/pyarrow/helpers.h | 3 + python/src/pyarrow/status.h | 4 +- python/src/pyarrow/visibility.h | 32 ++++ 50 files changed, 439 insertions(+), 245 deletions(-) create mode 100644 cpp/src/arrow/io/symbols.map create mode 100644 cpp/src/arrow/symbols.map create mode 100644 cpp/src/arrow/util/visibility.h create mode 100644 python/src/pyarrow/visibility.h diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index be7f59a4733..3a8f57bf8f1 100644 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -25,4 +25,3 @@ conda install --yes conda-build jinja2 anaconda-client # faster builds, please conda install -y nomkl - diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index 9cf4f8e3521..a3585507f0a 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -16,6 +16,6 @@ make lint # make check-clang-tidy # fi -ctest -L unittest +ctest -VV -L unittest popd diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6d35785356a..4a377428ae4 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,7 +7,6 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" -export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ @@ -32,12 +31,15 @@ python_version_tests() { # Expensive dependencies install from Continuum package repo conda install -y pip numpy pandas cython + conda install -y parquet-cpp arrow-cpp -c apache/channel/dev + # Other stuff pip install pip install -r requirements.txt export ARROW_HOME=$ARROW_CPP_INSTALL - python setup.py build_ext --inplace + python setup.py build_ext \ + --inplace python -m pytest -vv -r sxX pyarrow } diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 18b47599b93..a39a7521231 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -44,12 +44,22 @@ endif(CCACHE_FOUND) # Top level cmake dir if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + option(ARROW_BUILD_STATIC + "Build the libarrow static libraries" + ON) + + option(ARROW_BUILD_SHARED + "Build the libarrow shared libraries" + ON) + option(ARROW_PARQUET "Build the Parquet adapter and link to libparquet" OFF) + option(ARROW_TEST_MEMCHECK - "Run the test suite using valgrind --tool=memcheck" - OFF) + "Run the test suite using valgrind --tool=memcheck" + OFF) + option(ARROW_BUILD_TESTS "Build the Arrow googletest unit tests" ON) @@ -66,6 +76,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the Arrow IO extensions for the Hadoop file system" OFF) + option(ARROW_BOOST_USE_SHARED + "Rely on boost shared libraries where relevant" + ON) + option(ARROW_SSE3 "Build Arrow with SSE3" ON) @@ -172,18 +186,6 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CLANG_OPTIONS}") endif() -# Sanity check linking option. -if (NOT ARROW_LINK) - set(ARROW_LINK "d") -elseif(NOT ("auto" MATCHES "^${ARROW_LINK}" OR - "dynamic" MATCHES "^${ARROW_LINK}" OR - "static" MATCHES "^${ARROW_LINK}")) - message(FATAL_ERROR "Unknown value for ARROW_LINK, must be auto|dynamic|static") -else() - # Remove all but the first letter. 
- string(SUBSTRING "${ARROW_LINK}" 0 1 ARROW_LINK) -endif() - # ASAN / TSAN / UBSAN include(san-config) @@ -203,61 +205,11 @@ if ("${ARROW_GENERATE_COVERAGE}") # For coverage to work properly, we need to use static linkage. Otherwise, # __gcov_flush() doesn't properly flush coverage from every module. # See http://stackoverflow.com/questions/28164543/using-gcov-flush-within-a-library-doesnt-force-the-other-modules-to-yield-gc - if("${ARROW_LINK}" STREQUAL "a") - message("Using static linking for coverage build") - set(ARROW_LINK "s") - elseif("${ARROW_LINK}" STREQUAL "d") - message(SEND_ERROR "Cannot use coverage with dynamic linking") - endif() -endif() - -# If we still don't know what kind of linking to perform, choose based on -# build type (developers like fast builds). -if ("${ARROW_LINK}" STREQUAL "a") - if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG" OR - "${CMAKE_BUILD_TYPE}" STREQUAL "FASTDEBUG") - message("Using dynamic linking for ${CMAKE_BUILD_TYPE} builds") - set(ARROW_LINK "d") - else() - message("Using static linking for ${CMAKE_BUILD_TYPE} builds") - set(ARROW_LINK "s") + if(NOT ARROW_BUILD_STATIC) + message(SEND_ERROR "Coverage requires the static lib to be built") endif() endif() -# Are we using the gold linker? It doesn't work with dynamic linking as -# weak symbols aren't properly overridden, causing tcmalloc to be omitted. -# Let's flag this as an error in RELEASE builds (we shouldn't release a -# product like this). -# -# See https://sourceware.org/bugzilla/show_bug.cgi?id=16979 for details. -# -# The gold linker is only for ELF binaries, which OSX doesn't use. We can -# just skip. -if (NOT APPLE) - execute_process(COMMAND ${CMAKE_CXX_COMPILER} -Wl,--version OUTPUT_VARIABLE LINKER_OUTPUT) -endif () -if (LINKER_OUTPUT MATCHES "gold") - if ("${ARROW_LINK}" STREQUAL "d" AND - "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") - message(SEND_ERROR "Cannot use gold with dynamic linking in a RELEASE build " - "as it would cause tcmalloc symbols to get dropped") - else() - message("Using gold linker") - endif() - set(ARROW_USING_GOLD 1) -else() - message("Using ld linker") -endif() - -# Having set ARROW_LINK due to build type and/or sanitizer, it's now safe to -# act on its value. -if ("${ARROW_LINK}" STREQUAL "d") - set(BUILD_SHARED_LIBS ON) - - # Position independent code is only necessary when producing shared objects. - add_definitions(-fPIC) -endif() - # set compile output directory string (TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME) @@ -290,6 +242,15 @@ set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}") set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}") include_directories(src) +############################################################ +# Visibility +############################################################ +# For generate_export_header() and add_compiler_export_flags(). +include(GenerateExportHeader) + +# Sets -fvisibility=hidden for gcc +add_compiler_export_flags() + ############################################################ # Benchmarking ############################################################ @@ -360,7 +321,7 @@ endfunction() # # Arguments after the test name will be passed to set_tests_properties(). 
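 # Note: the test executables link against the static arrow library, so tests
 # are only registered when ARROW_BUILD_STATIC is enabled (see the guard below).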
function(ADD_ARROW_TEST REL_TEST_NAME) - if(NO_TESTS) + if(NO_TESTS OR NOT ARROW_BUILD_STATIC) return() endif() get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) @@ -377,13 +338,13 @@ function(ADD_ARROW_TEST REL_TEST_NAME) endif() if (ARROW_TEST_MEMCHECK) - SET_PROPERTY(TARGET ${TEST_NAME} - APPEND_STRING PROPERTY - COMPILE_FLAGS " -DARROW_VALGRIND") - add_test(${TEST_NAME} - valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH}) + SET_PROPERTY(TARGET ${TEST_NAME} + APPEND_STRING PROPERTY + COMPILE_FLAGS " -DARROW_VALGRIND") + add_test(${TEST_NAME} + valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH}) else() - add_test(${TEST_NAME} + add_test(${TEST_NAME} ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) endif() set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") @@ -427,19 +388,34 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() - if(("${ARROW_LINK}" STREQUAL "s" AND ARG_STATIC_LIB) OR (NOT ARG_SHARED_LIB)) + if(ARG_STATIC_LIB AND ARG_SHARED_LIB) if(NOT ARG_STATIC_LIB) message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") endif() + + SET(AUG_LIB_NAME "${LIB_NAME}_static") + add_library(${AUG_LIB_NAME} STATIC IMPORTED) + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + + SET(AUG_LIB_NAME "${LIB_NAME}_shared") + add_library(${AUG_LIB_NAME} SHARED IMPORTED) + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + elseif(ARG_STATIC_LIB) add_library(${LIB_NAME} STATIC IMPORTED) set_target_properties(${LIB_NAME} PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") - else() + elseif(ARG_SHARED_LIB) add_library(${LIB_NAME} SHARED IMPORTED) set_target_properties(${LIB_NAME} PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + else() + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") endif() if(ARG_DEPS) @@ -538,9 +514,17 @@ endif() ############################################################ # Linker setup ############################################################ -set(ARROW_MIN_TEST_LIBS arrow arrow_test_main ${ARROW_BASE_LIBS}) +set(ARROW_MIN_TEST_LIBS + arrow_static + arrow_test_main + ${ARROW_BASE_LIBS}) + set(ARROW_TEST_LINK_LIBS ${ARROW_MIN_TEST_LIBS}) -set(ARROW_BENCHMARK_LINK_LIBS arrow arrow_benchmark_main ${ARROW_BASE_LIBS}) + +set(ARROW_BENCHMARK_LINK_LIBS + arrow_static + arrow_benchmark_main + ${ARROW_BASE_LIBS}) ############################################################ # "make ctags" target @@ -576,14 +560,14 @@ endif (UNIX) if (UNIX) file(GLOB_RECURSE LINT_FILES - "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h" - "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc" - ) + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.h" + "${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc" + ) FOREACH(item ${LINT_FILES}) - IF(NOT (item MATCHES "_generated.h")) + IF(NOT (item MATCHES "_generated.h")) LIST(APPEND FILTERED_LINT_FILES ${item}) - ENDIF() + ENDIF() ENDFOREACH(item ${LINT_FILES}) # Full lint @@ -628,7 +612,10 @@ endif() # Subdirectories ############################################################ -set(LIBARROW_LINK_LIBS +set(ARROW_LINK_LIBS +) + 
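+# Both lists are empty for now; keeping public and private dependencies
+# separate ensures private ones never leak into the exported link interface.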
+set(ARROW_PRIVATE_LINK_LIBS ) set(ARROW_SRCS @@ -660,35 +647,67 @@ set(ARROW_SRCS src/arrow/util/status.cc ) -set(LIBARROW_LINKAGE "SHARED") - -add_library(arrow - ${LIBARROW_LINKAGE} +add_library(arrow_objlib OBJECT ${ARROW_SRCS} ) +# Necessary to make static linking into other shared libraries work properly +set_property(TARGET arrow_objlib PROPERTY POSITION_INDEPENDENT_CODE 1) + +if(NOT APPLE) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. + set(SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map") +endif() + +if (ARROW_BUILD_SHARED) + add_library(arrow_shared SHARED $) + if(APPLE) + set_target_properties(arrow_shared PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + endif() + set_target_properties(arrow_shared + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" + LINK_FLAGS "${SHARED_LINK_FLAGS}" + OUTPUT_NAME "arrow") + target_link_libraries(arrow_shared + LINK_PUBLIC ${ARROW_LINK_LIBS} + LINK_PRIVATE ${ARROW_PRIVATE_LINK_LIBS}) + + install(TARGETS arrow_shared + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endif() + +if (ARROW_BUILD_STATIC) + add_library(arrow_static STATIC $) + set_target_properties(arrow_static + PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" + OUTPUT_NAME "arrow") + + target_link_libraries(arrow_static + LINK_PUBLIC ${ARROW_LINK_LIBS} + LINK_PRIVATE ${ARROW_PRIVATE_LINK_LIBS}) + + install(TARGETS arrow_static + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endif() + if (APPLE) - set_target_properties(arrow + set_target_properties(arrow_shared PROPERTIES BUILD_WITH_INSTALL_RPATH ON INSTALL_NAME_DIR "@rpath") endif() -set_target_properties(arrow - PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" -) -target_link_libraries(arrow ${LIBARROW_LINK_LIBS}) - add_subdirectory(src/arrow) add_subdirectory(src/arrow/io) add_subdirectory(src/arrow/util) add_subdirectory(src/arrow/types) -install(TARGETS arrow - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - #---------------------------------------------------------------------- # Parquet adapter library @@ -715,7 +734,7 @@ if(ARROW_IPC) include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) add_library(flatbuffers STATIC IMPORTED) set_target_properties(flatbuffers PROPERTIES - IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB}) + IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB}) add_subdirectory(src/arrow/ipc) endif() diff --git a/cpp/build-support/run-test.sh b/cpp/build-support/run-test.sh index 0e628e26ecd..f563da53679 100755 --- a/cpp/build-support/run-test.sh +++ b/cpp/build-support/run-test.sh @@ -79,16 +79,16 @@ function setup_sanitizers() { TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt" TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" export TSAN_OPTIONS - + # Enable leak detection even under LLVM 3.4, where it was disabled by default. # This flag only takes effect when running an ASAN build. ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" export ASAN_OPTIONS - + # Set up suppressions for LeakSanitizer LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" export LSAN_OPTIONS - + # Suppressions require symbolization. We'll default to using the symbolizer in # thirdparty. 
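  # (Callers may pre-set ASAN_SYMBOLIZER_PATH to override this default.)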
if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then @@ -107,7 +107,7 @@ function run_test() { | $ROOT/build-support/asan_symbolize.py \ | c++filt \ | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \ - | $pipe_cmd > $LOGFILE + | $pipe_cmd 2>&1 | tee $LOGFILE STATUS=$? # TSAN doesn't always exit with a non-zero exit code due to a bug: @@ -198,7 +198,7 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do fi done -if [ $RUN_TYPE = "test" ]; then +if [ $RUN_TYPE = "test" ]; then post_process_tests fi diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh index 7e60ccc911f..2f2b7482667 100644 --- a/cpp/conda.recipe/build.sh +++ b/cpp/conda.recipe/build.sh @@ -39,16 +39,17 @@ pwd source thirdparty/versions.sh export GTEST_HOME=`pwd`/thirdparty/$GTEST_BASEDIR -if [ `uname` == Linux ]; then - SHARED_LINKER_FLAGS='-static-libstdc++' -elif [ `uname` == Darwin ]; then - SHARED_LINKER_FLAGS='' -fi +# if [ `uname` == Linux ]; then +# SHARED_LINKER_FLAGS='-static-libstdc++' +# elif [ `uname` == Darwin ]; then +# SHARED_LINKER_FLAGS='' +# fi + +# -DCMAKE_SHARED_LINKER_FLAGS=$SHARED_LINKER_FLAGS \ cmake \ -DCMAKE_BUILD_TYPE=release \ -DCMAKE_INSTALL_PREFIX=$PREFIX \ - -DCMAKE_SHARED_LINKER_FLAGS=$SHARED_LINKER_FLAGS \ -DARROW_HDFS=on \ -DARROW_IPC=on \ -DARROW_PARQUET=on \ diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 76dc0f59814..c7ffb23ca18 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -24,6 +24,7 @@ #include "arrow/type.h" #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -35,7 +36,7 @@ class Status; // // The base class is only required to have a null bitmap buffer if the null // count is greater than 0 -class Array { +class ARROW_EXPORT Array { public: Array(const std::shared_ptr& type, int32_t length, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -83,7 +84,7 @@ class Array { }; // Degenerate null type Array -class NullArray : public Array { +class ARROW_EXPORT NullArray : public Array { public: NullArray(const std::shared_ptr& type, int32_t length) : Array(type, length, length, nullptr) {} diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 7d3f4398d73..5d9fb992ff0 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -25,6 +25,7 @@ #include "arrow/type.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -38,7 +39,7 @@ static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; // This class provides a facilities for incrementally building the null bitmap // (see Append methods) and as a side effect the current number of slots and // the null count. 
-class ArrayBuilder { +class ARROW_EXPORT ArrayBuilder { public: explicit ArrayBuilder(MemoryPool* pool, const TypePtr& type) : pool_(pool), diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index e409566e1f1..d5168cb032b 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -24,6 +24,7 @@ #include #include "arrow/type.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -34,7 +35,7 @@ typedef std::vector> ArrayVector; // A data structure managing a list of primitive Arrow arrays logically as one // large array -class ChunkedArray { +class ARROW_EXPORT ChunkedArray { public: explicit ChunkedArray(const ArrayVector& chunks); @@ -56,7 +57,7 @@ class ChunkedArray { // An immutable column data structure consisting of a field (type metadata) and // a logical chunked data array (which can be validated as all being the same // type). -class Column { +class ARROW_EXPORT Column { public: Column(const std::shared_ptr& field, const ArrayVector& chunks); Column(const std::shared_ptr& field, const std::shared_ptr& data); diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 33b654f8190..b8c0e138afb 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -19,13 +19,18 @@ # arrow_io : Arrow IO interfaces set(ARROW_IO_LINK_LIBS - arrow + arrow_shared ) -set(ARROW_IO_PRIVATE_LINK_LIBS - boost_system - boost_filesystem -) +if (ARROW_BOOST_USE_SHARED) + set(ARROW_IO_PRIVATE_LINK_LIBS + boost_system_shared + boost_filesystem_shared) +else() + set(ARROW_IO_PRIVATE_LINK_LIBS + boost_system_static + boost_filesystem_static) +endif() set(ARROW_IO_TEST_LINK_LIBS arrow_io @@ -36,18 +41,18 @@ set(ARROW_IO_SRCS if(ARROW_HDFS) if(NOT THIRDPARTY_DIR) - message(FATAL_ERROR "THIRDPARTY_DIR not set") + message(FATAL_ERROR "THIRDPARTY_DIR not set") endif() if (DEFINED ENV{HADOOP_HOME}) - set(HADOOP_HOME $ENV{HADOOP_HOME}) + set(HADOOP_HOME $ENV{HADOOP_HOME}) else() - set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop") + set(HADOOP_HOME "${THIRDPARTY_DIR}/hadoop") endif() set(HDFS_H_PATH "${HADOOP_HOME}/include/hdfs.h") if (NOT EXISTS ${HDFS_H_PATH}) - message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}") + message(FATAL_ERROR "Did not find hdfs.h at ${HDFS_H_PATH}") endif() message(STATUS "Found hdfs.h at: " ${HDFS_H_PATH}) message(STATUS "Building libhdfs shim component") @@ -55,29 +60,39 @@ if(ARROW_HDFS) include_directories(SYSTEM "${HADOOP_HOME}/include") set(ARROW_HDFS_SRCS - hdfs.cc - libhdfs_shim.cc) + hdfs.cc + libhdfs_shim.cc) set_property(SOURCE ${ARROW_HDFS_SRCS} - APPEND_STRING PROPERTY - COMPILE_FLAGS "-DHAS_HADOOP") + APPEND_STRING PROPERTY + COMPILE_FLAGS "-DHAS_HADOOP") set(ARROW_IO_SRCS - ${ARROW_HDFS_SRCS} - ${ARROW_IO_SRCS}) + ${ARROW_HDFS_SRCS} + ${ARROW_IO_SRCS}) ADD_ARROW_TEST(hdfs-io-test) ARROW_TEST_LINK_LIBRARIES(hdfs-io-test - ${ARROW_IO_TEST_LINK_LIBS}) + ${ARROW_IO_TEST_LINK_LIBS}) endif() add_library(arrow_io SHARED ${ARROW_IO_SRCS} ) -target_link_libraries(arrow_io LINK_PUBLIC ${ARROW_IO_LINK_LIBS}) -target_link_libraries(arrow_io LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS}) +target_link_libraries(arrow_io + LINK_PUBLIC ${ARROW_IO_LINK_LIBS} + LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS}) + +if(NOT APPLE) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. 
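+  # On OS X, ARROW_IO_LINK_FLAGS therefore stays empty and the third-party
+  # symbols remain visible in arrow_io.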
+ set(ARROW_IO_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map") +endif() -SET_TARGET_PROPERTIES(arrow_io PROPERTIES LINKER_LANGUAGE CXX) +SET_TARGET_PROPERTIES(arrow_io PROPERTIES + LINKER_LANGUAGE CXX + LINK_FLAGS "${ARROW_IO_LINK_FLAGS}") if (APPLE) set_target_properties(arrow_io diff --git a/cpp/src/arrow/io/hdfs-io-test.cc b/cpp/src/arrow/io/hdfs-io-test.cc index 11d67aeba20..d1bf140ae68 100644 --- a/cpp/src/arrow/io/hdfs-io-test.cc +++ b/cpp/src/arrow/io/hdfs-io-test.cc @@ -227,7 +227,7 @@ TEST_F(TestHdfsClient, ListDirectory) { // Do it again, appends! ASSERT_OK(client_->ListDirectory(scratch_dir_, &listing)); - ASSERT_EQ(6, listing.size()); + ASSERT_EQ(6, static_cast(listing.size())); // Argh, well, shouldn't expect the listing to be in any particular order for (size_t i = 0; i < listing.size(); ++i) { diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index a1972db9615..532e3c536a1 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -25,6 +25,7 @@ #include "arrow/io/interfaces.h" #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -32,8 +33,6 @@ class Status; namespace io { -Status ConnectLibHdfs(); - class HdfsClient; class HdfsReadableFile; class HdfsWriteableFile; @@ -64,7 +63,7 @@ struct HdfsConnectionConfig { // TODO: Kerberos, etc. }; -class HdfsClient : public FileSystemClient { +class ARROW_EXPORT HdfsClient : public FileSystemClient { public: ~HdfsClient(); @@ -149,14 +148,14 @@ class HdfsClient : public FileSystemClient { friend class HdfsReadableFile; friend class HdfsWriteableFile; - class HdfsClientImpl; + class ARROW_NO_EXPORT HdfsClientImpl; std::unique_ptr impl_; HdfsClient(); DISALLOW_COPY_AND_ASSIGN(HdfsClient); }; -class HdfsReadableFile : public RandomAccessFile { +class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { public: ~HdfsReadableFile(); @@ -175,7 +174,7 @@ class HdfsReadableFile : public RandomAccessFile { Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override; private: - class HdfsReadableFileImpl; + class ARROW_NO_EXPORT HdfsReadableFileImpl; std::unique_ptr impl_; friend class HdfsClient::HdfsClientImpl; @@ -184,7 +183,7 @@ class HdfsReadableFile : public RandomAccessFile { DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); }; -class HdfsWriteableFile : public WriteableFile { +class ARROW_EXPORT HdfsWriteableFile : public WriteableFile { public: ~HdfsWriteableFile(); @@ -197,7 +196,7 @@ class HdfsWriteableFile : public WriteableFile { Status Tell(int64_t* position) override; private: - class HdfsWriteableFileImpl; + class ARROW_NO_EXPORT HdfsWriteableFileImpl; std::unique_ptr impl_; friend class HdfsClient::HdfsClientImpl; @@ -207,6 +206,8 @@ class HdfsWriteableFile : public WriteableFile { DISALLOW_COPY_AND_ASSIGN(HdfsWriteableFile); }; +Status ARROW_EXPORT ConnectLibHdfs(); + } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index f75266536e5..003570d4fde 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -55,6 +55,7 @@ extern "C" { #include // NOLINT #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace fs = boost::filesystem; @@ -496,7 +497,7 @@ static arrow::Status try_dlopen( namespace arrow { namespace io { -Status ConnectLibHdfs() { +Status ARROW_EXPORT ConnectLibHdfs() { static std::mutex lock; std::lock_guard guard(lock); diff --git a/cpp/src/arrow/io/symbols.map b/cpp/src/arrow/io/symbols.map new 
file mode 100644 index 00000000000..b4ad98cd7f2 --- /dev/null +++ b/cpp/src/arrow/io/symbols.map @@ -0,0 +1,18 @@ +{ + # Symbols marked as 'local' are not exported by the DSO and thus may not + # be used by client applications. + local: + # devtoolset / static-libstdc++ symbols + __cxa_*; + + extern "C++" { + # boost + boost::*; + + # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically + # links c++11 symbols into binaries so that the result may be executed on + # a system with an older libstdc++ which doesn't include the necessary + # c++11 symbols. + std::*; + }; +}; diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 383684f42f9..82634169ed9 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -48,4 +48,4 @@ add_custom_command( ) add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) -add_dependencies(arrow metadata_fbs) +add_dependencies(arrow_objlib metadata_fbs) diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index f00bb53c084..00f19b354e3 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -25,8 +25,8 @@ set(PARQUET_SRCS ) set(PARQUET_LIBS - arrow - ${PARQUET_SHARED_LIB} + arrow_shared + parquet_shared ) add_library(arrow_parquet SHARED diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 572cae16e58..bfc27d26d63 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -411,7 +411,7 @@ class TestPrimitiveParquetIO : public TestParquetIO { public: typedef typename TestType::c_type T; - void TestFile(std::vector& values, int num_chunks, + void MakeTestFile(std::vector& values, int num_chunks, std::unique_ptr* file_reader) { std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); std::unique_ptr file_writer = this->MakeWriter(schema); @@ -435,10 +435,10 @@ class TestPrimitiveParquetIO : public TestParquetIO { *file_reader = this->ReaderFromSink(); } - void TestSingleColumnRequiredTableRead(int num_chunks) { + void CheckSingleColumnRequiredTableRead(int num_chunks) { std::vector values(SMALL_SIZE, test_traits::value); std::unique_ptr file_reader; - ASSERT_NO_THROW(TestFile(values, num_chunks, &file_reader)); + ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader)); std::shared_ptr
out; this->ReadTableFromFile(std::move(file_reader), &out); @@ -450,10 +450,10 @@ class TestPrimitiveParquetIO : public TestParquetIO { ExpectArray(values.data(), chunked_array->chunk(0).get()); } - void TestSingleColumnRequiredRead(int num_chunks) { + void CheckSingleColumnRequiredRead(int num_chunks) { std::vector values(SMALL_SIZE, test_traits::value); std::unique_ptr file_reader; - ASSERT_NO_THROW(TestFile(values, num_chunks, &file_reader)); + ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader)); std::shared_ptr out; this->ReadSingleColumnFile(std::move(file_reader), &out); @@ -469,19 +469,19 @@ typedef ::testing::TypesTestSingleColumnRequiredRead(1); + this->CheckSingleColumnRequiredRead(1); } TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredTableRead) { - this->TestSingleColumnRequiredTableRead(1); + this->CheckSingleColumnRequiredTableRead(1); } TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedRead) { - this->TestSingleColumnRequiredRead(4); + this->CheckSingleColumnRequiredRead(4); } TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { - this->TestSingleColumnRequiredTableRead(4); + this->CheckSingleColumnRequiredTableRead(4); } } // namespace parquet diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 7b05665b230..c7c400e9573 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -213,7 +213,7 @@ Status FlatColumnReader::Impl::ReadNonNullableBatch(typename ParquetType::c_type using ParquetCType = typename ParquetType::c_type; DCHECK(builder); - const ArrowCType* values_ptr; + const ArrowCType* values_ptr = nullptr; RETURN_NOT_OK( (ConvertPhysicalType(values, values_read, &values_ptr))); RETURN_NOT_OK(builder->Append(values_ptr, values_read)); diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index db7a15753d8..2c8a9dfd025 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -23,6 +23,8 @@ #include "parquet/api/reader.h" #include "parquet/api/schema.h" +#include "arrow/util/visibility.h" + namespace arrow { class Array; @@ -77,7 +79,7 @@ class FlatColumnReader; // // This is additionally complicated "chunky" repeated fields or very large byte // arrays -class FileReader { +class ARROW_EXPORT FileReader { public: FileReader(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader); @@ -107,7 +109,7 @@ class FileReader { // // We also do not expose any internal Parquet details, such as row groups. This // might change in the future. 
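// A minimal read sketch against this interface (a hedged illustration, not
// part of the patch; `pq_reader` stands in for an already-opened
// ::parquet::ParquetFileReader and the batch size of 1024 is arbitrary):
//
//   FileReader reader(default_memory_pool(), std::move(pq_reader));
//   std::unique_ptr<FlatColumnReader> column;
//   RETURN_NOT_OK(reader.GetFlatColumn(0, &column));
//   std::shared_ptr<Array> batch;
//   RETURN_NOT_OK(column->NextBatch(1024, &batch));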
-class FlatColumnReader { +class ARROW_EXPORT FlatColumnReader { public: virtual ~FlatColumnReader(); diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h index 39bee059522..88b5977d223 100644 --- a/cpp/src/arrow/parquet/schema.h +++ b/cpp/src/arrow/parquet/schema.h @@ -25,6 +25,7 @@ #include "arrow/schema.h" #include "arrow/type.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -32,15 +33,16 @@ class Status; namespace parquet { -Status NodeToField(const ::parquet::schema::NodePtr& node, std::shared_ptr* out); +Status ARROW_EXPORT NodeToField( + const ::parquet::schema::NodePtr& node, std::shared_ptr* out); -Status FromParquetSchema( +Status ARROW_EXPORT FromParquetSchema( const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out); -Status FieldToNode(const std::shared_ptr& field, +Status ARROW_EXPORT FieldToNode(const std::shared_ptr& field, const ::parquet::WriterProperties& properties, ::parquet::schema::NodePtr* out); -Status ToParquetSchema(const Schema* arrow_schema, +Status ARROW_EXPORT ToParquetSchema(const Schema* arrow_schema, const ::parquet::WriterProperties& properties, std::shared_ptr<::parquet::SchemaDescriptor>* out); diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 63449bb20b1..0139edd3bb8 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -118,7 +118,7 @@ Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer); if (writer->descr()->max_definition_level() == 0) { // no nulls, just dump the data - const ParquetCType* data_writer_ptr; + const ParquetCType* data_writer_ptr = nullptr; RETURN_NOT_OK((ConvertPhysicalType( data_ptr, length, &data_writer_ptr))); PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_writer_ptr)); @@ -128,7 +128,7 @@ Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, reinterpret_cast(def_levels_buffer_.mutable_data()); if (data->null_count() == 0) { std::fill(def_levels_ptr, def_levels_ptr + length, 1); - const ParquetCType* data_writer_ptr; + const ParquetCType* data_writer_ptr = nullptr; RETURN_NOT_OK((ConvertPhysicalType( data_ptr, length, &data_writer_ptr))); PARQUET_CATCH_NOT_OK( diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index cfd80d80b79..45d0fd59868 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -23,6 +23,8 @@ #include "parquet/api/schema.h" #include "parquet/api/writer.h" +#include "arrow/util/visibility.h" + namespace arrow { class Array; @@ -40,7 +42,7 @@ namespace parquet { * Start a new RowGroup/Chunk with NewRowGroup * Write column-by-column the whole column chunk */ -class FileWriter { +class ARROW_EXPORT FileWriter { public: FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); @@ -62,7 +64,7 @@ class FileWriter { * * The table shall only consist of nullable, non-repeated columns of primitive type. 
*/ -Status WriteFlatTable(const Table* table, MemoryPool* pool, +Status ARROW_EXPORT WriteFlatTable(const Table* table, MemoryPool* pool, const std::shared_ptr<::parquet::OutputStream>& sink, int64_t chunk_size, const std::shared_ptr<::parquet::WriterProperties>& properties = ::parquet::default_writer_properties()); diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h index a8b0d8444ac..4301968e015 100644 --- a/cpp/src/arrow/schema.h +++ b/cpp/src/arrow/schema.h @@ -22,11 +22,13 @@ #include #include +#include "arrow/util/visibility.h" + namespace arrow { struct Field; -class Schema { +class ARROW_EXPORT Schema { public: explicit Schema(const std::vector>& fields); diff --git a/cpp/src/arrow/symbols.map b/cpp/src/arrow/symbols.map new file mode 100644 index 00000000000..2ca8d730610 --- /dev/null +++ b/cpp/src/arrow/symbols.map @@ -0,0 +1,15 @@ +{ + # Symbols marked as 'local' are not exported by the DSO and thus may not + # be used by client applications. + local: + # devtoolset / static-libstdc++ symbols + __cxa_*; + + extern "C++" { + # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically + # links c++11 symbols into binaries so that the result may be executed on + # a system with an older libstdc++ which doesn't include the necessary + # c++11 symbols. + std::*; + }; +}; diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 756b2a19593..2088fdf0b64 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -23,6 +23,8 @@ #include #include +#include "arrow/util/visibility.h" + namespace arrow { class Array; @@ -33,7 +35,7 @@ class Status; // A row batch is a simpler and more rigid table data structure intended for // use primarily in shared memory IPC. It contains a schema (metadata) and a // corresponding vector of equal-length Arrow arrays -class RowBatch { +class ARROW_EXPORT RowBatch { public: // num_rows is a parameter to allow for row batches of a particular size not // having any materialized columns. 
Each array should have the same length as @@ -63,7 +65,7 @@ class RowBatch { }; // Immutable container of fixed-length columns conforming to a particular schema -class Table { +class ARROW_EXPORT Table { public: // If columns is zero-length, the table's number of rows is zero Table(const std::string& name, const std::shared_ptr& schema, diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8fb41211ba9..4cb37fd1dea 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -24,6 +24,7 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -101,7 +102,7 @@ struct Type { struct Field; -struct DataType { +struct ARROW_EXPORT DataType { Type::type type; std::vector> children_; @@ -133,7 +134,7 @@ typedef std::shared_ptr TypePtr; // A field is a piece of metadata that includes (for now) a name and a data // type -struct Field { +struct ARROW_EXPORT Field { // Field name std::string name; @@ -163,7 +164,7 @@ struct Field { typedef std::shared_ptr FieldPtr; template -struct PrimitiveType : public DataType { +struct ARROW_EXPORT PrimitiveType : public DataType { PrimitiveType() : DataType(Derived::type_enum) {} std::string ToString() const override; @@ -185,55 +186,55 @@ inline std::string PrimitiveType::ToString() const { \ static const char* name() { return NAME; } -struct NullType : public PrimitiveType { +struct ARROW_EXPORT NullType : public PrimitiveType { PRIMITIVE_DECL(NullType, void, NA, 0, "null"); }; -struct BooleanType : public PrimitiveType { +struct ARROW_EXPORT BooleanType : public PrimitiveType { PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); }; -struct UInt8Type : public PrimitiveType { +struct ARROW_EXPORT UInt8Type : public PrimitiveType { PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8"); }; -struct Int8Type : public PrimitiveType { +struct ARROW_EXPORT Int8Type : public PrimitiveType { PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8"); }; -struct UInt16Type : public PrimitiveType { +struct ARROW_EXPORT UInt16Type : public PrimitiveType { PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16"); }; -struct Int16Type : public PrimitiveType { +struct ARROW_EXPORT Int16Type : public PrimitiveType { PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16"); }; -struct UInt32Type : public PrimitiveType { +struct ARROW_EXPORT UInt32Type : public PrimitiveType { PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32"); }; -struct Int32Type : public PrimitiveType { +struct ARROW_EXPORT Int32Type : public PrimitiveType { PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32"); }; -struct UInt64Type : public PrimitiveType { +struct ARROW_EXPORT UInt64Type : public PrimitiveType { PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64"); }; -struct Int64Type : public PrimitiveType { +struct ARROW_EXPORT Int64Type : public PrimitiveType { PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64"); }; -struct FloatType : public PrimitiveType { +struct ARROW_EXPORT FloatType : public PrimitiveType { PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float"); }; -struct DoubleType : public PrimitiveType { +struct ARROW_EXPORT DoubleType : public PrimitiveType { PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); }; -struct ListType : public DataType { +struct ARROW_EXPORT ListType : public DataType { // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) : ListType(value_type, Type::LIST) {} @@ -260,7 +261,7 @@ struct ListType : public DataType { }; // BinaryType type is reprsents lists 
of 1-byte values. -struct BinaryType : public ListType { +struct ARROW_EXPORT BinaryType : public ListType { BinaryType() : BinaryType(Type::BINARY) {} static char const* name() { return "binary"; } std::string ToString() const override; @@ -272,7 +273,7 @@ struct BinaryType : public ListType { }; // UTF encoded strings -struct StringType : public BinaryType { +struct ARROW_EXPORT StringType : public BinaryType { StringType() : BinaryType(Type::STRING) {} static char const* name() { return "string"; } @@ -283,7 +284,7 @@ struct StringType : public BinaryType { explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; -struct StructType : public DataType { +struct ARROW_EXPORT StructType : public DataType { explicit StructType(const std::vector>& fields) : DataType(Type::STRUCT) { children_ = fields; diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index d0370840ca1..afdadbe0790 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -21,6 +21,9 @@ #include #include #include + +#include "arrow/util/visibility.h" + namespace arrow { class Array; @@ -31,18 +34,18 @@ struct Field; class MemoryPool; class Status; -Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, +Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, std::shared_ptr* out); // Create new arrays for logical types that are backed by primitive arrays. -Status MakePrimitiveArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& data, int32_t null_count, +Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr& type, + int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); // Create new list arrays for logical types that are backed by ListArrays (e.g. list of // primitives and strings) // TODO(emkornfield) split up string vs list? -Status MakeListArray(const std::shared_ptr& type, int32_t length, +Status ARROW_EXPORT MakeListArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& offests, const std::shared_ptr& values, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 598df3ef70d..6c497c597d9 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -21,10 +21,11 @@ #include #include "arrow/type.h" +#include "arrow/util/visibility.h" namespace arrow { -struct DecimalType : public DataType { +struct ARROW_EXPORT DecimalType : public DataType { explicit DecimalType(int precision_, int scale_) : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} int precision; diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 2f6f85d66ca..f3894510d09 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -31,12 +31,13 @@ #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace arrow { class MemoryPool; -class ListArray : public Array { +class ARROW_EXPORT ListArray : public Array { public: ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, @@ -96,7 +97,7 @@ class ListArray : public Array { // represent multiple different logical types. 
If no logical type is provided // at construction time, the class defaults to List where t is taken from the // value_builder/values that the object is constructed with. -class ListBuilder : public ArrayBuilder { +class ARROW_EXPORT ListBuilder : public ArrayBuilder { public: // Use this constructor to incrementally build the value array along with offsets and // null bitmap. @@ -116,6 +117,8 @@ class ListBuilder : public ArrayBuilder { offset_builder_(pool), values_(values) {} + virtual ~ListBuilder() {} + Status Init(int32_t elements) override { DCHECK_LT(elements, std::numeric_limits::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index f1ec417d510..18f954adc08 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -29,6 +29,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -36,7 +37,7 @@ class MemoryPool; // Base class for fixed-size logical types. See MakePrimitiveArray // (types/construct.h) for constructing a specific subclass. -class PrimitiveArray : public Array { +class ARROW_EXPORT PrimitiveArray : public Array { public: virtual ~PrimitiveArray() {} @@ -53,7 +54,7 @@ class PrimitiveArray : public Array { }; #define NUMERIC_ARRAY_DECL(NAME, TypeClass, T) \ - class NAME : public PrimitiveArray { \ + class ARROW_EXPORT NAME : public PrimitiveArray { \ public: \ using value_type = T; \ \ @@ -102,7 +103,7 @@ NUMERIC_ARRAY_DECL(FloatArray, FloatType, float); NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double); template -class PrimitiveBuilder : public ArrayBuilder { +class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: typedef typename Type::c_type value_type; @@ -149,7 +150,7 @@ class PrimitiveBuilder : public ArrayBuilder { }; template -class NumericBuilder : public PrimitiveBuilder { +class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { public: using typename PrimitiveBuilder::value_type; using PrimitiveBuilder::PrimitiveBuilder; @@ -262,7 +263,7 @@ typedef NumericBuilder Int64Builder; typedef NumericBuilder FloatBuilder; typedef NumericBuilder DoubleBuilder; -class BooleanArray : public PrimitiveArray { +class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -288,7 +289,7 @@ struct type_traits { } }; -class BooleanBuilder : public PrimitiveBuilder { +class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { public: explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) : PrimitiveBuilder(pool, type) {} diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index a141fc11321..6807b00e8ca 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -115,7 +115,7 @@ TEST_F(TestStringContainer, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { ASSERT_EQ(pos, strings_->value_offset(i)); - ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); + ASSERT_EQ(static_cast(expected_[i].size()), strings_->value_length(i)); pos += expected_[i].size(); } } @@ -189,7 +189,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { ASSERT_FALSE(result_->IsNull(i)); result_->GetValue(i, &length); ASSERT_EQ(pos, result_->offset(i)); - ASSERT_EQ(strings[i % N].size(), length); + ASSERT_EQ(static_cast(strings[i % 
N].size()), length); ASSERT_EQ(strings[i % N], result_->GetString(i)); pos += length; @@ -267,7 +267,7 @@ TEST_F(TestBinaryContainer, TestListFunctions) { int pos = 0; for (size_t i = 0; i < expected_.size(); ++i) { ASSERT_EQ(pos, strings_->value_offset(i)); - ASSERT_EQ(expected_[i].size(), strings_->value_length(i)); + ASSERT_EQ(static_cast(expected_[i].size()), strings_->value_length(i)); pos += expected_[i].size(); } } @@ -339,7 +339,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { } else { ASSERT_FALSE(result_->IsNull(i)); const uint8_t* vals = result_->GetValue(i, &length); - ASSERT_EQ(strings[i % N].size(), length); + ASSERT_EQ(static_cast(strings[i % N].size()), length); ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length)); } } diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index da02c7d1d8a..2f0037024c7 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -61,6 +61,15 @@ Status StringArray::Validate() const { return BinaryArray::Validate(); } -TypePtr BinaryBuilder::value_type_ = TypePtr(new UInt8Type()); +// This used to be a static member variable of BinaryBuilder, but it can cause +// valgrind to report a (spurious?) memory leak when needed in other shared +// libraries. The problem came up while adding explicit visibility to libarrow +// and libarrow_parquet +static TypePtr kBinaryValueType = TypePtr(new UInt8Type()); + +BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) + : ListBuilder(pool, std::make_shared(pool, kBinaryValueType), type) { + byte_builder_ = static_cast(value_builder_.get()); +} } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index b3c00d298b3..bab0c58f617 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -28,13 +28,14 @@ #include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace arrow { class Buffer; class MemoryPool; -class BinaryArray : public ListArray { +class ARROW_EXPORT BinaryArray : public ListArray { public: BinaryArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, @@ -62,7 +63,7 @@ class BinaryArray : public ListArray { const uint8_t* raw_bytes_; }; -class StringArray : public BinaryArray { +class ARROW_EXPORT StringArray : public BinaryArray { public: StringArray(int32_t length, const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count = 0, @@ -87,12 +88,10 @@ class StringArray : public BinaryArray { }; // BinaryBuilder : public ListBuilder -class BinaryBuilder : public ListBuilder { +class ARROW_EXPORT BinaryBuilder : public ListBuilder { public: - explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type) - : ListBuilder(pool, std::make_shared(pool, value_type_), type) { - byte_builder_ = static_cast(value_builder_.get()); - } + explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type); + virtual ~BinaryBuilder() {} Status Append(const uint8_t* value, int32_t length) { RETURN_NOT_OK(ListBuilder::Append()); @@ -105,11 +104,10 @@ class BinaryBuilder : public ListBuilder { protected: UInt8Builder* byte_builder_; - static TypePtr value_type_; }; // String builder -class StringBuilder : public BinaryBuilder { +class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : BinaryBuilder(pool, type) {} diff --git a/cpp/src/arrow/types/struct-test.cc 
b/cpp/src/arrow/types/struct-test.cc index d2bd2971d04..ccf5a52dc83 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -116,7 +116,7 @@ class TestStructBuilder : public TestBuilder { ASSERT_OK(MakeBuilder(pool_, type_, &tmp)); builder_ = std::dynamic_pointer_cast(tmp); - ASSERT_EQ(2, builder_->field_builders().size()); + ASSERT_EQ(2, static_cast(builder_->field_builders().size())); } void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } @@ -132,7 +132,7 @@ class TestStructBuilder : public TestBuilder { TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_OK(builder_->AppendNull()); ASSERT_OK(builder_->AppendNull()); - ASSERT_EQ(2, builder_->field_builders().size()); + ASSERT_EQ(2, static_cast(builder_->field_builders().size())); ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); ASSERT_OK(list_vb->AppendNull()); @@ -148,7 +148,7 @@ TEST_F(TestStructBuilder, TestAppendNull) { ASSERT_OK(result_->Validate()); - ASSERT_EQ(2, result_->fields().size()); + ASSERT_EQ(2, static_cast(result_->fields().size())); ASSERT_EQ(2, result_->length()); ASSERT_EQ(2, result_->field(0)->length()); ASSERT_EQ(2, result_->field(1)->length()); @@ -174,7 +174,7 @@ TEST_F(TestStructBuilder, TestBasics) { ListBuilder* list_vb = static_cast(builder_->field_builder(0).get()); Int8Builder* char_vb = static_cast(list_vb->value_builder().get()); Int32Builder* int_vb = static_cast(builder_->field_builder(1).get()); - ASSERT_EQ(2, builder_->field_builders().size()); + ASSERT_EQ(2, static_cast(builder_->field_builders().size())); EXPECT_OK(builder_->Resize(list_lengths.size())); EXPECT_OK(char_vb->Resize(list_values.size())); diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 78afd29eb8d..63955eb31bb 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -25,10 +25,11 @@ #include "arrow/type.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" +#include "arrow/util/visibility.h" namespace arrow { -class StructArray : public Array { +class ARROW_EXPORT StructArray : public Array { public: StructArray(const TypePtr& type, int32_t length, std::vector& field_arrays, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) @@ -64,7 +65,7 @@ class StructArray : public Array { // Append, Resize and Reserve methods are acting on StructBuilder. // Please make sure all these methods of all child-builders' are consistently // called to maintain data-structure consistency. -class StructBuilder : public ArrayBuilder { +class ARROW_EXPORT StructBuilder : public ArrayBuilder { public: StructBuilder(MemoryPool* pool, const std::shared_ptr& type, const std::vector>& field_builders) diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index d2a4b091fad..4e941fb5f5c 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -27,6 +27,7 @@ install(FILES macros.h memory-pool.h status.h + visibility.h DESTINATION include/arrow/util) ####################################### diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index f845d67761f..1aeebc69b4e 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -26,6 +26,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -41,7 +42,7 @@ class Status; // Capacity is the number of bytes that where allocated for the buffer in // total. 
// The following invariant is always true: Size < Capacity -class Buffer : public std::enable_shared_from_this { +class ARROW_EXPORT Buffer : public std::enable_shared_from_this { public: Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size), capacity_(size) {} virtual ~Buffer(); @@ -95,7 +96,7 @@ class Buffer : public std::enable_shared_from_this { }; // A Buffer whose contents can be mutated. May or may not own its data. -class MutableBuffer : public Buffer { +class ARROW_EXPORT MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { mutable_data_ = data; @@ -112,7 +113,7 @@ class MutableBuffer : public Buffer { uint8_t* mutable_data_; }; -class ResizableBuffer : public MutableBuffer { +class ARROW_EXPORT ResizableBuffer : public MutableBuffer { public: // Change buffer reported size to indicated size, allocating memory if // necessary. This will ensure that the capacity of the buffer is a multiple @@ -129,7 +130,7 @@ class ResizableBuffer : public MutableBuffer { }; // A Buffer whose lifetime is tied to a particular MemoryPool -class PoolBuffer : public ResizableBuffer { +class ARROW_EXPORT PoolBuffer : public ResizableBuffer { public: explicit PoolBuffer(MemoryPool* pool = nullptr); virtual ~PoolBuffer(); @@ -145,7 +146,8 @@ static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; class BufferBuilder { public: - explicit BufferBuilder(MemoryPool* pool) : pool_(pool), capacity_(0), size_(0) {} + explicit BufferBuilder(MemoryPool* pool) + : pool_(pool), data_(nullptr), capacity_(0), size_(0) {} // Resizes the buffer to the nearest multiple of 64 bytes per Layout.md Status Resize(int32_t elements) { diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 4ab9736c2b4..8e7dfd60baa 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -31,7 +31,7 @@ TEST(DefaultMemoryPool, MemoryTracking) { uint8_t* data; ASSERT_OK(pool->Allocate(100, &data)); - EXPECT_EQ(0, reinterpret_cast(data) % 64); + EXPECT_EQ(static_cast(0), reinterpret_cast(data) % 64); ASSERT_EQ(100, pool->bytes_allocated()); pool->Free(data, 100); diff --git a/cpp/src/arrow/util/memory-pool.h b/cpp/src/arrow/util/memory-pool.h index 824c7248e2e..4c1d699addd 100644 --- a/cpp/src/arrow/util/memory-pool.h +++ b/cpp/src/arrow/util/memory-pool.h @@ -20,11 +20,13 @@ #include +#include "arrow/util/visibility.h" + namespace arrow { class Status; -class MemoryPool { +class ARROW_EXPORT MemoryPool { public: virtual ~MemoryPool(); @@ -34,7 +36,7 @@ class MemoryPool { virtual int64_t bytes_allocated() const = 0; }; -MemoryPool* default_memory_pool(); +ARROW_EXPORT MemoryPool* default_memory_pool(); } // namespace arrow diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc index d194ed5572f..8dd07d0d064 100644 --- a/cpp/src/arrow/util/status.cc +++ b/cpp/src/arrow/util/status.cc @@ -58,6 +58,9 @@ std::string Status::CodeAsString() const { case StatusCode::NotImplemented: type = "NotImplemented"; break; + default: + type = "Unknown"; + break; } return std::string(type); } diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index d1a74250008..6ba2035bcd3 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -19,6 +19,8 @@ #include #include +#include "arrow/util/visibility.h" + // Return the given status if it is not OK. 
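// A hedged usage sketch (`ReadAll` is a hypothetical helper, not part of this
// patch): on failure the macro returns the offending Status to the caller.
//
//   Status ReadAll(io::ReadableFile* file, int64_t nbytes, uint8_t* out) {
//     int64_t bytes_read = 0;
//     ARROW_RETURN_NOT_OK(file->Read(nbytes, &bytes_read, out));
//     return Status::OK();
//   }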
#define ARROW_RETURN_NOT_OK(s) \ do { \ @@ -82,7 +84,7 @@ enum class StatusCode : char { NotImplemented = 10, }; -class Status { +class ARROW_EXPORT Status { public: // Create a success status. Status() : state_(NULL) {} diff --git a/cpp/src/arrow/util/visibility.h b/cpp/src/arrow/util/visibility.h new file mode 100644 index 00000000000..b197c198297 --- /dev/null +++ b/cpp/src/arrow/util/visibility.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_VISIBILITY_H +#define ARROW_UTIL_VISIBILITY_H + +#if defined(_WIN32) || defined(__CYGWIN__) +#define ARROW_EXPORT __declspec(dllexport) +#else // Not Windows +#ifndef ARROW_EXPORT +#define ARROW_EXPORT __attribute__((visibility("default"))) +#endif +#ifndef ARROW_NO_EXPORT +#define ARROW_NO_EXPORT __attribute__((visibility("hidden"))) +#endif +#endif // Non-Windows + +#endif // ARROW_UTIL_VISIBILITY_H diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh index a164c1af518..f32710073c7 100644 --- a/python/conda.recipe/build.sh +++ b/python/conda.recipe/build.sh @@ -19,13 +19,14 @@ if [ "$(uname)" == "Darwin" ]; then export MACOSX_DEPLOYMENT_TARGET=10.7 fi -echo Setting the compiler... -if [ `uname` == Linux ]; then - EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ -elif [ `uname` == Darwin ]; then - EXTRA_CMAKE_ARGS= -fi +# echo Setting the compiler... +# if [ `uname` == Linux ]; then +# EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ +# elif [ `uname` == Darwin ]; then +# EXTRA_CMAKE_ARGS= +# fi cd .. 
-$PYTHON setup.py build_ext --extra-cmake-args=$EXTRA_CMAKE_ARGS || exit 1 +# $PYTHON setup.py build_ext --extra-cmake-args=$EXTRA_CMAKE_ARGS || exit 1 +$PYTHON setup.py build_ext || exit 1 $PYTHON setup.py install || exit 1 diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 88869c20480..4e997e31dd6 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -28,6 +28,7 @@ #include #include "pyarrow/common.h" +#include "pyarrow/visibility.h" namespace arrow { class Array; } @@ -35,6 +36,7 @@ namespace pyarrow { class Status; +PYARROW_EXPORT Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); } // namespace pyarrow diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index 17922349de6..c3377685bcc 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -25,6 +25,8 @@ #include +#include "pyarrow/visibility.h" + namespace arrow { class Array; @@ -36,12 +38,15 @@ namespace pyarrow { class Status; +PYARROW_EXPORT Status ArrowToPandas(const std::shared_ptr& col, PyObject* py_ref, PyObject** out); +PYARROW_EXPORT Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out); +PYARROW_EXPORT Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr* out); diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 0211e8948f2..fb0ba3e4822 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -22,6 +22,8 @@ #include "arrow/util/buffer.h" +#include "pyarrow/visibility.h" + namespace arrow { class MemoryPool; } namespace pyarrow { @@ -94,9 +96,9 @@ struct PyObjectStringify { return Status::UnknownError(message); \ } -arrow::MemoryPool* GetMemoryPool(); +PYARROW_EXPORT arrow::MemoryPool* GetMemoryPool(); -class NumPyBuffer : public arrow::Buffer { +class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { public: NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) { diff --git a/python/src/pyarrow/config.h b/python/src/pyarrow/config.h index 48ae715d842..82936b1a5f3 100644 --- a/python/src/pyarrow/config.h +++ b/python/src/pyarrow/config.h @@ -21,6 +21,7 @@ #include #include "pyarrow/numpy_interop.h" +#include "pyarrow/visibility.h" #if PY_MAJOR_VERSION >= 3 #define PyString_Check PyUnicode_Check @@ -28,10 +29,13 @@ namespace pyarrow { +PYARROW_EXPORT extern PyObject* numpy_nan; +PYARROW_EXPORT void pyarrow_init(); +PYARROW_EXPORT void pyarrow_set_numpy_nan(PyObject* obj); } // namespace pyarrow diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index ec42bb31d3b..fa9c713b0c2 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -21,6 +21,8 @@ #include #include +#include "pyarrow/visibility.h" + namespace pyarrow { using arrow::DataType; @@ -40,6 +42,7 @@ extern const std::shared_ptr FLOAT; extern const std::shared_ptr DOUBLE; extern const std::shared_ptr STRING; +PYARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); } // namespace pyarrow diff --git a/python/src/pyarrow/status.h b/python/src/pyarrow/status.h index cb8c8add210..67cd66c58ee 100644 --- a/python/src/pyarrow/status.h +++ b/python/src/pyarrow/status.h @@ -17,6 +17,8 @@ #include #include +#include "pyarrow/visibility.h" + namespace pyarrow { #define PY_RETURN_NOT_OK(s) do { \ @@ -38,7 +40,7 @@ enum class StatusCode: char { UnknownError = 10 }; -class Status { +class PYARROW_EXPORT Status { public: // Create a 
success status. Status() : state_(NULL) { } diff --git a/python/src/pyarrow/visibility.h b/python/src/pyarrow/visibility.h new file mode 100644 index 00000000000..9f0c13b4b20 --- /dev/null +++ b/python/src/pyarrow/visibility.h @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_VISIBILITY_H +#define PYARROW_VISIBILITY_H + +#if defined(_WIN32) || defined(__CYGWIN__) +#define PYARROW_EXPORT __declspec(dllexport) +#else // Not Windows +#ifndef PYARROW_EXPORT +#define PYARROW_EXPORT __attribute__((visibility("default"))) +#endif +#ifndef PYARROW_NO_EXPORT +#define PYARROW_NO_EXPORT __attribute__((visibility("hidden"))) +#endif +#endif // Non-Windows + +#endif // PYARROW_VISIBILITY_H From ff6132f8a1c2a98cf7c94ae327342c8b38aecb18 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 11 Jul 2016 22:58:57 -0700 Subject: [PATCH 096/210] ARROW-237: Implement parquet-cpp's abstract IO interfaces for memory allocation and file reading Part of ARROW-227 and ARROW-236 Author: Wes McKinney Closes #101 from wesm/ARROW-237 and squashes the following commits: 00c8211 [Wes McKinney] Draft implementations of parquet-cpp allocator and read-only file interfaces --- cpp/src/arrow/io/hdfs-io-test.cc | 2 +- cpp/src/arrow/io/hdfs.cc | 16 +- cpp/src/arrow/io/hdfs.h | 8 +- cpp/src/arrow/io/interfaces.h | 14 +- cpp/src/arrow/parquet/CMakeLists.txt | 5 + cpp/src/arrow/parquet/io.cc | 94 ++++ cpp/src/arrow/parquet/io.h | 80 +++ cpp/src/arrow/parquet/parquet-io-test.cc | 511 ++++-------------- .../parquet/parquet-reader-writer-test.cc | 489 +++++++++++++++++ cpp/src/arrow/parquet/utils.h | 15 +- python/pyarrow/includes/libarrow_io.pxd | 12 +- python/pyarrow/io.pyx | 8 +- 12 files changed, 810 insertions(+), 444 deletions(-) create mode 100644 cpp/src/arrow/parquet/io.cc create mode 100644 cpp/src/arrow/parquet/io.h create mode 100644 cpp/src/arrow/parquet/parquet-reader-writer-test.cc diff --git a/cpp/src/arrow/io/hdfs-io-test.cc b/cpp/src/arrow/io/hdfs-io-test.cc index d1bf140ae68..e48a28142fa 100644 --- a/cpp/src/arrow/io/hdfs-io-test.cc +++ b/cpp/src/arrow/io/hdfs-io-test.cc @@ -266,7 +266,7 @@ TEST_F(TestHdfsClient, ReadableMethods) { ASSERT_EQ(size, file_size); uint8_t buffer[50]; - int32_t bytes_read = 0; + int64_t bytes_read = 0; ASSERT_OK(file->Read(50, &bytes_read, buffer)); ASSERT_EQ(0, std::memcmp(buffer, data.data(), 50)); diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6da6ea4e71b..800c3edf4f3 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -100,7 +100,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status ReadAt(int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + Status ReadAt(int64_t position, int64_t 
nbytes, int64_t* bytes_read, uint8_t* buffer) { tSize ret = hdfsPread(fs_, file_, static_cast(position), reinterpret_cast(buffer), nbytes); RETURN_NOT_OK(CheckReadResult(ret)); @@ -108,7 +108,7 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { tSize ret = hdfsRead(fs_, file_, reinterpret_cast(buffer), nbytes); RETURN_NOT_OK(CheckReadResult(ret)); *bytes_read = ret; @@ -138,11 +138,11 @@ Status HdfsReadableFile::Close() { } Status HdfsReadableFile::ReadAt( - int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { return impl_->ReadAt(position, nbytes, bytes_read, buffer); } -Status HdfsReadableFile::Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) { +Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { return impl_->Read(nbytes, bytes_read, buffer); } @@ -177,7 +177,7 @@ class HdfsWriteableFile::HdfsWriteableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } - Status Write(const uint8_t* buffer, int32_t nbytes, int32_t* bytes_written) { + Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written) { tSize ret = hdfsWrite(fs_, file_, reinterpret_cast(buffer), nbytes); CHECK_FAILURE(ret, "Write"); *bytes_written = ret; @@ -198,12 +198,12 @@ Status HdfsWriteableFile::Close() { } Status HdfsWriteableFile::Write( - const uint8_t* buffer, int32_t nbytes, int32_t* bytes_read) { + const uint8_t* buffer, int64_t nbytes, int64_t* bytes_read) { return impl_->Write(buffer, nbytes, bytes_read); } -Status HdfsWriteableFile::Write(const uint8_t* buffer, int32_t nbytes) { - int32_t bytes_written_dummy = 0; +Status HdfsWriteableFile::Write(const uint8_t* buffer, int64_t nbytes) { + int64_t bytes_written_dummy = 0; return Write(buffer, nbytes, &bytes_written_dummy); } diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 532e3c536a1..b6449fcb88a 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -164,14 +164,14 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { Status GetSize(int64_t* size) override; Status ReadAt( - int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override; + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; Status Seek(int64_t position) override; Status Tell(int64_t* position) override; // NOTE: If you wish to read a particular range of a file in a multithreaded // context, you may prefer to use ReadAt to avoid locking issues - Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; private: class ARROW_NO_EXPORT HdfsReadableFileImpl; @@ -189,9 +189,9 @@ class ARROW_EXPORT HdfsWriteableFile : public WriteableFile { Status Close() override; - Status Write(const uint8_t* buffer, int32_t nbytes) override; + Status Write(const uint8_t* buffer, int64_t nbytes) override; - Status Write(const uint8_t* buffer, int32_t nbytes, int32_t* bytes_written); + Status Write(const uint8_t* buffer, int64_t nbytes, int64_t* bytes_written); Status Tell(int64_t* position) override; diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index 4bd8a8ffc2f..25361d5633d 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ 
-15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_IO_INTERFACES -#define ARROW_IO_INTERFACES +#ifndef ARROW_IO_INTERFACES_H +#define ARROW_IO_INTERFACES_H #include @@ -40,17 +40,17 @@ class FileSystemClient { }; class FileBase { + public: virtual Status Close() = 0; - virtual Status Tell(int64_t* position) = 0; }; class ReadableFile : public FileBase { public: virtual Status ReadAt( - int64_t position, int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) = 0; + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) = 0; - virtual Status Read(int32_t nbytes, int32_t* bytes_read, uint8_t* buffer) = 0; + virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) = 0; virtual Status GetSize(int64_t* size) = 0; }; @@ -62,10 +62,10 @@ class RandomAccessFile : public ReadableFile { class WriteableFile : public FileBase { public: - virtual Status Write(const uint8_t* buffer, int32_t nbytes) = 0; + virtual Status Write(const uint8_t* buffer, int64_t nbytes) = 0; }; } // namespace io } // namespace arrow -#endif // ARROW_IO_INTERFACES +#endif // ARROW_IO_INTERFACES_H diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 00f19b354e3..f2a90b71a49 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,6 +19,7 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + io.cc reader.cc schema.cc writer.cc @@ -48,8 +49,12 @@ ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) ADD_ARROW_TEST(parquet-io-test) ARROW_TEST_LINK_LIBRARIES(parquet-io-test arrow_parquet) +ADD_ARROW_TEST(parquet-reader-writer-test) +ARROW_TEST_LINK_LIBRARIES(parquet-reader-writer-test arrow_parquet) + # Headers: top level install(FILES + io.h reader.h schema.h utils.h diff --git a/cpp/src/arrow/parquet/io.cc b/cpp/src/arrow/parquet/io.cc new file mode 100644 index 00000000000..c81aa8c4da9 --- /dev/null +++ b/cpp/src/arrow/parquet/io.cc @@ -0,0 +1,94 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
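+// A hedged usage sketch of the allocator implemented in this file (only
+// OwnedMutableBuffer comes from parquet-cpp; the sizes are arbitrary):
+//
+//   arrow::parquet::ParquetAllocator allocator;  // wraps default_memory_pool()
+//   auto buffer = std::make_shared<::parquet::OwnedMutableBuffer>(0, &allocator);
+//   buffer->Resize(4096);  // these bytes are now tracked by the Arrow pool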
+ +#include "arrow/parquet/io.h" + +#include +#include + +#include "parquet/api/io.h" + +#include "arrow/parquet/utils.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +// To assist with readability +using ArrowROFile = arrow::io::RandomAccessFile; + +namespace arrow { +namespace parquet { + +// ---------------------------------------------------------------------- +// ParquetAllocator + +ParquetAllocator::ParquetAllocator() : pool_(default_memory_pool()) {} + +ParquetAllocator::ParquetAllocator(MemoryPool* pool) : pool_(pool) {} + +ParquetAllocator::~ParquetAllocator() {} + +uint8_t* ParquetAllocator::Malloc(int64_t size) { + uint8_t* result; + PARQUET_THROW_NOT_OK(pool_->Allocate(size, &result)); + return result; +} + +void ParquetAllocator::Free(uint8_t* buffer, int64_t size) { + // Does not report Status + pool_->Free(buffer, size); +} + +// ---------------------------------------------------------------------- +// ParquetReadSource + +ParquetReadSource::ParquetReadSource( + const std::shared_ptr& file, ParquetAllocator* allocator) + : file_(file), allocator_(allocator) {} + +void ParquetReadSource::Close() { + PARQUET_THROW_NOT_OK(file_->Close()); +} + +int64_t ParquetReadSource::Tell() const { + int64_t position; + PARQUET_THROW_NOT_OK(file_->Tell(&position)); + return position; +} + +void ParquetReadSource::Seek(int64_t position) { + PARQUET_THROW_NOT_OK(file_->Seek(position)); +} + +int64_t ParquetReadSource::Read(int64_t nbytes, uint8_t* out) { + int64_t bytes_read; + PARQUET_THROW_NOT_OK(file_->Read(nbytes, &bytes_read, out)); + return bytes_read; +} + +std::shared_ptr<::parquet::Buffer> ParquetReadSource::Read(int64_t nbytes) { + // TODO(wesm): This code is duplicated from parquet/util/input.cc; suggests + // that there should be more code sharing amongst file-like sources + auto result = std::make_shared<::parquet::OwnedMutableBuffer>(0, allocator_); + result->Resize(nbytes); + + int64_t bytes_read = Read(nbytes, result->mutable_data()); + if (bytes_read < nbytes) { result->Resize(bytes_read); } + return result; +} + +} // namespace parquet +} // namespace arrow diff --git a/cpp/src/arrow/parquet/io.h b/cpp/src/arrow/parquet/io.h new file mode 100644 index 00000000000..ef8871da4df --- /dev/null +++ b/cpp/src/arrow/parquet/io.h @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
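+// A hedged usage sketch of the read-source bridge declared in this file
+// (`file` is a placeholder for any opened arrow::io::RandomAccessFile, such
+// as an HdfsReadableFile):
+//
+//   ParquetAllocator allocator;
+//   std::unique_ptr<::parquet::RandomAccessSource> source(
+//       new ParquetReadSource(file, &allocator));
+//   auto pq_reader = ::parquet::ParquetFileReader::Open(std::move(source));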
+ +// Bridges Arrow's IO interfaces and Parquet-cpp's IO interfaces + +#ifndef ARROW_PARQUET_IO_H +#define ARROW_PARQUET_IO_H + +#include +#include + +#include "parquet/api/io.h" + +#include "arrow/io/interfaces.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace parquet { + +// An implementation of the Parquet MemoryAllocator API that plugs into an +// existing Arrow memory pool. This way we can direct all allocations to a +// single place rather than tracking allocations in different locations (for +// example: without utilizing parquet-cpp's default allocator) +class ARROW_EXPORT ParquetAllocator : public ::parquet::MemoryAllocator { + public: + // Uses the default memory pool + ParquetAllocator(); + + explicit ParquetAllocator(MemoryPool* pool); + virtual ~ParquetAllocator(); + + uint8_t* Malloc(int64_t size) override; + void Free(uint8_t* buffer, int64_t size) override; + + MemoryPool* pool() { return pool_; } + + private: + MemoryPool* pool_; +}; + +class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { + public: + ParquetReadSource( + const std::shared_ptr& file, ParquetAllocator* allocator); + + void Close() override; + int64_t Tell() const override; + void Seek(int64_t pos) override; + int64_t Read(int64_t nbytes, uint8_t* out) override; + std::shared_ptr<::parquet::Buffer> Read(int64_t nbytes) override; + + private: + // An Arrow readable file of some kind + std::shared_ptr file_; + + // The allocator is required for creating managed buffers + ParquetAllocator* allocator_; +}; + +} // namespace parquet +} // namespace arrow + +#endif // ARROW_PARQUET_IO_H diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index bfc27d26d63..7e724b31e38 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -15,475 +15,164 @@ // specific language governing permissions and limitations // under the License. 
+#include +#include +#include +#include + #include "gtest/gtest.h" -#include "arrow/test-util.h" -#include "arrow/parquet/test-util.h" -#include "arrow/parquet/reader.h" -#include "arrow/parquet/writer.h" -#include "arrow/types/construct.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" +#include "arrow/parquet/io.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" -#include "parquet/api/reader.h" -#include "parquet/api/writer.h" - -using ParquetBuffer = parquet::Buffer; -using parquet::BufferReader; -using parquet::default_writer_properties; -using parquet::InMemoryOutputStream; -using parquet::LogicalType; -using parquet::ParquetFileReader; -using parquet::ParquetFileWriter; -using parquet::RandomAccessSource; -using parquet::Repetition; -using parquet::SchemaDescriptor; -using parquet::ParquetVersion; -using ParquetType = parquet::Type; -using parquet::schema::GroupNode; -using parquet::schema::NodePtr; -using parquet::schema::PrimitiveNode; +#include "parquet/api/io.h" namespace arrow { - namespace parquet { -const int SMALL_SIZE = 100; -const int LARGE_SIZE = 10000; - -template -struct test_traits {}; +// Allocator tests -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::BOOLEAN; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static uint8_t const value; -}; - -const uint8_t test_traits::value(1); +TEST(TestParquetAllocator, DefaultCtor) { + ParquetAllocator allocator; -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_8; - static uint8_t const value; -}; + const int buffer_size = 10; -const uint8_t test_traits::value(64); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::INT_8; - static int8_t const value; -}; - -const int8_t test_traits::value(-64); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_16; - static uint16_t const value; -}; + uint8_t* buffer = nullptr; + ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size);); -const uint16_t test_traits::value(1024); + // valgrind will complain if we write into nullptr + memset(buffer, 0, buffer_size); -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::INT_16; - static int16_t const value; -}; - -const int16_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_32; - static uint32_t const value; -}; - -const uint32_t test_traits::value(1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static int32_t const value; -}; - -const int32_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT64; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_64; - static uint64_t const value; -}; - -const uint64_t test_traits::value(1024); - -template <> -struct 
test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT64; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static int64_t const value; -}; - -const int64_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static float const value; -}; - -const float test_traits::value(2.1f); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static double const value; -}; - -const double test_traits::value(4.2); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY; - static constexpr LogicalType::type logical_enum = LogicalType::UTF8; - static std::string const value; -}; - -const std::string test_traits::value("Test"); - -template -using ParquetDataType = ::parquet::DataType::parquet_enum>; - -template -using ParquetWriter = ::parquet::TypedColumnWriter>; + allocator.Free(buffer, buffer_size); +} -template -class TestParquetIO : public ::testing::Test { +// Pass through to the default memory pool +class TrackingPool : public MemoryPool { public: - virtual void SetUp() {} - - std::shared_ptr MakeSchema(Repetition::type repetition) { - auto pnode = PrimitiveNode::Make("column1", repetition, - test_traits::parquet_enum, test_traits::logical_enum); - NodePtr node_ = - GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); - return std::static_pointer_cast(node_); - } - - std::unique_ptr MakeWriter( - const std::shared_ptr& schema) { - sink_ = std::make_shared(); - return ParquetFileWriter::Open(sink_, schema); - } - - std::unique_ptr ReaderFromSink() { - std::shared_ptr buffer = sink_->GetBuffer(); - std::unique_ptr source(new BufferReader(buffer)); - return ParquetFileReader::Open(std::move(source)); - } - - void ReadSingleColumnFile( - std::unique_ptr file_reader, std::shared_ptr* out) { - arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); - std::unique_ptr column_reader; - ASSERT_OK_NO_THROW(reader.GetFlatColumn(0, &column_reader)); - ASSERT_NE(nullptr, column_reader.get()); - - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); - ASSERT_NE(nullptr, out->get()); - } + TrackingPool() : pool_(default_memory_pool()), bytes_allocated_(0) {} - void ReadAndCheckSingleColumnFile(Array* values) { - std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); + Status Allocate(int64_t size, uint8_t** out) override { + RETURN_NOT_OK(pool_->Allocate(size, out)); + bytes_allocated_ += size; + return Status::OK(); } - void ReadTableFromFile( - std::unique_ptr file_reader, std::shared_ptr
<Table>* out) {
- arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
- ASSERT_OK_NO_THROW(reader.ReadFlatTable(out));
- ASSERT_NE(nullptr, out->get());
+ void Free(uint8_t* buffer, int64_t size) override {
+ pool_->Free(buffer, size);
+ bytes_allocated_ -= size;
}
- void ReadAndCheckSingleColumnTable(const std::shared_ptr<Array>& values) {
- std::shared_ptr<Table>
out;
- ReadTableFromFile(ReaderFromSink(), &out);
- ASSERT_EQ(1, out->num_columns());
- ASSERT_EQ(values->length(), out->num_rows());
-
- std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
- ASSERT_EQ(1, chunked_array->num_chunks());
- ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
- }
+ int64_t bytes_allocated() const override { return bytes_allocated_; }
- template <typename T>
- void WriteFlatColumn(const std::shared_ptr<GroupNode>& schema,
- const std::shared_ptr<T>& values) {
- FileWriter writer(default_memory_pool(), MakeWriter(schema));
- ASSERT_OK_NO_THROW(writer.NewRowGroup(values->length()));
- ASSERT_OK_NO_THROW(writer.WriteFlatColumnChunk(values.get()));
- ASSERT_OK_NO_THROW(writer.Close());
- }
-
- std::shared_ptr<InMemoryOutputStream> sink_;
+ private:
+ MemoryPool* pool_;
+ int64_t bytes_allocated_;
};
-// We have separate tests for UInt32Type as this is currently the only type
-// where a roundtrip does not yield the identical Array structure.
-// There we write an UInt32 Array but receive an Int64 Array as result for
-// Parquet version 1.0.
+TEST(TestParquetAllocator, CustomPool) {
+ TrackingPool pool;
-typedef ::testing::Types TestTypes;
+ ParquetAllocator allocator(&pool);
-TYPED_TEST_CASE(TestParquetIO, TestTypes);
+ ASSERT_EQ(&pool, allocator.pool());
-TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) {
- auto values = NonNullArray<TypeParam>(SMALL_SIZE);
+ const int buffer_size = 10;
- std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
- this->WriteFlatColumn(schema, values);
+ uint8_t* buffer = nullptr;
+ ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size););
- this->ReadAndCheckSingleColumnFile(values.get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) {
- auto values = NonNullArray<TypeParam>(SMALL_SIZE);
- std::shared_ptr<Table>
table = MakeSimpleTable(values, false);
- this->sink_ = std::make_shared<InMemoryOutputStream>();
- ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_,
- values->length(), default_writer_properties()));
-
- std::shared_ptr<Table>
out;
- this->ReadTableFromFile(this->ReaderFromSink(), &out);
- ASSERT_EQ(1, out->num_columns());
- ASSERT_EQ(100, out->num_rows());
-
- std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
- ASSERT_EQ(1, chunked_array->num_chunks());
- ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
-}
+ ASSERT_EQ(buffer_size, pool.bytes_allocated());
-TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) {
- // This also tests max_definition_level = 1
- auto values = NullableArray<TypeParam>(SMALL_SIZE, 10);
+ // valgrind will complain if we write into nullptr
+ memset(buffer, 0, buffer_size);
- std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::OPTIONAL);
- this->WriteFlatColumn(schema, values);
+ allocator.Free(buffer, buffer_size);
- this->ReadAndCheckSingleColumnFile(values.get());
+ ASSERT_EQ(0, pool.bytes_allocated());
}
-TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) {
- // This also tests max_definition_level = 1
- std::shared_ptr<Array> values = NullableArray<TypeParam>(SMALL_SIZE, 10);
- std::shared_ptr<Table>
table = MakeSimpleTable(values, true); - this->sink_ = std::make_shared(); - ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, - values->length(), default_writer_properties())); +// ---------------------------------------------------------------------- +// Read source tests - this->ReadAndCheckSingleColumnTable(values); -} - -TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) { - auto values = NonNullArray(SMALL_SIZE); - int64_t chunk_size = values->length() / 4; +class BufferReader : public io::RandomAccessFile { + public: + BufferReader(const uint8_t* buffer, int buffer_size) + : buffer_(buffer), buffer_size_(buffer_size), position_(0) {} - std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); - for (int i = 0; i < 4; i++) { - ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size)); - ASSERT_OK_NO_THROW( - writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size)); + Status Close() override { + // no-op + return Status::OK(); } - ASSERT_OK_NO_THROW(writer.Close()); - - this->ReadAndCheckSingleColumnFile(values.get()); -} -TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { - auto values = NonNullArray(LARGE_SIZE); - std::shared_ptr
table = MakeSimpleTable(values, false); - this->sink_ = std::make_shared(); - ASSERT_OK_NO_THROW(WriteFlatTable( - table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties())); - - this->ReadAndCheckSingleColumnTable(values); -} - -TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { - int64_t chunk_size = SMALL_SIZE / 4; - auto values = NullableArray(SMALL_SIZE, 10); - - std::shared_ptr schema = this->MakeSchema(Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); - for (int i = 0; i < 4; i++) { - ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size)); - ASSERT_OK_NO_THROW( - writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size)); + Status Tell(int64_t* position) override { + *position = position_; + return Status::OK(); } - ASSERT_OK_NO_THROW(writer.Close()); - this->ReadAndCheckSingleColumnFile(values.get()); -} - -TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { - // This also tests max_definition_level = 1 - auto values = NullableArray(LARGE_SIZE, 100); - std::shared_ptr
table = MakeSimpleTable(values, true);
- this->sink_ = std::make_shared<InMemoryOutputStream>();
- ASSERT_OK_NO_THROW(WriteFlatTable(
- table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties()));
-
- this->ReadAndCheckSingleColumnTable(values);
-}
-
-using TestUInt32ParquetIO = TestParquetIO<UInt32Type>;
-
-TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compability) {
- // This also tests max_definition_level = 1
- std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
- std::shared_ptr<Table>
table = MakeSimpleTable(values, true);
-
- // Parquet 2.0 roundtrip should yield an uint32_t column again
- this->sink_ = std::make_shared<InMemoryOutputStream>();
- std::shared_ptr<::parquet::WriterProperties> properties =
- ::parquet::WriterProperties::Builder()
- .version(ParquetVersion::PARQUET_2_0)
- ->build();
- ASSERT_OK_NO_THROW(
- WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties));
- this->ReadAndCheckSingleColumnTable(values);
-}
+ Status ReadAt(
+ int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override {
+ RETURN_NOT_OK(Seek(position));
+ return Read(nbytes, bytes_read, buffer);
+ }
-TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
- // This also tests max_definition_level = 1
- std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
- std::shared_ptr<Table>
table = MakeSimpleTable(values, true); - - // Parquet 1.0 returns an int64_t column as there is no way to tell a Parquet 1.0 - // reader that a column is unsigned. - this->sink_ = std::make_shared(); - std::shared_ptr<::parquet::WriterProperties> properties = - ::parquet::WriterProperties::Builder() - .version(ParquetVersion::PARQUET_1_0) - ->build(); - ASSERT_OK_NO_THROW( - WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties)); - - std::shared_ptr expected_values; - std::shared_ptr int64_data = - std::make_shared(default_memory_pool()); - { - ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length())); - int64_t* int64_data_ptr = reinterpret_cast(int64_data->mutable_data()); - const uint32_t* uint32_data_ptr = - reinterpret_cast(values->data()->data()); - // std::copy might be faster but this is explicit on the casts) - for (int64_t i = 0; i < values->length(); i++) { - int64_data_ptr[i] = static_cast(uint32_data_ptr[i]); - } + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override { + memcpy(buffer, buffer_ + position_, nbytes); + *bytes_read = std::min(nbytes, buffer_size_ - position_); + position_ += *bytes_read; + return Status::OK(); } - ASSERT_OK(MakePrimitiveArray(std::make_shared(), values->length(), - int64_data, values->null_count(), values->null_bitmap(), &expected_values)); - this->ReadAndCheckSingleColumnTable(expected_values); -} -template -using ParquetCDataType = typename ParquetDataType::c_type; + Status GetSize(int64_t* size) override { + *size = buffer_size_; + return Status::OK(); + } -template -class TestPrimitiveParquetIO : public TestParquetIO { - public: - typedef typename TestType::c_type T; - - void MakeTestFile(std::vector& values, int num_chunks, - std::unique_ptr* file_reader) { - std::shared_ptr schema = this->MakeSchema(Repetition::REQUIRED); - std::unique_ptr file_writer = this->MakeWriter(schema); - size_t chunk_size = values.size() / num_chunks; - // Convert to Parquet's expected physical type - std::vector values_buffer( - sizeof(ParquetCDataType) * values.size()); - auto values_parquet = - reinterpret_cast*>(values_buffer.data()); - std::copy(values.cbegin(), values.cend(), values_parquet); - for (int i = 0; i < num_chunks; i++) { - auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = - static_cast*>(row_group_writer->NextColumn()); - ParquetCDataType* data = values_parquet + i * chunk_size; - column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); - column_writer->Close(); - row_group_writer->Close(); + Status Seek(int64_t position) override { + if (position < 0 || position >= buffer_size_) { + return Status::IOError("position out of bounds"); } - file_writer->Close(); - *file_reader = this->ReaderFromSink(); + + position_ = position; + return Status::OK(); } - void CheckSingleColumnRequiredTableRead(int num_chunks) { - std::vector values(SMALL_SIZE, test_traits::value); - std::unique_ptr file_reader; - ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader)); + private: + const uint8_t* buffer_; + int buffer_size_; + int64_t position_; +}; - std::shared_ptr
out; - this->ReadTableFromFile(std::move(file_reader), &out); - ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(SMALL_SIZE, out->num_rows()); +TEST(TestParquetReadSource, Basics) { + std::string data = "this is the data"; + auto data_buffer = reinterpret_cast(data.c_str()); - std::shared_ptr chunked_array = out->column(0)->data(); - ASSERT_EQ(1, chunked_array->num_chunks()); - ExpectArray(values.data(), chunked_array->chunk(0).get()); - } + ParquetAllocator allocator; + auto file = std::make_shared(data_buffer, data.size()); + auto source = std::make_shared(file, &allocator); - void CheckSingleColumnRequiredRead(int num_chunks) { - std::vector values(SMALL_SIZE, test_traits::value); - std::unique_ptr file_reader; - ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader)); + ASSERT_EQ(0, source->Tell()); + ASSERT_NO_THROW(source->Seek(5)); + ASSERT_EQ(5, source->Tell()); + ASSERT_NO_THROW(source->Seek(0)); - std::shared_ptr out; - this->ReadSingleColumnFile(std::move(file_reader), &out); - - ExpectArray(values.data(), out.get()); - } -}; + // Seek out of bounds + ASSERT_THROW(source->Seek(100), ::parquet::ParquetException); -typedef ::testing::Types PrimitiveTestTypes; + uint8_t buffer[50]; -TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes); + ASSERT_NO_THROW(source->Read(4, buffer)); + ASSERT_EQ(0, std::memcmp(buffer, "this", 4)); + ASSERT_EQ(4, source->Tell()); -TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredRead) { - this->CheckSingleColumnRequiredRead(1); -} + std::shared_ptr<::parquet::Buffer> pq_buffer; -TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredTableRead) { - this->CheckSingleColumnRequiredTableRead(1); -} + ASSERT_NO_THROW(pq_buffer = source->Read(7)); -TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedRead) { - this->CheckSingleColumnRequiredRead(4); -} + auto expected_buffer = std::make_shared<::parquet::Buffer>(data_buffer + 4, 7); -TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { - this->CheckSingleColumnRequiredTableRead(4); + ASSERT_TRUE(expected_buffer->Equals(*pq_buffer.get())); } } // namespace parquet - } // namespace arrow diff --git a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc new file mode 100644 index 00000000000..bfc27d26d63 --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc @@ -0,0 +1,489 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
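// For orientation, a minimal sketch of the googletest typed-test pattern the
// file below relies on: one fixture body is instantiated once per entry in a
// ::testing::Types<...> list, and TypeParam names the current type. The
// fixture and types here are placeholders, not part of the patch:
//
//   template <typename T>
//   class TypedFixture : public ::testing::Test {};
//
//   typedef ::testing::Types<int32_t, double> MyTypes;
//   TYPED_TEST_CASE(TypedFixture, MyTypes);  // gtest's pre-1.9 spelling
//
//   TYPED_TEST(TypedFixture, DefaultIsZero) {
//     TypeParam value = TypeParam();
//     ASSERT_EQ(TypeParam(), value);
//   }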
+ +#include "gtest/gtest.h" + +#include "arrow/test-util.h" +#include "arrow/parquet/test-util.h" +#include "arrow/parquet/reader.h" +#include "arrow/parquet/writer.h" +#include "arrow/types/construct.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +#include "parquet/api/reader.h" +#include "parquet/api/writer.h" + +using ParquetBuffer = parquet::Buffer; +using parquet::BufferReader; +using parquet::default_writer_properties; +using parquet::InMemoryOutputStream; +using parquet::LogicalType; +using parquet::ParquetFileReader; +using parquet::ParquetFileWriter; +using parquet::RandomAccessSource; +using parquet::Repetition; +using parquet::SchemaDescriptor; +using parquet::ParquetVersion; +using ParquetType = parquet::Type; +using parquet::schema::GroupNode; +using parquet::schema::NodePtr; +using parquet::schema::PrimitiveNode; + +namespace arrow { + +namespace parquet { + +const int SMALL_SIZE = 100; +const int LARGE_SIZE = 10000; + +template +struct test_traits {}; + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::BOOLEAN; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static uint8_t const value; +}; + +const uint8_t test_traits::value(1); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_8; + static uint8_t const value; +}; + +const uint8_t test_traits::value(64); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::INT_8; + static int8_t const value; +}; + +const int8_t test_traits::value(-64); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_16; + static uint16_t const value; +}; + +const uint16_t test_traits::value(1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::INT_16; + static int16_t const value; +}; + +const int16_t test_traits::value(-1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_32; + static uint32_t const value; +}; + +const uint32_t test_traits::value(1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static int32_t const value; +}; + +const int32_t test_traits::value(-1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; + static constexpr LogicalType::type logical_enum = LogicalType::UINT_64; + static uint64_t const value; +}; + +const uint64_t test_traits::value(1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static int64_t const value; +}; + +const int64_t test_traits::value(-1024); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; + static constexpr LogicalType::type 
logical_enum = LogicalType::NONE; + static float const value; +}; + +const float test_traits::value(2.1f); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; + static constexpr LogicalType::type logical_enum = LogicalType::NONE; + static double const value; +}; + +const double test_traits::value(4.2); + +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY; + static constexpr LogicalType::type logical_enum = LogicalType::UTF8; + static std::string const value; +}; + +const std::string test_traits::value("Test"); + +template +using ParquetDataType = ::parquet::DataType::parquet_enum>; + +template +using ParquetWriter = ::parquet::TypedColumnWriter>; + +template +class TestParquetIO : public ::testing::Test { + public: + virtual void SetUp() {} + + std::shared_ptr MakeSchema(Repetition::type repetition) { + auto pnode = PrimitiveNode::Make("column1", repetition, + test_traits::parquet_enum, test_traits::logical_enum); + NodePtr node_ = + GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); + return std::static_pointer_cast(node_); + } + + std::unique_ptr MakeWriter( + const std::shared_ptr& schema) { + sink_ = std::make_shared(); + return ParquetFileWriter::Open(sink_, schema); + } + + std::unique_ptr ReaderFromSink() { + std::shared_ptr buffer = sink_->GetBuffer(); + std::unique_ptr source(new BufferReader(buffer)); + return ParquetFileReader::Open(std::move(source)); + } + + void ReadSingleColumnFile( + std::unique_ptr file_reader, std::shared_ptr* out) { + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + std::unique_ptr column_reader; + ASSERT_OK_NO_THROW(reader.GetFlatColumn(0, &column_reader)); + ASSERT_NE(nullptr, column_reader.get()); + + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); + ASSERT_NE(nullptr, out->get()); + } + + void ReadAndCheckSingleColumnFile(Array* values) { + std::shared_ptr out; + ReadSingleColumnFile(ReaderFromSink(), &out); + ASSERT_TRUE(values->Equals(out)); + } + + void ReadTableFromFile( + std::unique_ptr file_reader, std::shared_ptr
<Table>* out) {
+ arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
+ ASSERT_OK_NO_THROW(reader.ReadFlatTable(out));
+ ASSERT_NE(nullptr, out->get());
+ }
+
+ void ReadAndCheckSingleColumnTable(const std::shared_ptr<Array>& values) {
+ std::shared_ptr<Table>
out;
+ ReadTableFromFile(ReaderFromSink(), &out);
+ ASSERT_EQ(1, out->num_columns());
+ ASSERT_EQ(values->length(), out->num_rows());
+
+ std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+ ASSERT_EQ(1, chunked_array->num_chunks());
+ ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
+ }
+
+ template <typename T>
+ void WriteFlatColumn(const std::shared_ptr<GroupNode>& schema,
+ const std::shared_ptr<T>& values) {
+ FileWriter writer(default_memory_pool(), MakeWriter(schema));
+ ASSERT_OK_NO_THROW(writer.NewRowGroup(values->length()));
+ ASSERT_OK_NO_THROW(writer.WriteFlatColumnChunk(values.get()));
+ ASSERT_OK_NO_THROW(writer.Close());
+ }
+
+ std::shared_ptr<InMemoryOutputStream> sink_;
+};
+
+// We have separate tests for UInt32Type as this is currently the only type
+// where a roundtrip does not yield the identical Array structure.
+// There we write an UInt32 Array but receive an Int64 Array as result for
+// Parquet version 1.0.
+
+typedef ::testing::Types TestTypes;
+
+TYPED_TEST_CASE(TestParquetIO, TestTypes);
+
+TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) {
+ auto values = NonNullArray<TypeParam>(SMALL_SIZE);
+
+ std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
+ this->WriteFlatColumn(schema, values);
+
+ this->ReadAndCheckSingleColumnFile(values.get());
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) {
+ auto values = NonNullArray<TypeParam>(SMALL_SIZE);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, false);
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_,
+ values->length(), default_writer_properties()));
+
+ std::shared_ptr<Table>
out;
+ this->ReadTableFromFile(this->ReaderFromSink(), &out);
+ ASSERT_EQ(1, out->num_columns());
+ ASSERT_EQ(100, out->num_rows());
+
+ std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
+ ASSERT_EQ(1, chunked_array->num_chunks());
+ ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) {
+ // This also tests max_definition_level = 1
+ auto values = NullableArray<TypeParam>(SMALL_SIZE, 10);
+
+ std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::OPTIONAL);
+ this->WriteFlatColumn(schema, values);
+
+ this->ReadAndCheckSingleColumnFile(values.get());
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) {
+ // This also tests max_definition_level = 1
+ std::shared_ptr<Array> values = NullableArray<TypeParam>(SMALL_SIZE, 10);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, true);
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_,
+ values->length(), default_writer_properties()));
+
+ this->ReadAndCheckSingleColumnTable(values);
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) {
+ auto values = NonNullArray<TypeParam>(SMALL_SIZE);
+ int64_t chunk_size = values->length() / 4;
+
+ std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
+ FileWriter writer(default_memory_pool(), this->MakeWriter(schema));
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size));
+ ASSERT_OK_NO_THROW(
+ writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size));
+ }
+ ASSERT_OK_NO_THROW(writer.Close());
+
+ this->ReadAndCheckSingleColumnFile(values.get());
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) {
+ auto values = NonNullArray<TypeParam>(LARGE_SIZE);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, false);
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ ASSERT_OK_NO_THROW(WriteFlatTable(
+ table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties()));
+
+ this->ReadAndCheckSingleColumnTable(values);
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) {
+ int64_t chunk_size = SMALL_SIZE / 4;
+ auto values = NullableArray<TypeParam>(SMALL_SIZE, 10);
+
+ std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::OPTIONAL);
+ FileWriter writer(default_memory_pool(), this->MakeWriter(schema));
+ for (int i = 0; i < 4; i++) {
+ ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size));
+ ASSERT_OK_NO_THROW(
+ writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size));
+ }
+ ASSERT_OK_NO_THROW(writer.Close());
+
+ this->ReadAndCheckSingleColumnFile(values.get());
+}
+
+TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) {
+ // This also tests max_definition_level = 1
+ auto values = NullableArray<TypeParam>(LARGE_SIZE, 100);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, true);
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ ASSERT_OK_NO_THROW(WriteFlatTable(
+ table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties()));
+
+ this->ReadAndCheckSingleColumnTable(values);
+}
+
+using TestUInt32ParquetIO = TestParquetIO<UInt32Type>;
+
+TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compability) {
+ // This also tests max_definition_level = 1
+ std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, true);
+
+ // Parquet 2.0 roundtrip should yield an uint32_t column again
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::WriterProperties::Builder()
+ .version(ParquetVersion::PARQUET_2_0)
+ ->build();
+ ASSERT_OK_NO_THROW(
+ WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties));
+ this->ReadAndCheckSingleColumnTable(values);
+}
+
+TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
+ // This also tests max_definition_level = 1
+ std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
+ std::shared_ptr<Table>
table = MakeSimpleTable(values, true);
+
+ // Parquet 1.0 returns an int64_t column as there is no way to tell a Parquet 1.0
+ // reader that a column is unsigned.
+ this->sink_ = std::make_shared<InMemoryOutputStream>();
+ std::shared_ptr<::parquet::WriterProperties> properties =
+ ::parquet::WriterProperties::Builder()
+ .version(ParquetVersion::PARQUET_1_0)
+ ->build();
+ ASSERT_OK_NO_THROW(
+ WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties));
+
+ std::shared_ptr<Array> expected_values;
+ std::shared_ptr<PoolBuffer> int64_data =
+ std::make_shared<PoolBuffer>(default_memory_pool());
+ {
+ ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length()));
+ int64_t* int64_data_ptr = reinterpret_cast<int64_t*>(int64_data->mutable_data());
+ const uint32_t* uint32_data_ptr =
+ reinterpret_cast<const uint32_t*>(values->data()->data());
+ // std::copy might be faster but this is explicit about the casts
+ for (int64_t i = 0; i < values->length(); i++) {
+ int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]);
+ }
+ }
+ ASSERT_OK(MakePrimitiveArray(std::make_shared<Int64Type>(), values->length(),
+ int64_data, values->null_count(), values->null_bitmap(), &expected_values));
+ this->ReadAndCheckSingleColumnTable(expected_values);
+}
+
+template <typename T>
+using ParquetCDataType = typename ParquetDataType<T>::c_type;
+
+template <typename TestType>
+class TestPrimitiveParquetIO : public TestParquetIO<TestType> {
+ public:
+ typedef typename TestType::c_type T;
+
+ void MakeTestFile(std::vector<T>& values, int num_chunks,
+ std::unique_ptr<ParquetFileReader>* file_reader) {
+ std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
+ std::unique_ptr<ParquetFileWriter> file_writer = this->MakeWriter(schema);
+ size_t chunk_size = values.size() / num_chunks;
+ // Convert to Parquet's expected physical type
+ std::vector<uint8_t> values_buffer(
+ sizeof(ParquetCDataType<TestType>) * values.size());
+ auto values_parquet =
+ reinterpret_cast<ParquetCDataType<TestType>*>(values_buffer.data());
+ std::copy(values.cbegin(), values.cend(), values_parquet);
+ for (int i = 0; i < num_chunks; i++) {
+ auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
+ auto column_writer =
+ static_cast<ParquetWriter<TestType>*>(row_group_writer->NextColumn());
+ ParquetCDataType<TestType>* data = values_parquet + i * chunk_size;
+ column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
+ column_writer->Close();
+ row_group_writer->Close();
+ }
+ file_writer->Close();
+ *file_reader = this->ReaderFromSink();
+ }
+
+ void CheckSingleColumnRequiredTableRead(int num_chunks) {
+ std::vector<T> values(SMALL_SIZE, test_traits<TestType>::value);
+ std::unique_ptr<ParquetFileReader> file_reader;
+ ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader));
+
+ std::shared_ptr<Table>
out; + this->ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ExpectArray(values.data(), chunked_array->chunk(0).get()); + } + + void CheckSingleColumnRequiredRead(int num_chunks) { + std::vector values(SMALL_SIZE, test_traits::value); + std::unique_ptr file_reader; + ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader)); + + std::shared_ptr out; + this->ReadSingleColumnFile(std::move(file_reader), &out); + + ExpectArray(values.data(), out.get()); + } +}; + +typedef ::testing::Types PrimitiveTestTypes; + +TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes); + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredRead) { + this->CheckSingleColumnRequiredRead(1); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredTableRead) { + this->CheckSingleColumnRequiredTableRead(1); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedRead) { + this->CheckSingleColumnRequiredRead(4); +} + +TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) { + this->CheckSingleColumnRequiredTableRead(4); +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index 409bcd9065c..bcc46be60e6 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -18,12 +18,12 @@ #ifndef ARROW_PARQUET_UTILS_H #define ARROW_PARQUET_UTILS_H -#include "arrow/util/status.h" +#include +#include "arrow/util/status.h" #include "parquet/exception.h" namespace arrow { - namespace parquet { #define PARQUET_CATCH_NOT_OK(s) \ @@ -36,8 +36,17 @@ namespace parquet { (s); \ } catch (const ::parquet::ParquetException& e) {} -} // namespace parquet +#define PARQUET_THROW_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { \ + std::stringstream ss; \ + ss << "Arrow error: " << _s.ToString(); \ + throw ::parquet::ParquetException(ss.str()); \ + } \ + } while (0); +} // namespace parquet } // namespace arrow #endif // ARROW_PARQUET_UTILS_H diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index d874ba30912..d0fb8f9f000 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -51,17 +51,17 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: cdef cppclass HdfsReadableFile(CHdfsFile): CStatus GetSize(int64_t* size) - CStatus Read(int32_t nbytes, int32_t* bytes_read, + CStatus Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) - CStatus ReadAt(int64_t position, int32_t nbytes, - int32_t* bytes_read, uint8_t* buffer) + CStatus ReadAt(int64_t position, int64_t nbytes, + int64_t* bytes_read, uint8_t* buffer) cdef cppclass HdfsWriteableFile(CHdfsFile): - CStatus Write(const uint8_t* buffer, int32_t nbytes) + CStatus Write(const uint8_t* buffer, int64_t nbytes) - CStatus Write(const uint8_t* buffer, int32_t nbytes, - int32_t* bytes_written) + CStatus Write(const uint8_t* buffer, int64_t nbytes, + int64_t* bytes_written) cdef cppclass CHdfsClient" arrow::io::HdfsClient": @staticmethod diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 8b97671e453..071eea5ba6e 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -383,7 +383,7 @@ cdef class HdfsFile: Read indicated number of bytes from the file, up to EOF """ cdef: - int32_t bytes_read = 0 + int64_t bytes_read = 0 
uint8_t* buf self._assert_readable() @@ -394,7 +394,7 @@ cdef class HdfsFile: if buf == NULL: raise MemoryError("Failed to allocate {0} bytes".format(nbytes)) - cdef int32_t total_bytes = 0 + cdef int64_t total_bytes = 0 cdef int rpc_chunksize = min(self.buffer_size, nbytes) @@ -423,7 +423,7 @@ cdef class HdfsFile: memory). First seeks to the beginning of the file. """ cdef: - int32_t bytes_read = 0 + int64_t bytes_read = 0 uint8_t* buf self._assert_readable() @@ -499,6 +499,6 @@ cdef class HdfsFile: data = tobytes(data) cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) - cdef int32_t bufsize = len(data) + cdef int64_t bufsize = len(data) with nogil: check_cstatus(self.wr_file.get().Write(buf, bufsize)) From 62390d8427445b033ba7f7cf3150184222d2c2c1 Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Tue, 12 Jul 2016 17:34:36 -0700 Subject: [PATCH 097/210] ARROW-106: [C++] Add IPC to binary/string types Author: Micah Kornfield Closes #103 from emkornfield/emk_add_string_rpc and squashes the following commits: 9c563fe [Micah Kornfield] ARROW-106: [C++] Add IPC to binary/string types --- cpp/src/arrow/ipc/adapter.cc | 10 ++---- cpp/src/arrow/ipc/ipc-adapter-test.cc | 52 +++++++++++++++++++++++++-- cpp/src/arrow/types/construct.cc | 4 +++ 3 files changed, 57 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 45cc288cd6b..bac11727006 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -33,6 +33,7 @@ #include "arrow/types/construct.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" @@ -81,14 +82,9 @@ static bool IsListType(const DataType* type) { // code consider using pattern like: // http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type // - // TODO(emkornfield) Fix type systems so these are all considered lists and - // the types behave the same way? 
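// A sketch of the macro-dispatch pattern the link above describes, applied to
// this switch (illustrative only; it assumes the same Type enum used here):
//
//   #define LIST_LIKE_CASE(ENUM) case Type::ENUM: return true;
//   switch (type->type) {
//     LIST_LIKE_CASE(BINARY)
//     LIST_LIKE_CASE(LIST)
//     LIST_LIKE_CASE(STRING)
//     default:
//       return false;
//   }
//   #undef LIST_LIKE_CASE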
- // case Type::BINARY: - // case Type::CHAR: + case Type::BINARY: case Type::LIST: - // see todo on common types - // case Type::STRING: - // case Type::VARCHAR: + case Type::STRING: return true; default: return false; diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index eb47ac6fee8..2bfb459d6e0 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -31,6 +31,7 @@ #include "arrow/test-util.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" @@ -105,6 +106,52 @@ Status MakeIntRowBatch(std::shared_ptr* out) { return Status::OK(); } +template +Status MakeRandomBinaryArray( + const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { + const std::vector values = { + "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; + Builder builder(pool, type); + const auto values_len = values.size(); + for (int32_t i = 0; i < length; ++i) { + int values_index = i % values_len; + if (values_index == 0) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + const std::string& value = values[values_index]; + RETURN_NOT_OK( + builder.Append(reinterpret_cast(value.data()), value.size())); + } + } + *array = builder.Finish(); + return Status::OK(); +} + +Status MakeStringTypesRowBatch(std::shared_ptr* out) { + const int32_t length = 500; + auto string_type = std::make_shared(); + auto binary_type = std::make_shared(); + auto f0 = std::make_shared("f0", string_type); + auto f1 = std::make_shared("f1", binary_type); + std::shared_ptr schema(new Schema({f0, f1})); + + std::shared_ptr a0, a1; + MemoryPool* pool = default_memory_pool(); + + { + auto status = + MakeRandomBinaryArray(string_type, length, pool, &a0); + RETURN_NOT_OK(status); + } + { + auto status = + MakeRandomBinaryArray(binary_type, length, pool, &a1); + RETURN_NOT_OK(status); + } + out->reset(new RowBatch(schema, length, {a0, a1})); + return Status::OK(); +} + Status MakeListRowBatch(std::shared_ptr* out) { // Make the schema auto f0 = std::make_shared("f0", LIST_INT32); @@ -191,9 +238,10 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { return Status::OK(); } -INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, +INSTANTIATE_TEST_CASE_P( + RoundTripTests, TestWriteRowBatch, ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, - &MakeZeroLengthRowBatch, &MakeDeeplyNestedList)); + &MakeZeroLengthRowBatch, &MakeDeeplyNestedList, &MakeStringTypesRowBatch)); void TestGetRowBatchSize(std::shared_ptr batch) { MockMemorySource mock_source(1 << 16); diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 2d913a73748..5ae9c5ab6d4 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -124,9 +124,13 @@ Status MakeListArray(const TypePtr& type, int32_t length, const std::shared_ptr& null_bitmap, ArrayPtr* out) { switch (type->type) { case Type::BINARY: + out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap)); + break; + case Type::LIST: out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); break; + case Type::DECIMAL_TEXT: case Type::STRING: out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); From 55bfa834312685991d615301ac0b4fcc7c11640b Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 18 Jul 2016 15:07:48 -0700 Subject: 
[PATCH 098/210] =?UTF-8?q?ARROW-238:=20Change=20InternalMemoryPoo?= =?UTF-8?q?l::Free()=20to=20return=20Status::Invalid=20when=20ther?= =?UTF-8?q?=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …e is insufficient memory. Author: Jihoon Son Closes #102 from jihoonson/ARROW-238 and squashes the following commits: cb9e7b1 [Jihoon Son] Disable FreeLargeMemory test for release builds f903130 [Jihoon Son] Free allocated memory after death 0077a70 [Jihoon Son] Adjust the amount of memory allocation b1af59b [Jihoon Son] Change to ASSERT_EXIT b4159f0 [Jihoon Son] Reflect comments e89a1f9 [Jihoon Son] Change python implementation as well. 7651570 [Jihoon Son] Change InternalMemoryPool::Free() to return Status::Invalid when there is insufficient memory. --- cpp/src/arrow/util/logging.h | 2 +- cpp/src/arrow/util/memory-pool-test.cc | 14 ++++++++++++++ cpp/src/arrow/util/memory-pool.cc | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index fccc5e3085d..54f67593bec 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -40,7 +40,7 @@ namespace arrow { #define ARROW_CHECK(condition) \ (condition) ? 0 : ::arrow::internal::FatalLog(ARROW_FATAL) \ - << __FILE__ << __LINE__ << "Check failed: " #condition " " + << __FILE__ << __LINE__ << " Check failed: " #condition " " #ifdef NDEBUG #define ARROW_DFATAL ARROW_WARNING diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 8e7dfd60baa..919f3740982 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -46,4 +46,18 @@ TEST(DefaultMemoryPool, OOM) { ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); } +TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { + MemoryPool* pool = default_memory_pool(); + + uint8_t* data; + ASSERT_OK(pool->Allocate(100, &data)); + +#ifndef NDEBUG + EXPECT_EXIT(pool->Free(data, 120), ::testing::ExitedWithCode(1), + ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); +#endif + + pool->Free(data, 100); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index 0a58e5aa21f..fed149bc359 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -23,6 +23,7 @@ #include #include "arrow/util/status.h" +#include "arrow/util/logging.h" namespace arrow { @@ -81,6 +82,7 @@ int64_t InternalMemoryPool::bytes_allocated() const { void InternalMemoryPool::Free(uint8_t* buffer, int64_t size) { std::lock_guard guard(pool_lock_); + DCHECK_GE(bytes_allocated_, size); std::free(buffer); bytes_allocated_ -= size; } From 59e5f9806515e8a5360870c93082316f74d7ec7c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 18 Jul 2016 15:37:27 -0700 Subject: [PATCH 099/210] ARROW-236: Bridging IO interfaces under the hood in pyarrow Author: Wes McKinney Closes #104 from wesm/ARROW-236 and squashes the following commits: 73648e0 [Wes McKinney] cpplint f2cd77f [Wes McKinney] Check in io.pxd 94bcd30 [Wes McKinney] Do not let Parquet close an Arrow file 9b9d94d [Wes McKinney] Barely working direct HDFS-Parquet reads 06ddd06 [Wes McKinney] Slight refactoring of read table to be able to also handle classes wrapping C++ file interfaces c7a913e [Wes McKinney] Provide a means to expose abstract native file handles e6724de [Wes McKinney] Implement alternate ctor to construct parquet::FileReader from an arrow::io::RandomAccessFile --- 
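For orientation before the diffs: the C++ entry point this patch introduces
(OpenFile in reader.h below) lets any arrow::io::RandomAccessFile back a
Parquet read. A rough sketch of its intended use follows; `file` stands in for
an already-opened handle such as HdfsReadableFile, and error handling is
reduced to propagating Status:

    #include "arrow/parquet/reader.h"
    #include "arrow/util/memory-pool.h"

    arrow::Status ReadParquet(
        const std::shared_ptr<arrow::io::RandomAccessFile>& file,
        std::shared_ptr<arrow::Table>* out) {
      // ParquetAllocator adapts an Arrow MemoryPool to Parquet's allocator API
      arrow::parquet::ParquetAllocator allocator(arrow::default_memory_pool());
      std::unique_ptr<arrow::parquet::FileReader> reader;
      RETURN_NOT_OK(arrow::parquet::OpenFile(file, &allocator, &reader));
      return reader->ReadFlatTable(out);
    }

This is the same path the Python layer exercises via
ParquetReader.open_native_file further down.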
cpp/src/arrow/io/interfaces.h | 1 + cpp/src/arrow/parquet/io.cc | 19 +++++-- cpp/src/arrow/parquet/io.h | 10 +++- cpp/src/arrow/parquet/parquet-io-test.cc | 8 ++- cpp/src/arrow/parquet/reader.cc | 20 +++++++ cpp/src/arrow/parquet/reader.h | 13 ++++- cpp/src/arrow/parquet/writer.cc | 1 - cpp/src/arrow/parquet/writer.h | 2 +- python/pyarrow/includes/libarrow_io.pxd | 49 +++++++++++------ python/pyarrow/includes/parquet.pxd | 24 +++++++- python/pyarrow/io.pxd | 32 +++++++++++ python/pyarrow/io.pyx | 19 ++++++- python/pyarrow/parquet.pyx | 70 +++++++++++++++++++----- 13 files changed, 216 insertions(+), 52 deletions(-) create mode 100644 python/pyarrow/io.pxd diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index 25361d5633d..c2128525371 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -19,6 +19,7 @@ #define ARROW_IO_INTERFACES_H #include +#include namespace arrow { diff --git a/cpp/src/arrow/parquet/io.cc b/cpp/src/arrow/parquet/io.cc index c81aa8c4da9..b6fdd67d15b 100644 --- a/cpp/src/arrow/parquet/io.cc +++ b/cpp/src/arrow/parquet/io.cc @@ -55,12 +55,23 @@ void ParquetAllocator::Free(uint8_t* buffer, int64_t size) { // ---------------------------------------------------------------------- // ParquetReadSource -ParquetReadSource::ParquetReadSource( - const std::shared_ptr& file, ParquetAllocator* allocator) - : file_(file), allocator_(allocator) {} +ParquetReadSource::ParquetReadSource(ParquetAllocator* allocator) + : file_(nullptr), allocator_(allocator) {} + +Status ParquetReadSource::Open(const std::shared_ptr& file) { + int64_t file_size; + RETURN_NOT_OK(file->GetSize(&file_size)); + + file_ = file; + size_ = file_size; + return Status::OK(); +} void ParquetReadSource::Close() { - PARQUET_THROW_NOT_OK(file_->Close()); + // TODO(wesm): Make this a no-op for now. This leaves Python wrappers for + // these classes in a borked state. Probably better to explicitly close. 
+ + // PARQUET_THROW_NOT_OK(file_->Close()); } int64_t ParquetReadSource::Tell() const { diff --git a/cpp/src/arrow/parquet/io.h b/cpp/src/arrow/parquet/io.h index ef8871da4df..1c59695c6c1 100644 --- a/cpp/src/arrow/parquet/io.h +++ b/cpp/src/arrow/parquet/io.h @@ -49,7 +49,9 @@ class ARROW_EXPORT ParquetAllocator : public ::parquet::MemoryAllocator { uint8_t* Malloc(int64_t size) override; void Free(uint8_t* buffer, int64_t size) override; - MemoryPool* pool() { return pool_; } + void set_pool(MemoryPool* pool) { pool_ = pool; } + + MemoryPool* pool() const { return pool_; } private: MemoryPool* pool_; @@ -57,8 +59,10 @@ class ARROW_EXPORT ParquetAllocator : public ::parquet::MemoryAllocator { class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { public: - ParquetReadSource( - const std::shared_ptr& file, ParquetAllocator* allocator); + explicit ParquetReadSource(ParquetAllocator* allocator); + + // We need to ask for the file size on opening the file, and this can fail + Status Open(const std::shared_ptr& file); void Close() override; int64_t Tell() const override; diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 7e724b31e38..6615457c483 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -23,6 +23,7 @@ #include "gtest/gtest.h" #include "arrow/parquet/io.h" +#include "arrow/test-util.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -147,9 +148,12 @@ TEST(TestParquetReadSource, Basics) { std::string data = "this is the data"; auto data_buffer = reinterpret_cast(data.c_str()); - ParquetAllocator allocator; + ParquetAllocator allocator(default_memory_pool()); + auto file = std::make_shared(data_buffer, data.size()); - auto source = std::make_shared(file, &allocator); + auto source = std::make_shared(&allocator); + + ASSERT_OK(source->Open(file)); ASSERT_EQ(0, source->Tell()); ASSERT_NO_THROW(source->Seek(5)); diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index c7c400e9573..e92967e5363 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -23,6 +23,7 @@ #include #include "arrow/column.h" +#include "arrow/parquet/io.h" #include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/schema.h" @@ -35,6 +36,10 @@ using parquet::ColumnReader; using parquet::Repetition; using parquet::TypedColumnReader; +// Help reduce verbosity +using ParquetRAS = parquet::RandomAccessSource; +using ParquetReader = parquet::ParquetFileReader; + namespace arrow { namespace parquet { @@ -181,6 +186,21 @@ FileReader::FileReader( FileReader::~FileReader() {} +// Static ctor +Status OpenFile(const std::shared_ptr& file, + ParquetAllocator* allocator, std::unique_ptr* reader) { + std::unique_ptr source(new ParquetReadSource(allocator)); + RETURN_NOT_OK(source->Open(file)); + + // TODO(wesm): reader properties + std::unique_ptr pq_reader; + PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(std::move(source))); + + // Use the same memory pool as the ParquetAllocator + reader->reset(new FileReader(allocator->pool(), std::move(pq_reader))); + return Status::OK(); +} + Status FileReader::GetFlatColumn(int i, std::unique_ptr* out) { return impl_->GetFlatColumn(i, out); } diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index 2c8a9dfd025..f1492f64521 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -23,6 +23,8 @@ #include 
"parquet/api/reader.h" #include "parquet/api/schema.h" +#include "arrow/io/interfaces.h" +#include "arrow/parquet/io.h" #include "arrow/util/visibility.h" namespace arrow { @@ -99,7 +101,7 @@ class ARROW_EXPORT FileReader { virtual ~FileReader(); private: - class Impl; + class ARROW_NO_EXPORT Impl; std::unique_ptr impl_; }; @@ -125,15 +127,20 @@ class ARROW_EXPORT FlatColumnReader { Status NextBatch(int batch_size, std::shared_ptr* out); private: - class Impl; + class ARROW_NO_EXPORT Impl; std::unique_ptr impl_; explicit FlatColumnReader(std::unique_ptr impl); friend class FileReader; }; -} // namespace parquet +// Helper function to create a file reader from an implementation of an Arrow +// readable file +ARROW_EXPORT +Status OpenFile(const std::shared_ptr& file, + ParquetAllocator* allocator, std::unique_ptr* reader); +} // namespace parquet } // namespace arrow #endif // ARROW_PARQUET_READER_H diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 0139edd3bb8..f9514aa2ad2 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -35,7 +35,6 @@ using parquet::ParquetVersion; using parquet::schema::GroupNode; namespace arrow { - namespace parquet { class FileWriter::Impl { diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 45d0fd59868..5aa1ba58717 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -55,7 +55,7 @@ class ARROW_EXPORT FileWriter { MemoryPool* memory_pool() const; private: - class Impl; + class ARROW_NO_EXPORT Impl; std::unique_ptr impl_; }; diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index d0fb8f9f000..734ace6c923 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -19,11 +19,37 @@ from pyarrow.includes.common cimport * -cdef extern from "arrow/io/interfaces.h" nogil: +cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: + enum FileMode" arrow::io::FileMode::type": + FileMode_READ" arrow::io::FileMode::READ" + FileMode_WRITE" arrow::io::FileMode::WRITE" + FileMode_READWRITE" arrow::io::FileMode::READWRITE" + enum ObjectType" arrow::io::ObjectType::type": ObjectType_FILE" arrow::io::ObjectType::FILE" ObjectType_DIRECTORY" arrow::io::ObjectType::DIRECTORY" + cdef cppclass FileBase: + CStatus Close() + CStatus Tell(int64_t* position) + + cdef cppclass ReadableFile(FileBase): + CStatus GetSize(int64_t* size) + CStatus Read(int64_t nbytes, int64_t* bytes_read, + uint8_t* buffer) + + CStatus ReadAt(int64_t position, int64_t nbytes, + int64_t* bytes_read, uint8_t* buffer) + + cdef cppclass RandomAccessFile(ReadableFile): + CStatus Seek(int64_t position) + + cdef cppclass WriteableFile(FileBase): + CStatus Write(const uint8_t* buffer, int64_t nbytes) + # CStatus Write(const uint8_t* buffer, int64_t nbytes, + # int64_t* bytes_written) + + cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: CStatus ConnectLibHdfs() @@ -44,24 +70,11 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: int64_t block_size int16_t permissions - cdef cppclass CHdfsFile: - CStatus Close() - CStatus Seek(int64_t position) - CStatus Tell(int64_t* position) - - cdef cppclass HdfsReadableFile(CHdfsFile): - CStatus GetSize(int64_t* size) - CStatus Read(int64_t nbytes, int64_t* bytes_read, - uint8_t* buffer) - - CStatus ReadAt(int64_t position, int64_t nbytes, - int64_t* bytes_read, uint8_t* buffer) - - cdef cppclass HdfsWriteableFile(CHdfsFile): - 
CStatus Write(const uint8_t* buffer, int64_t nbytes) + cdef cppclass HdfsReadableFile(RandomAccessFile): + pass - CStatus Write(const uint8_t* buffer, int64_t nbytes, - int64_t* bytes_written) + cdef cppclass HdfsWriteableFile(WriteableFile): + pass cdef cppclass CHdfsClient" arrow::io::HdfsClient": @staticmethod diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index a2f83ea5ea5..fe24f593e32 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -19,6 +19,7 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool +from pyarrow.includes.libarrow_io cimport RandomAccessFile cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: @@ -90,19 +91,36 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: shared_ptr[WriterProperties] build() +cdef extern from "arrow/parquet/io.h" namespace "arrow::parquet" nogil: + cdef cppclass ParquetAllocator: + ParquetAllocator() + ParquetAllocator(MemoryPool* pool) + MemoryPool* pool() + void set_pool(MemoryPool* pool) + + cdef cppclass ParquetReadSource: + ParquetReadSource(ParquetAllocator* allocator) + Open(const shared_ptr[RandomAccessFile]& file) + + cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: + CStatus OpenFile(const shared_ptr[RandomAccessFile]& file, + ParquetAllocator* allocator, + unique_ptr[FileReader]* reader) + cdef cppclass FileReader: FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) CStatus ReadFlatTable(shared_ptr[CTable]* out); cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil: - CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, shared_ptr[CSchema]* out) - CStatus ToParquetSchema(const CSchema* arrow_schema, shared_ptr[SchemaDescriptor]* out) + CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, + shared_ptr[CSchema]* out) + CStatus ToParquetSchema(const CSchema* arrow_schema, + shared_ptr[SchemaDescriptor]* out) cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil: cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool, const shared_ptr[OutputStream]& sink, int64_t chunk_size, const shared_ptr[WriterProperties]& properties) - diff --git a/python/pyarrow/io.pxd b/python/pyarrow/io.pxd new file mode 100644 index 00000000000..b92af72704a --- /dev/null +++ b/python/pyarrow/io.pxd @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_io cimport RandomAccessFile, WriteableFile + + +cdef class NativeFileInterface: + + # By implementing these "virtual" functions (all functions in Cython + # extension classes are technically virtual in the C++ sense)m we can + # expose the arrow::io abstract file interfaces to other components + # throughout the suite of Arrow C++ libraries + cdef read_handle(self, shared_ptr[RandomAccessFile]* file) + cdef write_handle(self, shared_ptr[WriteableFile]* file) diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 071eea5ba6e..b8bf8835620 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -164,7 +164,7 @@ cdef class HdfsClient: .ListDirectory(c_path, &listing)) cdef const HdfsPathInfo* info - for i in range(listing.size()): + for i in range( listing.size()): info = &listing[i] # Try to trim off the hdfs://HOST:PORT piece @@ -314,8 +314,15 @@ cdef class HdfsClient: f = self.open(path, 'rb', buffer_size=buffer_size) f.download(stream) +cdef class NativeFileInterface: -cdef class HdfsFile: + cdef read_handle(self, shared_ptr[RandomAccessFile]* file): + raise NotImplementedError + + cdef write_handle(self, shared_ptr[WriteableFile]* file): + raise NotImplementedError + +cdef class HdfsFile(NativeFileInterface): cdef: shared_ptr[HdfsReadableFile] rd_file shared_ptr[HdfsWriteableFile] wr_file @@ -357,6 +364,14 @@ cdef class HdfsFile: if self.is_readonly: raise IOError("only valid on writeonly files") + cdef read_handle(self, shared_ptr[RandomAccessFile]* file): + self._assert_readable() + file[0] = self.rd_file + + cdef write_handle(self, shared_ptr[WriteableFile]* file): + self._assert_writeable() + file[0] = self.wr_file + def size(self): cdef int64_t size self._assert_readable() diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 0b2b2088033..ebba1a17ac7 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -20,34 +20,75 @@ # cython: embedsignature = True from pyarrow.includes.libarrow cimport * -cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * +from pyarrow.includes.libarrow_io cimport RandomAccessFile, WriteableFile +cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.compat import tobytes from pyarrow.error import ArrowException from pyarrow.error cimport check_cstatus +from pyarrow.io import NativeFileInterface from pyarrow.table cimport Table -def read_table(filename, columns=None): +from pyarrow.io cimport NativeFileInterface + +import six + + +cdef class ParquetReader: + cdef: + ParquetAllocator allocator + unique_ptr[FileReader] reader + + def __cinit__(self): + self.allocator.set_pool(default_memory_pool()) + + cdef open_local_file(self, file_path): + cdef c_string path = tobytes(file_path) + + # Must be in one expression to avoid calling std::move which is not + # possible in Cython (due to missing rvalue support) + + # TODO(wesm): ParquetFileReader::OpenFIle can throw? 
+ self.reader = unique_ptr[FileReader]( + new FileReader(default_memory_pool(), + ParquetFileReader.OpenFile(path))) + + cdef open_native_file(self, NativeFileInterface file): + cdef shared_ptr[RandomAccessFile] cpp_handle + file.read_handle(&cpp_handle) + + check_cstatus(OpenFile(cpp_handle, &self.allocator, &self.reader)) + + def read_all(self): + cdef: + Table table = Table() + shared_ptr[CTable] ctable + + with nogil: + check_cstatus(self.reader.get() + .ReadFlatTable(&ctable)) + + table.init(ctable) + return table + + +def read_table(source, columns=None): """ Read a Table from Parquet format Returns ------- table: pyarrow.Table """ - cdef unique_ptr[FileReader] reader - cdef Table table = Table() - cdef shared_ptr[CTable] ctable - - # Must be in one expression to avoid calling std::move which is not possible - # in Cython (due to missing rvalue support) - reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), - ParquetFileReader.OpenFile(tobytes(filename)))) - with nogil: - check_cstatus(reader.get().ReadFlatTable(&ctable)) + cdef ParquetReader reader = ParquetReader() + + if isinstance(source, six.string_types): + reader.open_local_file(source) + elif isinstance(source, NativeFileInterface): + reader.open_native_file(source) + + return reader.read_all() - table.init(ctable) - return table def write_table(table, filename, chunk_size=None, version=None): """ @@ -84,4 +125,3 @@ def write_table(table, filename, chunk_size=None, version=None): with nogil: check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_, properties_builder.build())) - From a2fb756a43441a72e10ae74fa0e483e01bc5917e Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Tue, 19 Jul 2016 13:39:48 -0700 Subject: [PATCH 100/210] ARROW-241: Add missing implementation for splitAndTransfer in UnionVector Use simple implementation that actually just copies --- java/vector/src/main/codegen/templates/UnionVector.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 6042a5bf683..482944828ad 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -264,7 +264,11 @@ public void transfer() { @Override public void splitAndTransfer(int startIndex, int length) { - + to.allocateNew(); + for (int i = 0; i < length; i++) { + to.copyFromSafe(startIndex + i, i, org.apache.arrow.vector.complex.UnionVector.this); + } + to.getMutator().setValueCount(length); } @Override From dc79ceb05c05e626e2324863cfc3f386ecccce90 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 1 Aug 2016 11:29:02 -0700 Subject: [PATCH 101/210] ARROW-244: Some global APIs of IPC module should be visible to the outside Author: Jihoon Son Closes #109 from jihoonson/ARROW-244 and squashes the following commits: 51d9a87 [Jihoon Son] Make line length shorter than 90 2da5466 [Jihoon Son] Make some APIs of IPC module visible --- cpp/src/arrow/ipc/adapter.h | 11 +++++++---- cpp/src/arrow/ipc/memory.h | 5 +++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index 0d2b77f5ace..a34a5c4fcc9 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -24,6 +24,8 @@ #include #include +#include "arrow/util/visibility.h" + namespace arrow { class Array; @@ -54,20 +56,21 @@ constexpr int kMaxIpcRecursionDepth = 64; // // Finally, the memory 
offset to the start of the metadata / data header is // returned in an out-variable -Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, - int64_t* header_offset, int max_recursion_depth = kMaxIpcRecursionDepth); +ARROW_EXPORT Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, + int64_t position, int64_t* header_offset, + int max_recursion_depth = kMaxIpcRecursionDepth); // int64_t GetRowBatchMetadata(const RowBatch* batch); // Compute the precise number of bytes needed in a contiguous memory segment to // write the row batch. This involves generating the complete serialized // Flatbuffers metadata. -Status GetRowBatchSize(const RowBatch* batch, int64_t* size); +ARROW_EXPORT Status GetRowBatchSize(const RowBatch* batch, int64_t* size); // ---------------------------------------------------------------------- // "Read" path; does not copy data if the MemorySource does not -class RowBatchReader { +class ARROW_EXPORT RowBatchReader { public: static Status Open( MemorySource* source, int64_t position, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index c6fd7a71899..377401d85c0 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -25,6 +25,7 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { @@ -69,7 +70,7 @@ class BufferOutputStream : public OutputStream { int64_t position_; }; -class MemorySource { +class ARROW_EXPORT MemorySource { public: // Indicates the access permissions of the memory source enum AccessMode { READ_ONLY, READ_WRITE }; @@ -100,7 +101,7 @@ class MemorySource { }; // A memory source that uses memory-mapped files for memory interactions -class MemoryMappedSource : public MemorySource { +class ARROW_EXPORT MemoryMappedSource : public MemorySource { public: static Status Open(const std::string& path, AccessMode access_mode, std::shared_ptr* out); From 356d015bb7de3a12167ac8ea02dbda9bbdc8c27f Mon Sep 17 00:00:00 2001 From: MechCoder Date: Wed, 13 Jul 2016 17:24:26 -0700 Subject: [PATCH 102/210] ARROW-240: Provide more detailed installation instructions for pyarrow. Closes --- python/README.md | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/python/README.md b/python/README.md index c79fa9786f4..bafe71b05ec 100644 --- a/python/README.md +++ b/python/README.md @@ -4,11 +4,40 @@ This library provides a Pythonic API wrapper for the reference Arrow C++ implementation, along with tools for interoperability with pandas, NumPy, and other traditional Python scientific computing packages. -#### Development details +### Development details This project is layered in two pieces: * pyarrow, a C++ library for easier interoperability between Arrow C++, NumPy, and pandas * Cython extensions and pure Python code under arrow/ which expose Arrow C++ - and pyarrow to pure Python users \ No newline at end of file + and pyarrow to pure Python users + +#### PyArrow Dependencies: +These are the various projects that PyArrow depends on. + +1. **g++ and gcc Version >= 4.8** +2. **cmake > 2.8.6** +3. **boost** +4. **Parquet-cpp** + + The preferred way to install parquet-cpp is to use conda. + You need to set the ``PARQUET_HOME`` environment variable to where parquet-cpp is installed. + ```bash + conda install -y --channel apache/channel/dev parquet-cpp + ``` +5. 
**Arrow-cpp and its dependencies** + + The Arrow C++ library must be built with all options enabled and installed with the ``ARROW_HOME`` environment variable set to + the installation location. See https://github.com/apache/arrow/blob/master/cpp/README.md for + instructions. Alternatively, you can install arrow-cpp + from conda. + ```bash + conda install arrow-cpp -c apache/channel/dev + ``` +6. **Python dependencies: numpy, pandas, cython, pytest** + +#### Install pyarrow + ```bash + python setup.py build_ext --inplace + ``` From 3a2dfba59a2482226cc3c49a11a779dd9ce3dfd7 Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Mon, 1 Aug 2016 16:31:54 -0700 Subject: [PATCH 103/210] ARROW-101: Fix java compiler warnings Fixes several warnings emitted by java compiler regarding the use of raw types and unclosed resources. Author: Laurent Goujon Closes #60 from laurentgo/laurent/fix-generic-warnings and squashes the following commits: 96ccc67 [Laurent Goujon] [ARROW-101] Fix java compiler resources warnings 61bde83 [Laurent Goujon] [ARROW-101] Fix java compiler rawtypes warnings --- .../src/main/java/org/apache/arrow/vector/ZeroVector.java | 5 +++-- .../arrow/vector/complex/impl/PromotableWriter.java | 8 ++++---- .../org/apache/arrow/vector/util/JsonStringArrayList.java | 2 +- .../org/apache/arrow/vector/util/JsonStringHashMap.java | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 78de8706fb7..c94e8d1db09 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -19,6 +19,7 @@ import io.netty.buffer.ArrowBuf; +import java.util.Collections; import java.util.Iterator; import org.apache.arrow.memory.BufferAllocator; @@ -109,8 +110,8 @@ public TransferPair getTransferPair(BufferAllocator allocator) { // } @Override - public Iterator iterator() { - return Iterators.emptyIterator(); + public Iterator<ValueVector> iterator() { + return Collections.emptyIterator(); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index ea62e023608..45509f688ba 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -85,16 +85,16 @@ private void setWriter(ValueVector v) { state = State.SINGLE; vector = v; type = v.getField().getType().getMinorType(); - Class writerClass = BasicTypeHelper + Class<? extends FieldWriter> writerClass = BasicTypeHelper .getWriterImpl(v.getField().getType().getMinorType(), v.getField().getDataMode()); if (writerClass.equals(SingleListWriter.class)) { writerClass = UnionListWriter.class; } - Class vectorClass = BasicTypeHelper.getValueVectorClass(v.getField().getType().getMinorType(), v.getField() + Class<? extends ValueVector> vectorClass = BasicTypeHelper.getValueVectorClass(v.getField().getType().getMinorType(), v.getField() .getDataMode()); try { - Constructor constructor = null; - for (Constructor c : writerClass.getConstructors()) { + Constructor<?> constructor = null; + for (Constructor<?> c : writerClass.getConstructors()) { if (c.getParameterTypes().length == 3) { constructor = c; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java
b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java index 7aeaa12ef9f..6291bfeaee6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringArrayList.java @@ -42,7 +42,7 @@ public boolean equals(Object obj) { if (!(obj instanceof List)) { return false; } - List other = (List) obj; + List<?> other = (List<?>) obj; return this.size() == other.size() && this.containsAll(other); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java index 750dd592aa4..e8ce5221eeb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/JsonStringHashMap.java @@ -46,7 +46,7 @@ public boolean equals(Object obj) { if (!(obj instanceof Map)) { return false; } - Map other = (Map) obj; + Map<?, ?> other = (Map<?, ?>) obj; if (this.size() != other.size()) { return false; } From 56835c338f01aebcace01312e82431306e7fd578 Mon Sep 17 00:00:00 2001 From: adeneche Date: Mon, 1 Aug 2016 15:28:08 -0700 Subject: [PATCH 104/210] ARROW-246: [Java] UnionVector doesn't call allocateNew() when creating its vectorType --- .../main/codegen/templates/UnionVector.java | 2 + .../arrow/vector/DirtyBufferAllocator.java | 120 ++++++++++++++++++ .../apache/arrow/vector/TestUnionVector.java | 88 +++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 482944828ad..692436d1285 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -73,6 +73,8 @@ public UnionVector(MaterializedField field, BufferAllocator allocator, CallBack this.allocator = allocator; this.internalMap = new MapVector("internal", allocator, callBack); this.typeVector = internalMap.addOrGet("types", new MajorType(MinorType.UINT1, DataMode.REQUIRED), UInt1Vector.class); + this.typeVector.allocateNew(); + this.typeVector.zeroVector(); this.field.addChild(internalMap.getField().clone()); this.majorType = field.getType(); this.callBack = callBack; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java b/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java new file mode 100644 index 00000000000..cc6b9ec51d6 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import org.apache.arrow.memory.AllocationReservation; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.BufferManager; + +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.ByteBufAllocator; + +/** + * Wrapper around a buffer delegate that populates any allocated buffer with a constant + * value. Useful for testing if value vectors are properly resetting their buffers. + */ +public class DirtyBufferAllocator implements BufferAllocator { + + private final BufferAllocator delegate; + private final byte fillValue; + + DirtyBufferAllocator(final BufferAllocator delegate, final byte fillValue) { + this.delegate = delegate; + this.fillValue = fillValue; + } + + @Override + public ArrowBuf buffer(int size) { + return buffer(size, null); + } + + @Override + public ArrowBuf buffer(int size, BufferManager manager) { + ArrowBuf buffer = delegate.buffer(size, manager); + // contaminate the buffer + for (int i = 0; i < buffer.capacity(); i++) { + buffer.setByte(i, fillValue); + } + + return buffer; + } + + @Override + public ByteBufAllocator getAsByteBufAllocator() { + return delegate.getAsByteBufAllocator(); + } + + @Override + public BufferAllocator newChildAllocator(String name, long initReservation, long maxAllocation) { + return delegate.newChildAllocator(name, initReservation, maxAllocation); + } + + @Override + public void close() { + delegate.close(); + } + + @Override + public long getAllocatedMemory() { + return delegate.getAllocatedMemory(); + } + + @Override + public void setLimit(long newLimit) { + delegate.setLimit(newLimit); + } + + @Override + public long getLimit() { + return delegate.getLimit(); + } + + @Override + public long getPeakMemoryAllocation() { + return delegate.getPeakMemoryAllocation(); + } + + @Override + public AllocationReservation newReservation() { + return delegate.newReservation(); + } + + @Override + public ArrowBuf getEmpty() { + return delegate.getEmpty(); + } + + @Override + public String getName() { + return delegate.getName(); + } + + @Override + public boolean isOverLimit() { + return delegate.isOverLimit(); + } + + @Override + public String toVerboseString() { + return delegate.toVerboseString(); + } + + @Override + public void assertOpen() { + delegate.assertOpen(); + }} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java new file mode 100644 index 00000000000..8f19b3191ba --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.holders.NullableUInt4Holder; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestUnionVector { + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testUnionVector() throws Exception { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + + final BufferAllocator alloc = new DirtyBufferAllocator(allocator, (byte) 100); + + UnionVector unionVector = new UnionVector(field, alloc, null); + + final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); + uInt4Holder.value = 100; + uInt4Holder.isSet = 1; + + try { + // write some data + final UnionVector.Mutator mutator = unionVector.getMutator(); + mutator.setType(0, Types.MinorType.UINT4); + mutator.setSafe(0, uInt4Holder); + mutator.setType(2, Types.MinorType.UINT4); + mutator.setSafe(2, uInt4Holder); + mutator.setValueCount(4); + + // check that what we wrote is correct + final UnionVector.Accessor accessor = unionVector.getAccessor(); + assertEquals(4, accessor.getValueCount()); + + assertEquals(false, accessor.isNull(0)); + assertEquals(100, accessor.getObject(0)); + + assertEquals(true, accessor.isNull(1)); + + assertEquals(false, accessor.isNull(2)); + assertEquals(100, accessor.getObject(2)); + + assertEquals(true, accessor.isNull(3)); + + } finally { + unionVector.clear(); + } + } + +} From 5df7d4dee5fd57e91d9bb83f44f2269f61b79fb3 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 4 Aug 2016 15:29:01 -0700 Subject: [PATCH 105/210] ARROW-247: Missing explicit destructor in RowBatchReader causes an incomplete type error Author: Jihoon Son Closes #111 from jihoonson/ARROW-247 and squashes the following commits: cc7281c [Jihoon Son] Make destructor virtual 795d3d3 [Jihoon Son] Merge branch 'master' of https://github.com/apache/arrow into ARROW-247 df297ef [Jihoon Son] Trigger travis 65d64c8 [Jihoon Son] Make the comment into two line 9555260 [Jihoon Son] Add a comment f671a32 [Jihoon Son] Add explicit destructor for RowBatchReader --- cpp/src/arrow/ipc/adapter.cc | 4 ++++ cpp/src/arrow/ipc/adapter.h | 2 ++ 2 files changed, 6 insertions(+) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index bac11727006..84f7830092c 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -369,6 +369,10 @@ Status RowBatchReader::Open(MemorySource* source, int64_t position, return Status::OK(); } +// Here the explicit destructor is required for compilers to be aware of +// the complete information of RowBatchReader::Impl class +RowBatchReader::~RowBatchReader() {} + Status RowBatchReader::GetRowBatch( const std::shared_ptr& schema, std::shared_ptr* out) { return impl_->AssembleBatch(schema, out); diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h 
index a34a5c4fcc9..6231af66aa1 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -78,6 +78,8 @@ class ARROW_EXPORT RowBatchReader { static Status Open(MemorySource* source, int64_t position, int max_recursion_depth, std::shared_ptr* out); + virtual ~RowBatchReader(); + // Reassemble the row batch. A Schema is required to be able to construct the // right array containers Status GetRowBatch( From 34e7f48cb71428c4d78cf00d8fdf0045532d6607 Mon Sep 17 00:00:00 2001 From: adeneche Date: Fri, 5 Aug 2016 10:26:47 -0700 Subject: [PATCH 106/210] ARROW-250: Fix for ARROW-246 may cause memory leaks this closes #112 --- .../main/codegen/templates/UnionVector.java | 3 +- .../vector/complex/impl/PromotableWriter.java | 1 + .../arrow/vector/DirtyBufferAllocator.java | 120 ------------------ .../arrow/vector/DirtyRootAllocator.java | 53 ++++++++ .../apache/arrow/vector/TestUnionVector.java | 14 +- .../complex/impl/TestPromotableWriter.java | 98 ++++++++++++++ 6 files changed, 157 insertions(+), 132 deletions(-) delete mode 100644 java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 692436d1285..0f089b7e915 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -73,8 +73,6 @@ public UnionVector(MaterializedField field, BufferAllocator allocator, CallBack this.allocator = allocator; this.internalMap = new MapVector("internal", allocator, callBack); this.typeVector = internalMap.addOrGet("types", new MajorType(MinorType.UINT1, DataMode.REQUIRED), UInt1Vector.class); - this.typeVector.allocateNew(); - this.typeVector.zeroVector(); this.field.addChild(internalMap.getField().clone()); this.majorType = field.getType(); this.callBack = callBack; @@ -193,6 +191,7 @@ public int getValueCapacity() { @Override public void close() { + clear(); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 45509f688ba..462ec9dd86a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -155,6 +155,7 @@ private FieldWriter promoteToUnion() { tp.transfer(); if (parentContainer != null) { unionVector = parentContainer.addOrGet(name, new MajorType(MinorType.UNION, DataMode.OPTIONAL), UnionVector.class); + unionVector.allocateNew(); } else if (listVector != null) { unionVector = listVector.promoteToUnion(); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java b/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java deleted file mode 100644 index cc6b9ec51d6..00000000000 --- a/java/vector/src/test/java/org/apache/arrow/vector/DirtyBufferAllocator.java +++ /dev/null @@ -1,120 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.vector; - -import org.apache.arrow.memory.AllocationReservation; -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.BufferManager; - -import io.netty.buffer.ArrowBuf; -import io.netty.buffer.ByteBufAllocator; - -/** - * Wrapper around a buffer delegate that populates any allocated buffer with a constant - * value. Useful for testing if value vectors are properly resetting their buffers. - */ -public class DirtyBufferAllocator implements BufferAllocator { - - private final BufferAllocator delegate; - private final byte fillValue; - - DirtyBufferAllocator(final BufferAllocator delegate, final byte fillValue) { - this.delegate = delegate; - this.fillValue = fillValue; - } - - @Override - public ArrowBuf buffer(int size) { - return buffer(size, null); - } - - @Override - public ArrowBuf buffer(int size, BufferManager manager) { - ArrowBuf buffer = delegate.buffer(size, manager); - // contaminate the buffer - for (int i = 0; i < buffer.capacity(); i++) { - buffer.setByte(i, fillValue); - } - - return buffer; - } - - @Override - public ByteBufAllocator getAsByteBufAllocator() { - return delegate.getAsByteBufAllocator(); - } - - @Override - public BufferAllocator newChildAllocator(String name, long initReservation, long maxAllocation) { - return delegate.newChildAllocator(name, initReservation, maxAllocation); - } - - @Override - public void close() { - delegate.close(); - } - - @Override - public long getAllocatedMemory() { - return delegate.getAllocatedMemory(); - } - - @Override - public void setLimit(long newLimit) { - delegate.setLimit(newLimit); - } - - @Override - public long getLimit() { - return delegate.getLimit(); - } - - @Override - public long getPeakMemoryAllocation() { - return delegate.getPeakMemoryAllocation(); - } - - @Override - public AllocationReservation newReservation() { - return delegate.newReservation(); - } - - @Override - public ArrowBuf getEmpty() { - return delegate.getEmpty(); - } - - @Override - public String getName() { - return delegate.getName(); - } - - @Override - public boolean isOverLimit() { - return delegate.isOverLimit(); - } - - @Override - public String toVerboseString() { - return delegate.toVerboseString(); - } - - @Override - public void assertOpen() { - delegate.assertOpen(); - }} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java new file mode 100644 index 00000000000..f775f1d2d67 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/DirtyRootAllocator.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferManager; +import org.apache.arrow.memory.RootAllocator; + +import io.netty.buffer.ArrowBuf; + +/** + * Root allocator that returns buffers pre-filled with a given value.
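+ * The fill byte is supplied at construction time and written into every buffer handed out.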
+ * Useful for testing if value vectors are properly zeroing their buffers. + */ +public class DirtyRootAllocator extends RootAllocator { + + private final byte fillValue; + + public DirtyRootAllocator(final long limit, final byte fillValue) { + super(limit); + this.fillValue = fillValue; + } + + @Override + public ArrowBuf buffer(int size) { + return buffer(size, null); + } + + @Override + public ArrowBuf buffer(int size, BufferManager manager) { + ArrowBuf buffer = super.buffer(size, manager); + // contaminate the buffer + for (int i = 0; i < buffer.capacity(); i++) { + buffer.setByte(i, fillValue); + } + + return buffer; + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index 8f19b3191ba..e4d28c3f88c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -20,7 +20,6 @@ import static org.junit.Assert.assertEquals; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.holders.NullableUInt4Holder; import org.apache.arrow.vector.holders.UInt4Holder; @@ -37,7 +36,7 @@ public class TestUnionVector { @Before public void init() { - allocator = new RootAllocator(Long.MAX_VALUE); + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); } @After @@ -49,15 +48,13 @@ public void terminate() throws Exception { public void testUnionVector() throws Exception { final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final BufferAllocator alloc = new DirtyBufferAllocator(allocator, (byte) 100); - - UnionVector unionVector = new UnionVector(field, alloc, null); - final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); uInt4Holder.value = 100; uInt4Holder.isSet = 1; - try { + try (UnionVector unionVector = new UnionVector(field, allocator, null)) { + unionVector.allocateNew(); + // write some data final UnionVector.Mutator mutator = unionVector.getMutator(); mutator.setType(0, Types.MinorType.UINT4); @@ -79,9 +76,6 @@ public void testUnionVector() throws Exception { assertEquals(100, accessor.getObject(2)); assertEquals(true, accessor.isNull(3)); - - } finally { - unionVector.clear(); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java new file mode 100644 index 00000000000..4c24444d81d --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex.impl; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.DirtyRootAllocator; +import org.apache.arrow.vector.complex.AbstractMapVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.types.MaterializedField; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestPromotableWriter { + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testPromoteToUnion() throws Exception { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + + try (final AbstractMapVector container = new MapVector(field, allocator, null); + final MapVector v = container.addOrGet("test", MapVector.TYPE, MapVector.class); + final PromotableWriter writer = new PromotableWriter(v, container)) { + + container.allocateNew(); + + writer.start(); + + writer.setPosition(0); + writer.bit("A").writeBit(0); + + writer.setPosition(1); + writer.bit("A").writeBit(1); + + writer.setPosition(2); + writer.integer("A").writeInt(10); + + // we don't write anything in 3 + + writer.setPosition(4); + writer.integer("A").writeInt(100); + + writer.end(); + + container.getMutator().setValueCount(5); + + final UnionVector uv = v.getChild("A", UnionVector.class); + final UnionVector.Accessor accessor = uv.getAccessor(); + + assertFalse("0 shouldn't be null", accessor.isNull(0)); + assertEquals(false, accessor.getObject(0)); + + assertFalse("1 shouldn't be null", accessor.isNull(1)); + assertEquals(true, accessor.getObject(1)); + + assertFalse("2 shouldn't be null", accessor.isNull(2)); + assertEquals(10, accessor.getObject(2)); + + assertTrue("3 should be null", accessor.isNull(3)); + + assertFalse("4 shouldn't be null", accessor.isNull(4)); + assertEquals(100, accessor.getObject(4)); + } + } +} From 2742d37cc3f890ffd68ba46920240c18ae5528ae Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 12 Aug 2016 15:58:20 -0700 Subject: [PATCH 107/210] ARROW-254: remove Bit type as it is redundant with Boolean The only use of Bit is for the nullability (or validity) vector which is best understood as a boolean type. We should remove it as it is not used. 
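Since nullness is tracked by each vector's separate validity bitmap and genuine one-bit values are exactly what Boolean describes, dropping Bit leaves nothing in the format unexpressible.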
Author: Julien Le Dem Closes #116 from julienledem/arrow_254_remove_bit_type and squashes the following commits: 1cada12 [Julien Le Dem] ARROW-254: remove Bit type --- cpp/src/arrow/ipc/metadata-internal.cc | 2 -- format/Message.fbs | 4 ---- 2 files changed, 6 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1b1d50f96ea..5c439120b17 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -99,8 +99,6 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, return Status::Invalid("Type metadata cannot be none"); case flatbuf::Type_Int: return IntFromFlatbuffer(static_cast(type_data), out); - case flatbuf::Type_Bit: - return Status::NotImplemented("Type is not implemented"); case flatbuf::Type_FloatingPoint: return FloatFromFlatuffer( static_cast(type_data), out); diff --git a/format/Message.fbs b/format/Message.fbs index fc849eedf79..e0a956c3b25 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -20,9 +20,6 @@ table Union { mode: UnionMode; } -table Bit { -} - table Int { bitWidth: int; // 1 to 64 is_signed: bool; @@ -62,7 +59,6 @@ table JSONScalar { union Type { Int, - Bit, FloatingPoint, Binary, Utf8, From dc01f099d966b92f4de7679b4a1caf97c363e08e Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 12 Aug 2016 16:00:18 -0700 Subject: [PATCH 108/210] ARROW-253: restrict ints to 8, 16, 32, or 64 bits in V1 Author: Julien Le Dem Closes #115 from julienledem/arrow_253_int_8_16_32_64 and squashes the following commits: d8df119 [Julien Le Dem] ARROW-253: restrict ints to 8, 16, 32, or 64 bits in V1 --- cpp/src/arrow/ipc/metadata-internal.cc | 9 ++++----- format/Message.fbs | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 5c439120b17..e6b47de70ed 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -55,12 +55,12 @@ const std::shared_ptr DOUBLE = std::make_shared(); static Status IntFromFlatbuffer( const flatbuf::Int* int_data, std::shared_ptr* out) { - if (int_data->bitWidth() % 8 != 0) { - return Status::NotImplemented("Integers not in cstdint are not implemented"); - } if (int_data->bitWidth() > 64) { return Status::NotImplemented("Integers with more than 64 bits not implemented"); } + if (int_data->bitWidth() < 8) { + return Status::NotImplemented("Integers with less than 8 bits not implemented"); + } switch (int_data->bitWidth()) { case 8: @@ -76,8 +76,7 @@ static Status IntFromFlatbuffer( *out = int_data->is_signed() ? 
INT64 : UINT64; break; default: - *out = nullptr; - break; + return Status::NotImplemented("Integers not in cstdint are not implemented"); } return Status::OK(); } diff --git a/format/Message.fbs b/format/Message.fbs index e0a956c3b25..6a351b9dbf0 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -21,7 +21,7 @@ table Union { } table Int { - bitWidth: int; // 1 to 64 + bitWidth: int; // restricted to 8, 16, 32, and 64 in v1 is_signed: bool; } From e8724f8379324c59d285d2380005577a49290c42 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sat, 13 Aug 2016 13:50:02 +0900 Subject: [PATCH 109/210] ARROW-260: Fix flaky oversized tests - Limit max allocation bytes for a vector to 1 MB - Remove System.setProperty() in TestValueVector - Move tests which test OversizedAllocationException for ValueVector into a separate class and add a disclaimer - Add a comment for the new test This closes #118. --- java/pom.xml | 3 + ...TestOversizedAllocationForValueVector.java | 137 ++++++++++++++++++ .../apache/arrow/vector/TestValueVector.java | 131 +---------------- 3 files changed, 145 insertions(+), 126 deletions(-) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java diff --git a/java/pom.xml b/java/pom.xml index ea42894fda2..71f59caf279 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -303,6 +303,9 @@ ${project.build.directory} + + -Darrow.vector.max_allocation_bytes=1048576 diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java new file mode 100644 index 00000000000..4dee86c9d59 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.holders.UInt4Holder; +import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.util.OversizedAllocationException; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * This class tests that OversizedAllocationException occurs when an oversized memory allocation is requested for a vector. + * Typically, Arrow allows allocations of up to Integer.MAX_VALUE bytes, but allocations that large might cause OOM in tests. + * Thus, the max allocation size is limited to 1 MB in this class. Please see the surefire option in pom.xml.
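+ * Each test first allocates close to that limit and then calls reAlloc() until the requested buffer would + * exceed it; the final reAlloc() is expected to fail with OversizedAllocationException.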
+ */ +public class TestOversizedAllocationForValueVector { + + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test(expected = OversizedAllocationException.class) + public void testFixedVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final UInt4Vector vector = new UInt4Vector(field, allocator); + // edge case 1: buffer size = max value capacity + final int expectedValueCapacity = BaseValueVector.MAX_ALLOCATION_SIZE / 4; + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common case: value count < max value capacity + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 8); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this should throw an IOOB + } finally { + vector.close(); + } + } + + @Test(expected = OversizedAllocationException.class) + public void testBitVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final BitVector vector = new BitVector(field, allocator); + // edge case 1: buffer size ~ max value capacity + final int expectedValueCapacity = 1 << 29; + try { + vector.allocateNew(expectedValueCapacity); + assertEquals(expectedValueCapacity, vector.getValueCapacity()); + vector.reAlloc(); + assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(expectedValueCapacity); + for (int i=0; i<3;i++) { + vector.reAlloc(); // expand buffer size + } + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // buffer size ~ max allocation + assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); + vector.reAlloc(); // overflow + } finally { + vector.close(); + } + } + + + @Test(expected = OversizedAllocationException.class) + public void testVariableVectorReallocation() { + final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); + final VarCharVector vector = new VarCharVector(field, allocator); + // edge case 1: value count = MAX_VALUE_ALLOCATION + final int expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; + final int expectedOffsetSize = 10; + try { + vector.allocateNew(expectedAllocationInBytes, 10); + assertTrue(expectedOffsetSize <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes <= vector.getBuffer().capacity()); + vector.reAlloc(); + assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); + assertTrue(expectedAllocationInBytes * 2 <= vector.getBuffer().capacity()); + } finally { + vector.close(); + } + + // common: value count < MAX_VALUE_ALLOCATION + try { + vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); + vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION + vector.reAlloc(); // this tests if it overflows + } finally { + vector.close(); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java 
index b5c4509c8b5..ce091ab1ed0 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -17,29 +17,13 @@ */ package org.apache.arrow.vector; -import static org.junit.Assert.assertArrayEquals; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - -import java.nio.charset.Charset; - import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.RepeatedListVector; import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.holders.BitHolder; -import org.apache.arrow.vector.holders.IntHolder; -import org.apache.arrow.vector.holders.NullableFloat4Holder; -import org.apache.arrow.vector.holders.NullableUInt4Holder; -import org.apache.arrow.vector.holders.NullableVar16CharHolder; -import org.apache.arrow.vector.holders.NullableVarCharHolder; -import org.apache.arrow.vector.holders.RepeatedFloat4Holder; -import org.apache.arrow.vector.holders.RepeatedIntHolder; -import org.apache.arrow.vector.holders.RepeatedVarBinaryHolder; -import org.apache.arrow.vector.holders.UInt4Holder; -import org.apache.arrow.vector.holders.VarCharHolder; +import org.apache.arrow.vector.holders.*; import org.apache.arrow.vector.types.MaterializedField; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; @@ -47,40 +31,19 @@ import org.apache.arrow.vector.util.OversizedAllocationException; import org.junit.After; import org.junit.Before; -import org.junit.Rule; import org.junit.Test; -import org.junit.rules.ExternalResource; + +import java.nio.charset.Charset; + +import static org.junit.Assert.*; public class TestValueVector { - //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestValueVector.class); private final static String EMPTY_SCHEMA_PATH = ""; private BufferAllocator allocator; - // Rule to adjust MAX_ALLOCATION_SIZE and restore it back after the tests - @Rule - public final ExternalResource rule = new ExternalResource() { - private final String systemValue = System.getProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY); - private final String testValue = Long.toString(32*1024*1024); - - @Override - protected void before() throws Throwable { - System.setProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY, testValue); - } - - @Override - protected void after() { - if (systemValue != null) { - System.setProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY, systemValue); - } - else { - System.clearProperty(BaseValueVector.MAX_ALLOCATION_SIZE_PROPERTY); - } - } - }; - @Before public void init() { allocator = new RootAllocator(Long.MAX_VALUE); @@ -96,90 +59,6 @@ public void terminate() throws Exception { allocator.close(); } - @Test(expected = OversizedAllocationException.class) - public void testFixedVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final UInt4Vector vector = new UInt4Vector(field, allocator); - // edge case 1: buffer size = max value capacity - final int expectedValueCapacity = BaseValueVector.MAX_ALLOCATION_SIZE / 4; - try { - vector.allocateNew(expectedValueCapacity); - assertEquals(expectedValueCapacity, vector.getValueCapacity()); - vector.reAlloc(); - assertEquals(expectedValueCapacity * 2, 
vector.getValueCapacity()); - } finally { - vector.close(); - } - - // common case: value count < max value capacity - try { - vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 8); - vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION - vector.reAlloc(); // this should throw an IOOB - } finally { - vector.close(); - } - } - - @Test(expected = OversizedAllocationException.class) - public void testBitVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final BitVector vector = new BitVector(field, allocator); - // edge case 1: buffer size ~ max value capacity - final int expectedValueCapacity = 1 << 29; - try { - vector.allocateNew(expectedValueCapacity); - assertEquals(expectedValueCapacity, vector.getValueCapacity()); - vector.reAlloc(); - assertEquals(expectedValueCapacity * 2, vector.getValueCapacity()); - } finally { - vector.close(); - } - - // common: value count < MAX_VALUE_ALLOCATION - try { - vector.allocateNew(expectedValueCapacity); - for (int i=0; i<3;i++) { - vector.reAlloc(); // expand buffer size - } - assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); - vector.reAlloc(); // buffer size ~ max allocation - assertEquals(Integer.MAX_VALUE, vector.getValueCapacity()); - vector.reAlloc(); // overflow - } finally { - vector.close(); - } - } - - - @Test(expected = OversizedAllocationException.class) - public void testVariableVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final VarCharVector vector = new VarCharVector(field, allocator); - // edge case 1: value count = MAX_VALUE_ALLOCATION - final int expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; - final int expectedOffsetSize = 10; - try { - vector.allocateNew(expectedAllocationInBytes, 10); - assertTrue(expectedOffsetSize <= vector.getValueCapacity()); - assertTrue(expectedAllocationInBytes <= vector.getBuffer().capacity()); - vector.reAlloc(); - assertTrue(expectedOffsetSize * 2 <= vector.getValueCapacity()); - assertTrue(expectedAllocationInBytes * 2 <= vector.getBuffer().capacity()); - } finally { - vector.close(); - } - - // common: value count < MAX_VALUE_ALLOCATION - try { - vector.allocateNew(BaseValueVector.MAX_ALLOCATION_SIZE / 2, 0); - vector.reAlloc(); // value allocation reaches to MAX_VALUE_ALLOCATION - vector.reAlloc(); // this tests if it overflows - } finally { - vector.close(); - } - } - @Test public void testFixedType() { final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); From 689cd270e923d4f3f15913843c2569b36e87c4db Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Mon, 15 Aug 2016 09:25:51 -0700 Subject: [PATCH 110/210] ARROW-245: add endianness to RecordBatch Author: Julien Le Dem Closes #113 from julienledem/arrow_245_endianness and squashes the following commits: e4cd749 [Julien Le Dem] fix linter error c727844 [Julien Le Dem] Fix NOTICE; typo; doc wording 88aaee3 [Julien Le Dem] move endianness to Schema e5f7355 [Julien Le Dem] clarifying big endian support 36caf3c [Julien Le Dem] autodetect endianness 7477de1 [Julien Le Dem] update Layout.md endianness; add image source file eea3edd [Julien Le Dem] update cpp to use the new field 9b56874 [Julien Le Dem] ARROW-245: add endianness to RecordBatch --- NOTICE.txt | 5 +++++ cpp/src/arrow/ipc/metadata-internal.cc | 20 ++++++++++++++++++-- format/Arrow.graffle | Bin 0 -> 3646 bytes format/Arrow.png | Bin 0 -> 86598 bytes 
format/Layout.md | 9 ++++++++- format/Message.fbs | 11 +++++++++++ 6 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 format/Arrow.graffle create mode 100644 format/Arrow.png diff --git a/NOTICE.txt b/NOTICE.txt index 0310c897cd7..a85101617ce 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -7,3 +7,8 @@ The Apache Software Foundation (http://www.apache.org/). This product includes software from the SFrame project (BSD, 3-clause). * Copyright (C) 2015 Dato, Inc. * Copyright (c) 2009 Carnegie Mellon University. + +This product includes software from the Numpy project (BSD-new) + https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910 + * Copyright (c) 1995, 1996, 1997 Jim Hugunin, hugunin@mit.edu + * Copyright (c) 2005 Travis E. Oliphant oliphant@ee.byu.edu Brigham Young University diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index e6b47de70ed..1d3edf0117f 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -243,6 +243,17 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* // Implement MessageBuilder +// will return the endianness of the system we are running on +// based on the NUMPY_API function. See NOTICE.txt +flatbuf::Endianness endianness() { + union { + uint32_t i; + char c[4]; + } bint = {0x01020304}; + + return bint.c[0] == 1 ? flatbuf::Endianness_Big : flatbuf::Endianness_Little; +} + Status MessageBuilder::SetSchema(const Schema* schema) { header_type_ = flatbuf::MessageHeader_Schema; @@ -254,7 +265,11 @@ Status MessageBuilder::SetSchema(const Schema* schema) { field_offsets.push_back(offset); } - header_ = flatbuf::CreateSchema(fbb_, fbb_.CreateVector(field_offsets)).Union(); + header_ = flatbuf::CreateSchema( + fbb_, + endianness(), + fbb_.CreateVector(field_offsets)) + .Union(); body_length_ = 0; return Status::OK(); } @@ -263,7 +278,8 @@ Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length, const std::vector& nodes, const std::vector& buffers) { header_type_ = flatbuf::MessageHeader_RecordBatch; - header_ = flatbuf::CreateRecordBatch(fbb_, length, fbb_.CreateVectorOfStructs(nodes), + header_ = flatbuf::CreateRecordBatch(fbb_, length, + fbb_.CreateVectorOfStructs(nodes), + fbb_.CreateVectorOfStructs(buffers)) .Union(); body_length_ = body_length; diff --git a/format/Arrow.graffle b/format/Arrow.graffle new file mode 100644 index 0000000000000000000000000000000000000000..453e85025d8d324310e27daa3692a98e3bceb332 GIT binary patch literal 3646 [binary patch data for format/Arrow.graffle (3646 bytes) and format/Arrow.png (86598 bytes) omitted]
zQ*pV^1Uf)lBXo2Loiag3Y}>CQa^eAzX{nO|k*2R2IhsUHsX86nCt-y9gAMp`5ik22 z^Mx-Lx5hK@;u1h1FaeaIt{PCP1XP&-)gTV-iwRb@ieA!A-;9XvihS|f6xL{6zD>7tHRUF4|t8j%Aea!S?bP#*_( zmBo%YkZNt~tg0=sQzq=Z_85+MO~i2{8<<+ijD?z94UhcQV8gaYp34-&c-?g^9g`}9 z7)sVLRd=#`yiAnzN;Tf_s|Jq@S|}BGPB+pYwQ2N`x=}Blnp6G-mEC`KqhAz5`}f*< zBpIW-JVsY73aPkP?5k?n1wpG73Tkcg*1q|Bk*y2ER9{oiBW{D@KJ(qxhOB}o3|U3M ztO2MS$gmB=)>UM2GqB=W3B%k`RPc1&*J~QhbO|GgKaQIP(q6Wn$D&sxjL~_&uCXYJ zLh#9hYkX!=d5pW+==2d+fSP6?&9FJsST z59@RuQrE%hQO^yCzqq^p$nlv6)^yXzPI?*DE~1^a9aMxY!Pa)}kdKw`q*>-oAfzWl zp*i=9)0fEGSl|t8a4XFb|2=$7Nc3|26J3Se3o(<~D4UzR4OTs%mH{v2iXh_vs^WB% zs&NihkzUhP+qNLE4ckI?yi%`OFWs|8PvzM|>Df!q{@Hu>9ZEbmlz#nF`SnP9j9(8| zHTf3b=jJTk;m6Ovy+GdEq>iud^g&MR}9dOGM^Xt<2l{B|v z=hwv%magxUO=&;E^F`KUJzq;R3Ow7)QBcs(#hFZHc7DsV@{HsggqPG1v>yYn(!aBB zF3gvzHbW)0El*u5!>MbMM^?cjYkOy3j74bbYhQR?=1)kHC=W?UZ(}tqv9>qz#pfm= zU~VEfWdTRTYDq_<8XTfM^?bds=e5n(ql%qoqya5Ek)^%ti7fF$M7zC+_#Ny-ECBf# z{HD_*ecalxQsZqPYttjI#PbN6iC-?iUWm(ZvvQrU#H)$>GB~(9#F5k6z+Nuu?kOHi zjw{pA2u*U^ z>eu0Vd{99Gxe~Y8(K~o_5SnS_hqcui^_FK94q8t*ri~q5U0a{O3*|{VrSf;*kubzv zauRnARorKROnc1;lE>wg5kVWkPMf)RXT9k@z6|Lxx+Zm~wrf#aXZA)5_w6c0y z@tI5H#>kCEheQ3bQ&JGb?{%F2s@*DthnHq=qxc)g9MfxXP6@TQRktW4Cc3DaaiES@p)?n+`O>YOvknow51MxoXeYly4fMr#E=q8?{Sw|h)nXIGiRAd`*;vu zvwO&=+Qd2xsY}8$=G>4;!a_u3q(9J*wo??&DJC8hTykccpsg6#3I6+B@JM(OB*=-w zocMnT8C0hc3isj^ zPB%9_>X7h$YJ8yNc4@Zr432BdhS7YJJMd{AM>0zKpyC~KX@_?iLhDAR zrN@~f0F7N>?Lwd+xQC2ARa1TzHJ2+Z&M1S`(e1ZGj}l)wBT9qxXA_;Xh}{2iE&`rY zG*!XH`InjgcBCfZ(^4o*PHq?Mwc(h;X^{Vu%N?(=$m`;*8Afl1mwSm-lx+OcA))! z`1@(|xOdRIcn@^g{`l5&x<~xGd+{EA>a|ZkcnAFc(XoDgjoyC#^zj@VoPIn$Xny80 zufKY?M{gFZe?~gd->DwKD2wvH8|In~R+0TQtHup+;S+)BIo`k(!S03c-7N?E6}vs~ zXxG1o6T(HC1MKj`6XBTk#miyF^q)9#oRP0-l;@=*?0>=Gl>3mR=yU%xRzvugiPLr9 zMD)vje6BdL=$5hXkO!uH@6*pik`BfeJpKES9QzRs7FGL)V)rl4-XE#Yhi&sY_GlOc zye8tBOigKyw=HfTe;r_-gYD0xiZk9-+9QruDT1tHmyUP&VAs$`vq=2ff@pfQJMqz{ z7D3Ye*^~QMLoe+oa{|11+IZRR_?1xk=SHC5$vNbj9&Voyf$bU31nIM$%&5b>BHbzK z%GLs`#noUVL)3^?Pz#?J>(BatGvx@8515Za!$Kw~ST`xa#!m&g7yXpwp&VzVp97DB QCyMI|W2akVcRaknWUDrBOf-e)sl$ zp7(pccZ~DjIpdsh#_JfuX6?1^b+0?-yyi8pxuP}H6tFSKFc1(Bu%9T(Y9Sz?SRf!E zwV*-35qDb1XYdQrQ%gY_aeja`82o|mrfBGifPhH=|Br}}nN0#tn0C-HfEuW(2wS;2 zvs+lZTH3JtI=g|h5fDUug~3N>8>j`fuk$k(Phnp%+COIqgU|589JJJbPJud!(Hf{~ zP|LV_*iiGa^Rjc&iepeyQ;T|7+X`#R%Kvpa_)Cn|9tw36=HT%0@nQGjVR!YgHgKo-|fiScv^WlxIrCUU8v#hT3EVzLB(ik;WzsC zzkl@;>R|gncXILk>#@KCa=^dg;9}?G_;=gjQc?I(VHsCvHxC<6Ptd+NpXi@6|JSkq zxzE4O*RXemx`I*gaIkvf0=4k~7eg)Jy%XpD>+1jOGybzJbq@y{@YH{<=KAaEe;)hm zd{GYgga2b7{x!;fj)Gwp#}MWC_mqia)NF7sBOpj3Jdu^s@kRW;?2|vGb6evssUe9{ zTVSdG{T-1Mik#*m-pvU#jqVZCt8%G}3H`v8meW>H3vv*ggTaV=93L z?4KlVO=U$0@(M^B?9H{OJgqzZCjUyDS+(hG>Nxg!4~Z{pZ?z`A(9M zJf#2In$I7v{Y?3#X>9Gs0ArpkEsx!ZvF}=V_s!vQtHN0hyj!mrI6XG|JJepsb2M+h zx;&%`IQ}$V=-Tm{|6|#+(xPR*{qoX6>7c8vGz*2Q_MZ#(w%v?}tw-YTO{`zak>#cc z{c#qAQef!tl9t0DDfFE7WBFxG933~O7c0S3?OxJwpM(%{>E|)?&QK-->!sWuCl&G* z44DRB9@)JXgjQcdGd|wKwhle8$Yq*P9#?UAJh6-d`^9fxQ0>u7oSk6 zYIse!40QH5yqhO{KAk?+Kn|e|)|QVW`9$pJ-nZd8TV>KM?!h>;h;(1H#@4V%ekI@x z`-~?$zia;sYuyl7pCe{^dsF2E71n~`wu0)n;KpJ1`hj6!kfpHyIjSn++APO7db07k z(P1*8@#YczC*)f6E@aji@12@%oYwW~2V7FD1+$!!zTN9ZAN%X#Xq4B4hTrvTHe|?9>nt@nk}62h)o0EE^-Z*3(i&)VpquII zDXn}`0SQfB%APH=~UoROL@@cTI4)$>8En-Qus#|Py7w`G<4G*H153o zu6VA|A|?HPI(TZ2U@7DZ*&hmzNufqs+Z`y=YyA=lyhvn{<3ky%_tfE$+9{2Er_nkN*eD`D`VHFCIrvS6hMU=dvB`gi zlIbCRUIOe0G9pU?KAG=cNm`RXCXs!Hj3D*tTH!%yUeNjp&i(4#pkEH#+GfcJh|7A~ zu_=wlJL7J!F@%tNRff%Rwzj(?nONltl0-=C6lrEbe%oglX}!iBr#dkK(p%vgTq*l+ z=&1ubw>St&bN%<78<+L!)QaRuV`^`IeT(G1)diaigGQ3icO--=@Enh?_>Fqr2t&iX zZPxLw$TRjc^q8|XU~N7JisX-kJ44RGf6(Fd1PO4#7*D#&!-?@&&CWldi 
zuiM8?X|r3dP%KOwNhy4m-097O=jZ7X1Zk~Be2>i;E;C4YY_SNCF{4<6Z%#gE-^oCN z_@bcin$~01Due%J9yrY7@-MetG(%2_Ru|HCgp8|}eK+EvDw4}O>|z^S5XUMlfA)lC zhJ>TOWM=IqPPq9R5xX#!Mrt!q;#)9sX$~hwUi#{}JkTvrMb}+I{{g>At)!eK1)?T? zwYih~oZZ-Q@*TDH^&&8I$KCf;6jZV7kI0<4WVi}>f!BsXQ1GdJ2c7xtct&)dh3?t~ z$YiK%oD+_fs#B>&>yP-kXNs+b5wJI{hGLbDL3f_;E1BH8mEU>tMo2e;1QBVTJBZD* z&>L6f45K2<;np`tjh6nr5e;~R^je)b3h zJZ*xEBaRJ%V0-p7f9|k<}@0?=?&y{{j6oDh-NM$T!#m zUkebB{kmnD?oIL{=da78liqsxCw5eN3ZLzVz1VZHEiXx3*UQ^SKl*sh9^Wr5jvSn^ zIz(kd4*9s|+@IQFTR>!#9C7bK%`rdJ6nEI?k8{tzgz!zn0^xS;B^VHuyI&>yTcmiZ)_(3K%A=5a)%>DQ@d97dMS@)}{Y3jzv^*Rxfe)LcLX0glSZwui zeR>eV-um%!9g~tGTqIata?X3vxp`@GfXFF));7zz4;c}Ya_}X$juCb&H{r&e-p@Sv zlyD+C9!(PFm7eor&v|@dlSvF5pW)8VZ}Qx>Rup(VHCGr)#CIp7%{mgfr&f?n&y&qa ziUvCXUa}4e6{nbngsr)-53)HFoms-`j8^Kj;OuVuCV|9+T)sqA;HBRWPX6J7NJ^y~ zG=Jw}t8gZ&{UWYm%JFU5A$oQSHf%vcZo=p&ba#u;k%Fv6rVN5Vf$}558cHejljVBI z7L#P!UMXyQm2X+LL4lqkn~G^rP&y=u!B_Et_-;&%Mwxk%GmbPBhjEcL1vGnM!d1LtB_CMkJsiI9G5tU(d#6kdd* zgBYr-Z<5kjaOTF~Yr`b3xM~JBhWU^%jk~MS;P=rBwNqG(O#VNL{0V*fpMS4K^_e|t zJHv5x2vMJcEnIu$N~6nL3#y-<=k8p&e!OBJ)k+^rTm`n0>}Sq6_Rp2zySfqKSaakZ z3o?ZpVBL9f`?VOl_rstAVNLgbV@@nlL&Zqcme1oPm%{1?VXH;M+eLuh6ef7IrdB9kc#3R48~%i(d)Z(jh6dodR`QF_~4_<4yNyK1DO; z_NzI!|Cm{G4lqXAc8m7wr-=-6sqkZwt3<4Fl2Kw8BMlW1);(#8BdcLu?|OM#S@>Dj zxL2zVmcd%sEH#ysa;_t3CPM95#@x9G#*GCf?58&Uwp>;@XCI9Po03(e`3C$GWVpdNFl)<~KS8K>}Od$+9=hMxc_u&%RN5R#RbK$A5@47qla?-<{vles8^;Dj})=Xn-AmFq=2F2q!q zP&!3)&pB%WRrHP35-YomL(QNr-!z8(tVTXt>XU|`!of{RgE;O7$5~y7>xVfr2D0XR zr!~rH>|ysvreNFXymn|G+6Ql_Q*}>nWK*WsYB+)AkOs#5QIF50F72wW+!_u&pAM<| z%f6}}w6a;HGn0k)GgqB&-E5eGMuG}sTn~vKYj(OS2|AwhNg~yb{S0|~z1-r}y8pIf z-^3;#UjqG#r7o{5;7Mj81Vg6<#uch05t`a@4)6Ofk}+lhXM)0?DB)j?2hJ`CbWdf> z5m}@`b2prtDX?3!hk~3gufEmil^OS{RKpC8XWjHQI6liFtnDkkWqwN^7@iY<(^**J_d=T)L z*{60oq1gO$)TO#A@BZ1W903UFcpU|bC3`W?4;C*{jaW=30(2DLyK{Nj5`8F>?^_swnt??)QL2Gu^fEw66U9%)Q$HtCtyJi9Y;!a&d0ZcL z-07PI60!YSi=@I4-p#j3Gid}9bH;;Sc~ zD}fi`jik457?$$SI%nD0zUeUv>4BGCQewz@>wyX?4e~@J0R7-RoXd zssP+W2QX`0uhqpfL@Zo-y9+aT>|9#*-YlgO;5C8O`bH%KUk-5wgtx7SEl#JK0|mHP zp>R**wL3=Rnc>oQB5#Si-0IV@7C}x7LgO5tuU7zRTBoRsUxgV3HjGSQvNxIUPnFAZ zp*hw>nIrB61BL)RNN5H!S_Bqpmv?hKqg$u*)#(?G(H;;yQ}DNsieEpgAD1VzElJT6 zUXP;dizDq0kS70r3ZOOVMDew98uq)9n0Fti^^EYE1visqq%BVib|BLDzkmIp)01Wj z@raOnU>0!lQh{)$A^Lj$*r(zmlY;R*^XEr(M}m?Z0?93PxR`;AWNpW-!jX+cN{H^80a@%29;Ki5HdzQ)d^VUO6s98RqE^fsvQ zm3PlFzy*qo=3{6!Y-?7xuM;93jBr;5xY{K1`jtx&RRG^5>$f z1EiDZ>Ue!STQy7=Q%5DAEZE@Xl}b{2hq~wq@*~DjTw={1usPZEsxuDst^lR}1`r03 ze{HZI55;{_f9+MJ7<*4ix`*beQd_>m*EEOIc?a-d+TTAh3a4YC|_vbmuL~XkM8@Wl4BAcQXhOMRIz`( zQlxO7jCxEwuL{`(pbdM?90DBz^V^0+m#p{66zO%lg|Tmj7!&#r1RfbXmX{RD^)Q!> zQngo#X^P}6IP7Q3EMsg>n#t5Bw;%!f2AV!PNvLyPL1>c8d}VuT*+?<}G8{n!+i0 z?Z+~mss?CT7<#m4a$Oc1*m(ACsY_a*XVIuh2CxbM8V;?$0prYMX<}HKfCyV|khWOk5hD4yv&E(VgS5-(pMQr&Mo9bSw z>I8XvaV8p&<24h_QFtLmJYFFYeWhFkE3=Z`9t*ZeM*pVuId}UWFK_-VtV~EUnc60F zSQ8UQf>;oFTv+f2x(y{Kk1sN)W@@xu%9CPPfDNx1DypU-%KxA=|61OW3poZ#j^Wg+ z93|hO&?=)zuBTZc}p~MDc}H;clF5OxC|OUm^VhD?|PLfcqY@6%#JCxnQ^N`(%d&hVOuS zCBzTo+(*1ldDYkW_uRw%rG_iwBH0w2fbOT4B!pf2`9G$|Yy1``d)(OZJKo_igmfUW z1gvi%SA9d96c8f20D{ae^XbJBi|5_wq(|mTU6dvg3F=F^wO-DUrA?kU)O5_ZeAU|$ z7YB>a>Qn=8gg9f?082C~2#Le#AA><@n*fhF!C@|Hc;MhKZ|xg?HX@R!IvEL}qvkF{ zJfM?S8kApsVM;XrVd0y+C)yKEQACRbd$k;bxPC>IuujwsY7Mr~DgroTIYfhX+WhRkUksm#W!}18-)yL)*tu{5o_Yw&~jDT#5{B%CzUq&t=2! 
z#`L*ni%>l@|1E?o6p9XZ^sCNQ$WqHBTIjm_g&G%NNcnTWetx~>?hklotYk%jFladE z?6W5-yCIasiWEwht1s?zzoC=DjS-uje*8!(LS3iiyL3j{B#Bf8RX%I%j=&>jrq?#i ztBfq*H*>z^G$z;-`}b=_43rT4m#cXsB{K|k>|ijtC8SbhG&J7h!XM(g_u$YY+dCQu z?L8K5(!pM7-bl9}EebN)+ioeB zBHwF#{Y&W(v!it)UK$r={A~jzrBEhjUb64%4I76z;xkDf$Sz>I$#mgHFVz!=K*fplhe2%_Gpzq&owgFcP)s{Z0tf*>XY2)>e$$J^&c34@b zGxAOkQ9^Ut-+CFql72%NBRxph=hHv}zLC9e`Non&+sPPHIvhufcRyI8;|1e1R^-c9 zbB#Z6=K%IC59ylRt8pO#6cvK2VGnk2GGozmT$Fq(Jcg*Lw!7PF3p~1Lk8~2Fxil+}ZJhe-hM@%ot1V=t9=Edchh3*ARrO2Q3{A{Kh72E*WcfqMTd z020w7xGSW9RQ6Q--Yk%le)f%VmGNzmOBG;Irx`WG+5_BeFW`)-P9y>!nwEFiEu9$v zE6`=5V)~!X#xS$P%SF!NIjSic^huEK5GT;V$S*;l^PzO>M6~LQLyxt0mnn{Le?syW zwtzUad6*ozSnr2EdGpA$v;Pe=2~V$}O6uZpc~ARwqifsAz4->$74g9&=Ntgi&y@$Y z5^MujU)A_uDqm9H^4Cs=^<8d$_Q?}QX+tA`QRE%LX*qa+v#AX7P#@`s6#y9Q&qA0U zC#`surRCTasnTq?VEtN^B;yr7wcnyjD9_vFFZT+<00g)Ky|rCcH+3hMzqiI#c@GVH z>;-}t<~sn`Xb|~Oe8>6ZMa63Gy(QsfW*nO19&hh3!4UOD z6!3e{MCmhZN9VC!SReHh8MlAYr-V$IAl@m|Y^4Fw4z&@Ya3i^Zi?LELGKeSw8op&! znL_H1R=q}qb~I9`a}g2AAjk?2y$A!bMQ<-R&atG6VJ?`8GH1RE)MMCEqJd=$o!Pi! zG^UaUG`X;Z`+iw2)~|gB!_;|RVgAMguu`l*0`lu|{Dnf28->rmAaMu;UB}4G48@Y$ z2u=s}LquaKMZ8(t=ecmVLBeqxCzrqD9DnG%gBNpSfb!M8U#6+ ze%~QaclSwhADZTgy^W;m<#8%8c7Z@o$Ei_v1p2vHM%6FL)|I6P%_5cSHk$T@36y13()gztG02zk(fhx0Pnb4KWU!@yS_2Q4`uC%zZjJgW&zv zgY)Mm2<%2=p5ayS%_cN0d07 zK_gmB5M8jQU|s@N>oa#v*)q1ZpS64<<#M%|M2l}fcl*F!P$z#4bcW2u9_P<`(WM2* zR%8hfhH0k$dmyDH)r;t|efRq)xX!gS$NTASM80l?9RJplQ-NPo54dUS<%{FYWCVf!=)Ht;j99SAQmO(^$tKHdfZ`#jz`7xN9e?Tro6c4!5sYu`Z!}|Z@yD~Mv z6#Vy!|Blt4HSyn7^8f7qu+RL4@kU=VgfOJf)0CMfutKsO?h9-d5hR2&w-zmd#Eey5Pcs92}@_p%>UF$ZWh8~Dp|)y|5c z7+d#0%VAKb(;wN!-<&n|uLhaMa6>>|oq&vl)4cP1Z!*J0QLm~3+?pwg0`VIj{f0hBx5_Jf>NlP0@mcl8Vv+dzoK(yh}$;D+j+DQzXRg@u1Twz2amtS4|LTe&@cpC@v0@AVK6_JcwW{^-7qF}#f2!wUWsmeG<8@m0 zu|S)F6QP1Dwxrwon})DWIl`x}kvK)0duSEBZ5*%qW?u9vd!3@VoxG zRXzd)a9ZTt)@Y=_bNGXtw>9m#ss>CoqvX#8y6E7s-{O>t6#e}qJt$=w=08FNqbC21 zLk7zCI?lR8<^P%Tf3>U_1F%Lhg~NsRzwh(T^Pk6yfe=)2@TIrJe>T<6 zY92@Z-&_1OW0E2NzaOYE-=E*-GqnFKk$%&B00MJ$-|uk$S&{H%s8K4p(HCzwoTU9&n7EX#&f%JC8Lq0N+8F zJPS_$+n)SuV9Q7VwjlB>V=en@ga6s(@E<4ujc90?^8Aw)`m=2<;ex%Pg7^7{JV23K!zvPOTmKq_V333 z>KBMycrS zi!O-ez=af=f-fH+0%F->$dw^z`0SgZd-GK#>oiSWrBwOma@Bt>>a9m0rPr1aS@LhU z`j>sl*8%of>Cut*e{K*a4zNVGyhK^!|Ff+Ue6Xrn{2KrG)_-pT;JW?LN$8lWH!K|f zwq1YQe5Mqb6;-Oz*8>&}{}6cO3(`YZKrEPQXqy3&J8kDE?~(txsWh~J!%j|KiGQp? 
z8$!xsO93Qcv1|=<3r^MTpYD;@J1;gozc>W0b;dZh4tQ}My#vsU=xj0re3!pfG2iTe znT=4S|C&Kn&(v?r#Ao@3#NCy8eQCDx6E`3$mG_)b75y=#{o)=rtp+*1b%$jO)|uel zoK<4HH;)P+E^Gl|6A1E+b@PKD;WqbStvhm*kE?@WbClzYv390ph&1NZ?VNz~0u(49 zn}Dv;ZZy|qx6-cv?&J00hSU3g+Snw9E!LhJVz#I6TK3;Q07^Rk@S)q*fpX@6iy}{* z2>h0^UvPV}!sV4nnBLln2(pL39c#RRIfU^oW{ih^n=e5Y#Rxi^^E<|KSSS&FJLim+>=Z<- zZRZopA_w)8a24L{4UpKlcpqeYj$}9~5in^)K-R3(VXGhv3;;^L%eNi_IxbtqzR9mU z06hA3d9pDoxDf~RBG0PA&s&A`X$g?Q(g?yQRLt%K>7jl(w$?zP?8MGGBNOxI9bX3m zT#x=Fn&=I;NU*3vC{RFVL(e6}3{ZX`PiF}Sb3&fmfLWLD+Shr%7QF&;YvYZsD-?C_ z?qZrX$&Y}U0S!ryLy z!hs&`VUTV572pIw2GkDZ1|6s0PWO?!iuxX}CdX?^LFYkY7bIN(&r1D^FDez^W_PBN zyrcXtac@_|8`Ugg7FHlzMiq^C3e?TzhJ)eTfEt(qnPSSO%J+jTK=-DQJOJcn!#qq+ zC2{FW6@%}tR|aUF?!$9N8cz|`0_(YVmzsG}UV}WaCdS#Mx`ZZ8Ml>L{6DF5IDvLzP zV5b?7ptZ#rybkyd>|ki$2=$h{yPVcHd+56wk{wpQ*#UHY;gw;c_k}ib^$H+e%%#~~ zq}*Gg*@rhz__$`)OeB^V2#>!J@lg|pV`kxLkZe(rOe$g{>mMeli0>ms>SPA+qPcGust}O9A zH?!m(mH>2|XNz-A)T>K;AhTigBx9Sk7D(B+czET_Eq|TwWmBZDwE20eQBUD-Y2u!J zyK#BXR)yf;Eqr%#=E>EUVl=ot=fgfQw*tnZ9qXHRvHc~#Dre>~&t5t#10=RlCXMrX zat@7u0izaPN5$NoaSVh7Aj^|2=YS?pj{NEb*o}%Vd7kV@o@@yRV!YFNpzH4L0iv|e z$QMF1iy+ya)Y#+ZC+IiC^_g?|y^XTVtltT53Jk?FDo%qS{+yJ*9wgBt&iF4FzcX|^ z+@t8^}p)!pmI0?F@}q&JFVsUQg6DsIzmL?snYO4vrZ1n$@!E z%H=gzrr-76l$jf!%B`qsdGMndF^`xv7-VBNrw}48YfA=HNG?NkyW>(P(lRSY=+kM`DX@FNt0AcUsRUk42S>}T09xup7% z2jh)y4N)m&S;)|%ned+?;o;l4p>aVPJb;Q=>t#sLY!`9!75X>gGmb3w?d>H!en_t< z)0)+&n9vHuRJ7A|<)awN>WRE>0~L#?hnH+S3A-lHbBm^)$}tr8ei(*JA{L7hgku`% z&3l->PCQ0!%~&HhxsJU8y7h=Jj2{W#T?2(e*!4{^31Y@)vw$z5J8)rb=Y|vI$I5IO z>GuyzeM69$S5)_pYLamg4hRjDuEUekxn)r{EbwO?jGOW7&G$eeijrR^yW?4{Ai2k5 zW=KmZ>{2hJ1?BA#eXllk5Q{WS^oHK;IaSx1=#>S|%RO@`x`r4u* z=;}%aVmcEl(RgFrOaWp=7E)18VXBcGpuh#Q0DA#PgVPkQW;oOSm<3Wx+_Cmlc$JE~ zNtWvAoHQ&XTj<;?XfgG+K8dOIY2M7@6Eo_^6d_65}7IC3*8c z5wca z{wb!%ErTEwP31T_asOiJXC!ubk5_JFolr1$zh!?2E`q?Fj`eCo-i_bsz?qgmE}HVy zmm)>A-Zi!zJIB}KpG6uU->6vE71UVrHTnIS%P*JiP#U9)?{5={W~l(59W-GbMH zSF&4_ew%oGn5*6D_SEMblE(@9ZKB3IZYaGJ5a*?VNI9MFJH60tSKjGx>0i6C12WII zk5e{Vaf!omWzrLFJsPP@T!0F}P{+|6O0;iCo)3-K`!_f486d>oqlc*NyMs&?G=JhaI;+IC2e7 z$cK&|6MZc-dpGpB1^XM`GL|xNFw{6%)bwtvfiWqS11+vR46`0I)4(6PskU_mNl$DelNe;< zom{7cFbK_HN48}gop1akyM7A^8~~|4UKj_p+3T@;JS^mR$n1H(UrGoYx271(GovU+ z)yfciQdIa?c6Vb*S)Sq(#GNMktHf;=tGM~e$H7!5XVHzY+8MLG7aYK%65~S4{j8zM zWRr578p?$yF+KJRCZAuQ9W$X|HVS2oWGW$;rq?*5`3drV(f9^HmRuGhFZX`YrIqcw z;C*M1HGKGH&k)v8K>OHzwe zgOq|2OC47YB6S_YY?SCoyV9ijhaow2C-Ca(ds^GW>ugM!D zO-m@SNX9ZEzYsMfX*yr(1vl(i&Wd*?305k;v`N64kBQXsaNfQs-u_xcL^A1gx%eTI zJEF%beu9fElXu%a`v}$6vKW+i3L3qrvnJa4E+G%S=R&CXa|2J1!@kL#--HVgR=O^dm(5fAEN~~9G&ch@Jap|lZtMZ0ipVTGM zunCaYs_0lT#H#GRYWOZGC4O5f1WHbUI+=i2fHz$rYeggdbcAtE#X##CjlMgA} zXx9ty(xT28w+J$uyzxtgj;^&NLz*$%ywjaBF_a{C=8QKYe&((9D16T_MK3ZT;kH(s zYDh&8QeAG6w01x%^wcERia16@?=oVyBW)7VQQjzhIp``&Ye4wJkVIa~5j`8ykS-*lare5hDSd0Du!M=C^ay}T%_%zzLFSXVm)>nl zR*9rM4AM~-hW$g(9W7#aAxaja*W^E9wDmR%0;i}DcN`b8nJE5#7l2x#junp4z`sC= zKeZJt3mZW}HQ!*TvkGsA_l~oyWQ+H~FA!pe^OEw*Lb_V9j0lpOSuTX z_6P_e%YFy%b_p#ELsxc%JJOB$G=pjC4fEB@INv`}$_rn+?ll{^FRESODjn+oEFtSg z?B^UEYsur+bcv$e*-wC)L@9OV)&(Ck-|R&V+swMfeO(eo!&dWxTs$w)q8yekWI2{D zy(L-akiH=dD!MzCe93!=p&zC%Y}P*EbztALjV~oo6)d!lM%`n5WT0W!J-e{iO_6&= z@PKjD7mL17H;j!th~~(& zs1o`Gp#&DjuxfV=r|d|rMWd~#79xDG@y+kjk3LL|rq;MjMh1pJNC-E5^_!VUUh*@i z4iNZ&lCYk_bU9H~fR)A6$8}joUCVIs)D2JfPHzyBvrEuW%DNVFY0H|UkuiKFiLfDXd)$YkM5HeQ1jB}W0Lf>I-5tNd~C@p;}#92Oy(A4>ScXJ1+#Sg z+-MVw{3ogV7i}+##tO=s;>^Az)q^Vwq-cN6=jgz{>z4+37end{*%0}7EbCv){na4D zd*h#J1VOpMe{`Jx0Gjq&mMFO~DC4=t{*E4>(-k7qbOH2xx7I+b$^WA7Kx>0=)r8-} z`#{|X$w1i^NKbgrUFu%`PsV9V9h~xe$ry<8mFULWpI^Y^eBMRxihmGBI;|l4;3&~` 
zXy%7W;n7b67sT372zk&uSCJf7w|&WKfP#WrwyivF8>IungM|;RfGFs`fuq^l@FL!m z#%7E#w)f-5mWs~uWbm3S(w9t|!62dU0_rHo8-QXluo9Zh zIsg}(AQ@5ZhlF_ymx7aW(+`H7WVr$kX4g5DV4g6s02LLR>>T(I>x|L?;ICmDWP~eA zY%Cg)2x7`dpga|;%;@ubjJyv()m9Do0t?1pKM%eHUW1}a?VUyc@4G-wi~aC1yowZF zh-cz?Hc8Gx23L;Xbk5SL>uXAeU=V?hu|OZNHh_%3(khVtwSwB;k2j}ZA_%Fo>Tt(H zmO;n}6yI4cEecuj*qk~5Nsh0#Ik^U@pQKDe-(6NETd z2aPN6lC^U~Noc-$o_NX&>nDJ8zYjqXTih%Numw7UY4%mavV}x9t)HU}_nrt%9NN(! zQvy~(Ek_t$1r0AA1WX?(zx#}#T~6T7dC|ecrDo3xy-I#SFZdq((kzm924QHWQKKoS z^M)60m+8QZe}T?@bKrG1#oI=p3NMh0C3aq@rKhQk2?`mz_YxH(0k!DeXNlWS zv!YuukGqBFg)vK7$*jKtQ5@10mG_*5anhz{_dCM=BCu;jEi-lBBg19)Rb+e8M-k*M ztNhPFKoA5f#c3`#U#mA^gIdYghW>!IQ^^tb0y6VFK%~XOR|Y6{{|&;~8=QxrhT#TH z3<%1l4uA~UmRaDetwj10D5q9AUjsqJ(QNe!pog9V01|EuG+6~1Vm=E)JNGU@fr1SI zn7`_A5T(eb!?K)R5*q;r1hSd6#1E>IuhwHZJ_0h#mrR{Aw9l7x`)u(Ejb~+q0s#VU zlh;m5h7mNBPKCAoeBz|>1|hN?EqE`D2KvYih*q^(T6hFOb>=#{qdL4S5U`s!Kz5HL zB1V{GzTOiJ=Rg$)#6g7clH9o-ubsrLCQ7`9?wyth)eacqEsU@_?5$C^_cqKIMD(jV z*%s;&K*x#u1Qa)53gr=GZ{^+rGSxf9SY^MZ6?+%3r}`{8;nu7jp6gBzB<}d#w0ujk7f7C#~N6Oxg%G9@W-)pT$5ucFDs6Us*_7wrz15q*!Oo$6I|_nV1$Smn&nv2x(m>85aJakh3tvES=%_)1j=xx~n!alS*~A{q!6B923$b}bXuXe)7 zBzYpRPfS$C5VBxaxclt&nsqEfe27pANP6>tfS$MzNr%N0uN~U~uE$j4UgO2LY)v!q zHZgcX-0)-cuk&TiJlRvhhAM8Q z9SjibOv{XoQRFjWpjY(4vCwSu-K)AMbXQ|0OIhR$WQ+MfQK?Ooc6RhTZ_zHlUujX> zg$R80eSvYM!-YMXV34<{dDz&k&ET@r4Uu~QPd%kheO4l!qc7WRdVeYd zy<~WsLi#;a|HY3*vsnNPL3Syo{Gj2j0?ZNe9ZS7Z+nX8d_VbHh$yYr;n^d>tT4ye4 zA*fUK@uL=*S$ zQ!>vv8TPYCXi6mN4R{Ah2n#{~t#y{mXTLeclnSOPaFFpK@71cBMe?$J^_#s%|Grai_5AzM&&eQK22S;4c-fK<4Wy@1 zmsfx0;$UuEzn-WP{}^SbI;BuT;}l?9=7XEp&HF(&8fR>n$a&ge3%IXf@OlI73G2Jm zLM>7bn=#}ns)MPOBTEL7pK<4wFO&)YXFj+gB1$3~ZCXV^fuqB){g@oSL z_QsqSlKp^^Y;Axn1g~*^P#T8meO}|Q6!^5qD;PuQ^A-Q^h}&By;I^P)K@}{uRU2cMWA>!WF!g!npW7@06k9lhofvPl?Bl zbIm3@lv)J6Y#b)keM51)9C)QZ7`RmK6~M+xK~^pWdRDh$iKGqr0N&HVJt3uIUR4+U zAnZ7Jd+4NiT&m3$aZktq8E4mA^l931;@|UiYMo_KErSCUASqdcs>&d#sg7qa5MH_8_liD>=^~VM4I+q= zdW_WU=D)xx)B)vn0(jWm*0P-d>ZIu5>k#{b%@*y%`SC$o2lVrAK)$-6WV?uG+4f~CBH%DQch{a>k$$RtaZs*^!kQ;{ct8J zUOeZM^3vQwtC|j=4V5LHn`c5Kc>J!W3)F*2<;E?LM1-yS_7VtI!XUZ~Nmtfi6uGho zxf26@`$+A^-ox;8%e-ats-c!$epw&yam%>pt`*DP0(lmHt>TCYDNCBo$Y2Rr+lJKF z%Hd3TL$gT6f#dfHXC2~fa4WTTWchcJg!Qp_nlCLvyR1?sTIhMp_l=}sN&5a*~}lJLg5Asp;z>#YUluHi@FY0xDo z_JDfjXgoLP!K5H&=DGmoANoPZQ8+bnWP=lVE2;v^vLo*(9SPiyRKrn(Syu})s+xI* z7|En(?=k;y-RL#CN!^1`q>vzqB5nBXCGxf#pXwJBM=@l zVq1RlRDA*&DaqCqOpDOQCMqwRhXOCHnZlekt*ZSrB;LZIp`oPj~2&hEXNUq|tz`4EODJ1~sBEbw8gZ45z{x z4L1qaYAMefbr|0~_{v9IfQV41f46@z@$~NaQ}GUZ#KpT^^i zP)(VHVsJZDjnJlZHIp~9=mYii`Xw9P*I`;=qW!sAy7mJs3c;l4v7X-`Xej=PNOa;$ z{b>kbX7Rz|B?czS@%nq-quE<(eqw*>Y|C1I)w6>QN^5RYN|>b89=Q2Z!G2e{QKp#~ zmWl!oAs!g;vc1vddVtZqowRZuRCxN$!3CX=X^{gj=b=hEXNbrUh$ZX1i7JsP%a!K0 zj-%J-#y?@x;Z6!CE(E||X=9>H===bv%LnXb&u5jF1qzj+_(RL$NRAN=_pstsI!NLI zdK6+ud|aE>B7j~qG{}3`c6U3=b=zI|w>_>IHG2)f>zd@XAVNeYBbiGVgk)Hg_?O#@ z7(AB&NngrvI|N&Qg~CqWq3B>Ij1xYN#&01D3?qQ7I6;i!_Mv?NtO=GNyVhz4B)9n1 zX-!8Q`9^k2W?U4XZ8eQ)JhB5pWRx*xjm2ltS)dS!*Z96sz9m{W(j?KNN;Qm{;VIT; z-8TX&Ad=Ml7H@OzSHJfrYvQ^4ZfX=i3@e^6zFLZ6AJj9x3CCpK_>CtDfD~4??1Kmk z(R}c36i~DsUmI_)pWf2`-cQ{v%okPHwd{1P&Q3iPCCj;iR2A|&Gi{iLMO6wgEq(2p zMX!F$>b`yG-TDviVTw5ap46aUC^_^ ziJv-gkSXLZcw_><&~IrfcjkP99Jha^dYfwio!2DYqS26t{#b4vX{8hD56yGn)iLdG z|D2pL(>@x`=Cm`I-s>q*h_Hx2`TZ7Dm4);=yr#C18tpru&OEzOP5Y4XCCe@UIkue= zZWh`a!2#yk)&!ChEwFnEYhPLXM2L|7gCz;HS68|>i;@Le_-WUf7n-lytv{BKd&DUx ztsiu-_uwq!f_-*6)%o|b>2kMu5VilV?Pwf6*W=Nf`+*9P@^NgyXjlpepHNzd`jsE( z5`N7Akb3sE6UAxH(<^XCIx}5P>Xty`$41U<~kd#aiF zi2JE<`l4!)q0apGsBp(z4XSLZW65t6A%(&K2Z;?ewC@)I5BPZd!ztdqLkeGdX|r(( z)UF6#AME|rXD7#?qp{DQ5m?b8P_;n1~BVYG-*j(TJo2Fx7hF^+q4@`r!u`wqV;o)Y)7OrGu`9c= 
zNkpYJdKZEKhW`Fpu;}V_eQ4)Vw%ZHTB@~a(e0T|daMprX%s9Tg3j(jZfr|HgWkW(j z!FC=L^YMOG_O>b_y|Y~tbTeM1MPQaff@KLe|%Cr)6PHQ(kUfC6BvmUmd! zs6%K~czi?4_=#m79iHL)OoLJ`KS(K>-`x$b3V;yKyHgL9NpREgf_kT1f>T918E2d9Kr!JspGLnlKr_N1(xp-pe z1-^XrxA5xE?P)_)B(I(4um0eh|M@W(`fZif#|?Pf!HWHx1WFo*1#S4T%Kz#AQr{xs z!bf@_N!4Hb954YNpkRu89lSU7&yV4+n&B7CJG1_evBX_CXp#cwhW`)Ea}0hl#%=Py z?=;aKh*fPGUq0Ub_mzq0Q}Bzw;Mw0F#jqTr=Qw}h|8d_S)xj_7!he#s|9ZP32q(%X z;eKHG|9&`o_=Wu=fHeQUMqq@p9oX1+pS;odZxR0d1H3f+;{Ur1KQ6}qOK-!KZTjAITJ64Ab_3pMJ>f@xFRM%-1EE3W5%@o-?9TpEsi+it;^ ziST2kpVuNh)%_V{Aidr1QhKEhIOAH`twX;4zt@(MH@=fLR0-jy5J;aZ5k{b|>2pvhV}Higna1l;b6=$d^R3{RRUFSeW9{-$|; z^)*28XNQ=M2g_7<$^$!Y)(J7T28u~Ar9UcT9lCSQIim6Q#~aH+ZyvA z=lmBg-=)L~+dol_U3>xPSH4UFXbfAy)KeZ>V1Kd%pkMgw)^8x;w}rUAl3B6V zB{qtF@#`P~zf<7Ou7k{E?CcHz8o@L4Xi)s0C_(VSAf79T|4ZY8Ywm;~ll8)z@`Q;l zw?oC=UZCnZ8>_9Ss&>g3(bFu~-5a1Cw46-bEA zCj3h~>3%<5t~?c%7D%CJzo*UXYe-JxJwgupK)sU<|ROV)N;@7n<*(&rt)_YSQ>);6wIYKeGSE>qgartUB*B@5@Wzhw&a(z}e0xV=H zvnuE-jUrYLQN$?pi6+lv3CDvWF8uyq?MT7c9$&$4EZf8lKOXzT@ZTR#hGgLE^tA|% znudLuN(Ea`w^ius*%eF?0dGFvsjmlquUjPA&~9IV4FkMhLlIv$(b z)4HeQEvI&0eBJl|vLxVVhq32IGy!|s>C$cviMeCoY1>!O0mh(9Pn z7a#xWyRqqx02gQ)tazAf`+?~S9hXy`>2kXVQ!@=C;eCbjy7~Vj_K6mhJ<=2kAH)^* zsZ2|Yr&EH{!&-{xOSv5ngv5-%gWdhiG0q3GrOU?^SbWT<5zp`?0*iYaq;Rxz3*QK zrMQ_M>H24D&bML)gjf%iVFFwu28~eS zL9=>3Xjtv^Wzb{1*8k~_{0y(`3tYWvGE~$zuYh0a0Y`uWq&73)vkN%9P zVFY##{QZ_tR6-;*umVJyc5hN$nu;-^M=BcQxP^-vooD$GU&r6~gs_My()NZV7jnj{ zbq6K$VGZicUb*w<58zMt-yLPgm7QEDlc)Nt$nhmS!)6N(xU-4mPs&*YmP#PUI zXQcb1k!IWbdzDw!rk+s)%Ig>K>P6IdH4^r@1P)jkw+Ql4f4_*hSsKOHam1P|kN?H( z%A(~fitc>{d(%K803X|{bQYoY4-1}-`1_vb+MvIJNFH`n9T`o><`I%n`wVFNApVxH zq)n5MRfgr|%j)xQ-j}GKYs}3Z+BYPseI;*vju*rIDnNyLNAjPz^W)lFq=9g$hV)vS zqgZcFZ4NY8Y`BJoftQ!@RrR}|A}^4NcvbE~y0s#HKtA{A-+3GMrWHAjZ>Gx@6tz=vPD=Enc~4j}g;QI>?5RzVa5p}0iJdhK0pTfTTBsaP
{_qU9|G**tg3Q-WTbn-oS(-;B zkOs@w>E1l`ig<)M13uc9MKi;SDM*FvX8JKjM05cz0QMXWgWvlXJ3QB*D=M{H;&tKg zHDwl$R#cpRb?2mNYN@$!y$CqqZf~#lOf6nR0@=osRu)M!Gzka-$QgA_EI6J|4fuM` zq2KH{28Axu?z2EA-gSBw8t!y*cv*IaS*f>0=6SVXE&zbxv$UshT=JVfO0fR^R?#ygm9AAx=VVj2$A*~cjsN$IrM zHTgpT`a?KV)Fv75pB2eJ5}6D3obIrt8=+%)?(4_%fS?rW;Vf<~mHoLmUT8Q`rXa(6 zR^y&w-99PU1@|fIuGmc<;oIsH!e)g6@0U=wL_l9W0*(rqs=nKH41vBw3Ui$s1Hp&Y zbYt^##YH~o1$6>1S%1sNfr`WZB7)w?iQCWYNXUidk)Vo5W1!2*!4bzP=LGn}QT8`q zW}n9xVGyS-n{mX=^k=|dqnaMb2;Y*Rx$366b!RMb5JI|-bpFxprifx`>QvQ!+LW7Q zfL0cPJ<(@DZ1oE$0})NXFr*U#W1`0xM?=7(L3Am`J#HiR2I2eR4;2!>|LyTwYhZhaKBV!G-AKj zCoqZ_?ROm!a_AiapY1j8?)T8JFPuWxE9F1bQLp+hu{=u)gcKRXr|qdaEGqMx(~8I2 zMALZH&Uh+5p`rR?ENGUiPdt7aaTk{he!O|Ew{)bL)zsw}rXO}UFmb1V-V!s36a#>Z zF`P67`;IL}j#tY4kiZJo=Wbvq2>4no%TFtRmft&lbf+ITW+kZOS!*U@lF&9OKZy|L^)3i@HTiS}s>6_U; z>F7fwaKJLWYJxcf;-HTo+^1^6AdCOp^figcPK0JKt)2jvxf28BeCHW@&W?hfjHeF} zGqn&{P>n}BQumVrL4dS{iQjYYT{gnoUuOYMx4#xWOv^`VPAqL)+%aEu|(;sUWe{YZvn^RyliF7CF@AI zVUoR}R?={TQDubUoM_RZiRy;ECuWSKAQxs4piutjVG}Zl&r)BR1r-$*fBUZXw~OWN zm7y)z_Jv#7g{*?{6QBw9Ebf)(^}Ab`A2Ji&15&VNE-S!9%n%^6}!0K z_WN6yT>#nq3E0$SrA`N5vLM3#&=s#gfe+c(l7tbkAvgp_sH8N3SZYNhIr7fx*ltCA zvt!iFV}FV+_?(0cn*C0?Aq2PB+)hm+b~D~r-22Jew<7<#1mb3rBtYhi!B^X|(Aa(G z*3p~oX7e8!KfgE#4%|~9@;N-uYcJ=H6lE`&{ePMD{ERPnY7($F5dKYhGX+X*2nrg+ z0Dt}Sfa+Z3p5z^{Vuzl*y?+4z79C5IGB4F-`uo5YFCa`PDH6&+Jqln(FI_Z0sLJS`Koida4re|H23+{?4Kk$#Mb(E2%_yjkb7}+ z17dpl^p&I*>}=oyYyH*p9NRvT@4cdBGbA4F0mp%@;=a|0=lqW-87(2}JJ|`iA`(j{ zE+3GdJhOan#}}t-VH}VtS-@*9XI9^}LaOTGrOJBCzg|qd$BYEVq|wua-PBmT+shhv z@B)Gwbt2rS?t(nzRU+?H?xp?xfW$n)?yr!gAIRhZ0IvK0k97b(3Jq+lb(nTh4ODvT zhuu8FMZqC9HHRP>MGRyAUcT(Y-XQoo0sOt^z2P~vTVO%FfITj$|28-kA9{pL{P@BjrGdy@8n8G9IF2TqT$GffKy%x%zo-XH;o$CbrE9z6 z7x8%L9hg~V3ngC1$i8wOZymPuTjQ?9MiPH~5%Ac~9IXeA4}-N?aI!njE{#hZD1W^) z(LpJvg>2CLoS&cKSKQp{3jtN9mpfk_BwSa5S~#fx7b9A24Bg@$VH3^W>P*~CY54)K>< z2S|Qzs?@_*!B18TJ_f`PgE-E4X!i3kFG!%)*ixkrA+()u%Tr>9Zn?Cvd%H2LD| zcc6vvKfV%gVP;>H1ww9l+pSk95#xPJ_QKi<5Z_c1^Pc6}3*t4<2V{#`m<>;(!3V*k zFF%Tz7T1s1l8)!^VvGB8SI8$Z5bVA{vtil#-JSR+FArz%G#g2Vb?nhkY)Y=U7DEig z$V!u&$uGowDPweKm~p6|UjodI$$1bApuKlJVKwVoxsiUNp#4o-8~Nb=EdMw=%xe+H z26hmxKeINnyGpl%LG;h%Skocmx?#f2r@bc?dQID~@~9opr7y+epC7Y-$Bz0+V{INW38c-%jv5Kt*@y=doz>Y^AcwBq{6TF)OMRLJBywWF<5Wd?y zDKIf?^7DKmRY1KW8g+P6O^-9-7F?dSJ%~y1^I|Ky1|qRdI{}HJ8(E^CcQIpJJQA_( z$8~>irk|^s+<~!Z?jcI3K$(bC^rzoj%a}&2rs|=Sm_?WyuWNh~FXZKI4;lM9S;2ibnKzjewa}Yg8RxRt&_G#vftMH$d0A93FMilxv&cWC z;z@y8`@pjs-7y4NETV(wQS#(e^>L96&ohrCSYr(n@Y_H6mHpnQzwX*R4U+BvJ-1OH zdJV^JjD+%2NNcjk}%d2~^Id2$SafK~j*aDM$a)CtM$|wyWEr=r>4RBQjP;?Z* z0%dzQi!}b7xQ<2>scF2Qz1*aHrV)B{96w=qA(>Us70py-nOnbOe)trr&hpi3z|7 zH{gGB*H~^E!?oCd^?ckRjFZqp=DSDRBnNmk&iVB&J_SI&eH1*1`@udTjBe{>`(=-b z_RH(vzk(|De86+?3@z#fl_CNDhk7;}{^Tl-D)xmhkK#$nJu}{XEz=V%em@LJ9W7 zs>jOg&!tOXp&99ox}RmNu9iq;4{cl5tW){`x=(L*W+-%|NQ6jbrqzfr8te;agCPZK`DQ%REn=d>t%ir9g8Z}rW{WASP`sS7AP0-R&K?C^eiaV8s_N+6E2cn% z)hX_~hFys#fSE+VlT}!UuAS-HQ{QA9yPkTyfUy`yTj0SD@Y@K|m5?Vih0!I0Rlru2 zuzHk52tZ%SXg6Gcsjcs;+7qy1vgWZxkAB{QMqzR-!#QuzrjCLLcX|z&^if=V*FS(u z!8b3dS#NbL9FR=K#g-V0NHU0uW0kaTS`_p!rfUQ$35C$fG$byXUR593gPI{Sn`ptX zD=1a<3wywq&=pm%o1B12jJ)kRlynvHjPF6f-pHOuW@Q<~U%dDIVLXKfT7GYec<)_b zSC+NRg{Sh+-{`X{-Vbi>-Ecy8c?_ncw}0$?Puspa zLB#J{Zcy+%%sxxs%mjn*@JFyS+c>OsegzaKK_br#2*bA&D0RN(2eI#aR$A6tI6#NE zWn=DveTQl@C1W%tXOi{$Q`gOnb@gx+`E3P|{aFKP|5S5GIdA=z!Yz8QidxMnF5aI! 
zKH-RWYKD2`+-m<>W$OVqCUjKV&X==Z>eCb@j?$0POs@f&)cqCsedD}X85cNk z@DcacArOg@y2K7juRTa?NxSx~Li>L5<`pP%_Yg+%Z`2@%gPtIfz;W+fme92-sQ+ET z_N2GYVv?oyp6vvSWw@j9TBp<2nNsR08Nl!qaRz5P3Qx(sb) zmzmFab|e&CRC$e0)^kvn;{!yFIA1eCclx&rg6Bb&aLvzAvmyVQJn&kfZWC2x^0>f% z0+MxtKA}Oz=h5?Sd$j=h95^MdYE#Z(}@f+(u!Ms3= znk$Oqt%q=?ei|pq^U$}z=(0||p{6RRTjamI5-;Ww{*WlYiw&j4BHKcRxpaNmM3<7W z_Z!mZ4E!`riry9|GoCp43d*=`}cejEw2aQ#rDP0Y*Rwzx6hMOqwX+m zk%a}CVy-i4F0D8W1Me-Kr|%i;982Ti1nNAqD{!(pS|0BL~Qm-_tk3F#~d6yYAf%^waI~3aa-8PyvR%f$L}$bmeY*hf)>>{K8&lz zuiP?}b)@W()W$b+u)~P4J#DMrbOomn>2>M(8?a@Fr0j`?0d~{S+Wij1DRZ6& z^_(%EKMud@j|JvdTp5GbgQ-?Dn2$W_zB{C#%!2W3_9h$08>BX`G~aeX#)bt`(H(oa zlY;&xZ3orCYkLOo?A$+U%*etTWUkA86T?&o3`GhmOrF(NpEAaEL&7=Cg{+;QZ80p` zL@Nfgb4l%#C(8qstg#Tbj3E1WLVP1fqP%!a4`Ci%on((xS^fPFr)rmRs4j{t9Qe!JNCJ@ zn!YKKZF?9dm{wtTJ6a`DaA5XlVujyD0;H*%gJ9pmuM#1CV)R=p7OSGi81rRp_4E6x z7?F7T#??xr$^#jZ3X`}hw2bWLkoG2e5%pj+z~jrxRaK^~YP>9S-8vhUn)nuyzX<1*Qd?M_%L zc46L8(-38L53exJ!Y$e(lVQk!Jx&&s38+$Jwn$*S4=EuUCf%L;nJJDip+ku}^K1<0 z>Pt>)*KS>f7MBa0hAepJ+S&mlR6@tp8KgWiBxm0e=e-69VGAp!p2tuWP|0X(zRA<>&r5?YGQ;&hyj<`m`bRDv z5JS z%Td(1$MWfP-}Y=iWn8sQA9_c7>99}2d>iEfUO#k;2AnBbPx|8Z^qfX(TR_Gpf_K%Eq>-i zP8kYR2|1Nm+pi~yTlma3@xJ>;)9}PSx!L%o;gp+?ghr^)>B_fqMHORb=og)k%<;QT z>6V^_rd$fwJ%hgYp#eFK#uYqWxpX9l*R?7%v&`~FH&h9i(0~etEKmc@kopz{FK^{| z3#HH*YJ|0p5GbDJJY@`mitQ$9RHh{fuse_@9<+%LblnrHgrSt#UxX{hb>Dorf?lSz z{?2u}A~Fc0Mm4SKL1&t$Mx+n3mVHQ`RBRyn1R24u*UVNqeemCU2A@Ll{>T_NZV4w7 z4sd|$P16Qlb8VS{)iG@hz_(w<>RJy-6`eavL1dtYdEsJctlJ{!H}w7F#8_V0m{ z;s$sQsM>pc!^yj}^3Z7MW|Ve48XY|vLSai!Vi&;^^%FJwgt2{|$E8k*q1&okk_t-%CaVTRA)lnkTh~X;KxI#*CjQnrhnl3*R z;;=6Kw>BhH(u`xjEGX_`#*wBmh>wlVfSycV^{*7@Cw2nWlnz{N3He^`!tWTQ1c2IO z3>{&%E3|E9uqrRAV(;GV2g*GMD+NYO-n`D{ z4P6XlNS-Z23m&4FqPnxy-la!0!|@XH=neEvlahy6>v!9jIAz`6A7arc>|GGyEF|d!phW^ z-ZW&0@9aNZ=Y1r9OEdL#ou(?L|6&? zh*av?`1p^hI}T2EWg4tPEp%kGOOkKr1;!XQiKdrVt08Hez(rA%_`R6QH&@Q({J!k@ z-M97StI$Dh0o2)t&<^gh%U*+qC~E=79?a*FArEMWVldmZ#v$)!`9z0PPE3fW*b4{Y zfHHhG;|%$y>dW-U8G-c@b{8W-271^fP4CFgCR?WS-=Qbn6N_iPOai>Ek5}*nP6PL9 zi)vWZ310yCg*3e#ihks-{{dA53Zz&g?&-%R>5qVJxcUTZNZN@Y#-U{c10v?gX`2EV zHQk=SuMLVjE)*&vk_m$RaSVnxDPaahqqN+s=6E8ZIQTB@=M5Nqd=g5ObpCwy^3(+k zKp;*as-9t&q>Pm8ygm9>F*HdhKvVUyq?J4x-KW^Dtf8$>aKV&HdM+GnG1O)DF#Bhp z3DAs;aX+#+Q@b{J_Vdtg^pe4dkg?TeA52y+CcElh&$s~x@i4`wHK-B;#kRVZ9+E~V zC(EUKGIJ)8ifWPc?bF9$p5YBBy~DTGssenW`;OuuMFWrHXX0CwaB(G-=k=g(OPrd6 zx~@&CLdmB0V9n~?Mn#41Xq(_P7}Wg_OhjNalU$uJ+BrimWvdGUF{?T3un1hju9mZt zuoxW9QJDCh(76FsOx)Frp-?i>a!9qFB2nem#$7z!1JK1lJ3I{X;Z<=@yRq|{GY0-)itK>uOrqKgO|yLEZMl}36gE$;zdkIK!`_?ClW|+mjjd~rG)hL_ zFaG+yg1=q|a*H2$?}`lS%#7`uibMA}*4Mx>5tm8y?d`&B1THSJnb1OWBqW8zNe$WV zHewUdr;c}W*_6K{%8s+7U>9~q@*aU(fsp{xJ6|2Bip9g*DBrHVW*x?l7Y{Oh|1`4< z%5CSR1we~g(-4ie*cs=!w37V)q7TFy$yCqSJg1D57v~4uVBR^HS&rZHDtYt~P6qO! 
ztHt{;8v@J>wpE8=oD4u4kEan`b+bVO_}0bqYjCS&BZIOiqV+@xV}zn$*t>6l{y6EQ zF6{NPMxsG7^&^%ITS)AnY~aKjfnf^vu}vpy$OyczlVFUsVkI|;Ydi%#6ChIRK0)ol zgwxlFd9PYWR`Ij5{y{@YdMqRR=>5Bob%!y2$HSt5h7j}jh1{uqXQkpVRoF^3Bb{yc#qGgK%rO?FRZWj zDrE-Ud`XjXZRxFBWa7~5QAScSP zXQ9HPpdIROz8dCZ-s(tW{x~yM$~%TQ)0{5njEEdMX*A^reHLO6^}>ojKK(J^KKB(~ z7F(4F(X@v?**8zGt-<{3KD@D6ux_1hd~nc~>UO>-L-4L@?gQIsa#aWmO-RWC zok@rhEvI5`8WvZ2li8af7~6IsRM7GgNf)1{pumi~c_UH!NXJ&z@Zp0;VC)%wk6I?y z9S#|UTNX;gy5LPpFpf3KQYgPjEz@b9xC+XS)D!`oz_ zDY0I(34Ob5_4e1m68@GD*vyN$#kmG_ko$zK;YzcSiU1QtfG@*SZ{%`&;RTpY#G_RADGIL+j!NSd+{gI3=#9Y^(kY zO>>x=O!axeikaLo*nkL%1liDeudv`R-E}NItfdN(?$3cngsCamrfvv(o5>`){*x94 ztlLxlo?Zb^)3(x4yYJqjmc8GIFIp;ED4rsLNhLfsK=M+C;r0b%t~X;)7VNKZ&?uX3 zY7-aRj`1@%iiAW+rDDjy9d5OQMO5z2Zg#;A19R6Cc8s%{s$a53O;BeN*O)PZ7eZur zSx?fejKn1bT$t!clQ~e%OC!{%!m|#D?;-7PVA!`Lxdv;&{2+85BF#AO+g&~WVo)$Y zzq&ZlQjUM}Y0kx!g;6<<_yU_3_t^Gs*OpWOI8aAUkg7Vw>1aIj2xLj)4eLos+gPYS zTtA@#wyx?tIpAlZGM}YSH+v+{C&9jfwd71A35i9pjEfz!}R=4 z9m~r6J$L&|^9iAmQ)tLAcg1{xuGEo`b)eOzNLtOpvHKz+?}=;I{IEw)E@~rHE=gpG z%2{p$3pXa@Nkl$V+-4YnTM(yG0?ci=a2>49yh65)Zi$~7Z&LosE)t~?(qx|GQ#yxw zQ7e2Fg=R72F=xRsc)_xL4;7p@`At$Dv0Qkw>SOy#9}dkvsZfij>5(FoQN%3L@4t3k z2&X_|5j6?|kBK^a&G)6%-o<`^+%^Zf2hok4qYw+54(}-eI^Oe{O4&w+k~6Bm^Y;p&f62Fg8L^W^To`X6g76}#@5th3LvqJVVFghT01o( zuiJ6JsUzerK#pq{$GsDhTskEG9nz=FYRyoX0t7tvtK+eDwMT2fl!i%{X7zsV^L~4N zM-UPxjpxgnsD=AJcee_cM?Pjl$qA*hwWY-3cW<+M)G31Mr!}iOR={^bV;fDnSnl?2 zi)a-rVEOVLhwOPuZK{I3)xNN_2LTiBi>wG~J_LHj=O9Z`3FiV!Vn#Z<8n+`dVi-O_ zPUr}J1F=A58jz|6SkmP3x76^*dQh&>jfh3v@n#>$p(1Nwz-fb9*VU7;3+$(;LVP>}%4nqiL+C-EuL5 zz`_G*8rD~v8`yg%Zy0Cj`1aH9@myj)XcMu?{Jp~X*V2Go1S>{lLFwlKmx4*FAd3^} zeG3;ka=eQ>EWo@VkJ?yl=()e(RjJ-$)A1$lkUz&P@7TMzCdi~}Zb({}w)Ukx65X5X z9sO9={S%u_Kr+HNl+)XU4P$&34F=CvyCS6W_dLy38_3e}CI-GSx-5BWy0jR;qq6veM?S{7`~jM|IABQ*F_WElb95eDBt^e|@U;>VqzqI|BZc zDmhcm3VRU#YWJhanT7VnSmLpVAf^0ZH*DWJ3i-nxXCm%G`}ix)3%&P3uc>XLFWQUe zmc78?`8G>^-?ks;s~$fX{rA|vUEW1$7*qBQEpX48U$ z5S|<%@coPD<2*b<=T+&bwD;$~{`IloF|<BaF}AM_F{k-jIXaIL%x%- zTV>0)Zuu28KL<@XoY1_ETOB(p$&HZu%36>s-v)UfBr-k5O2=C1t#ghb;AmxOjlYOX+hx%J8v^AKjxWROGM zPe-5TlAu5|%PpXMEgh1x`odR0InGkuY!zdP*_%6WSxE%Y9pc9BZhDN#D6Es zcr=#~xt97(_7$ZQSFV-pSD{7iCd8t}$Tv(9zW_q4%&F`A_+1Xg@;?k zH7^<>okkLe$ae)j3@MF;`MKFQHwKR9JeIIv3>nXRRKE!uyN0ca@4 z70MTXqQbw@GeQqm>QKER^z&G7b!ruE=bet!1GCk?#OegU7@mvtM9gsbzuN0Z@^2V{ z1QAN{>Klvvx4lQ3IvvX12O;?QVUx^Ve>hG*(K<&g?gm%p+2B1ik?%18b%Z@QK2Ty6 z=CBQ(I?dJK{Qr4siX)dx_TRLqaqIPKpYdy<5wFp1$@sQG+lCF&3giAoO!*aR^W|aD zHlO^`ae2Ez!wwOLJ*A&;*uUXu~!r3C||{j`hE|E~OZN z^PT|}#b$4beG44riKmWjGy==);?taSz)t&(T*5aJdmVws0F3&kuNT@`49R3!olNB|Fz8jdK5g6Ya8)S6U=9cCdikIN=~fjIjEAtUa5%p>_L%Fz{sSus@W_o!h0iws!08rQ zWS}6x5G*J$-mpz2R*rVO#vdt4ev{~e*^`4l{gYPp=l}7}7#FuhIR{(m9bDR}%&UdL zte^%Hccyr*vzNT`$9Ld5wZE^%FP)wWIji+8Xn%cPkS1xi5i3F>18@nH zu4M?FAb!xG>AK;P44}BQSuPnFpa_X|3MNq}B$Z#Ac3*9{a(YrPSE+oQF3??Ov*6*ek z0E#0CXDwv@=X&C0fwTq@=s=N~wjv91uUP2SZM%R1D7AF_z&}?|5B~@=UMBmf)xJI# zA~=u~zXLJdzS#76Knh%P{%p?Ed;jMFp=9Xz?}CE~xRasd?`2V#NJZOP&ms2h)*`t( zc3%2_T0;~hP$wYJI`loBl6OkZ@}n|%>Ynu7+qp-7WcaT6(|`6G0h0wcYb5{D3LN@T zu2)w$G?>q7Eu_4Y69mS{FEf1Di-~56}4Cfp|gPK z>|j;FQd+IrL`{RlXw$ylZw8wCM>`6;CMh~y#|7Pv3;UXv|GRsMH?X;SL@_iCHc3T- zik3|U5Oq-3WjAXU%>*E<#5bW<5B}#NbiX#d!B$BNc3Zp|7+k`pI(iL#X!bM`f6j;Z zYyER?ai|vsH?n-%5xFQ^nfVB zhMgs2!k4|nEdQ<+9O)ez4V>WqrR4;<8!OpwN@xE~Otlp(_GJ?Gj_3c$hJ=ihJT zNIwidb>-#=A|1Re!_2GS=o}=hx?P%{0B~dgK@k6p{D2cOGYxwjYei z-g{q@%L)w?*IJfIBC+Lu`iUhN^+lfk<4I_?j2pjW@UDUW~Bu3WEZ0 zCyYFTaCmw9f%t-k_MgX$o*-T~MBJO@A}!ZLATA-Y$Y$j-4pzcQD4;=NsIDR!=2k`2 zmo&e$V^_U^w>{``ZI zw6@z3nB#zTYy!9eU3-^{pVf=*4+CoqRLL;lm%krXxKcM>E&KQ+1;b&?YeBLd6l?l} 
zv=)4C2kdsE70_yg55Av%avPU0bsQm+WZeL_c)Klo$q@51 z`k6WceFpfmook=#BLqk8*d=FbxR|5^uRgu znH1lFzNt%A8GU5ZzRkjQj{7FEChPV(K8|KbgjfK!zAm%Aee@C;(x*zu?Lh;vBj)q8 z4--owKZ#mw_aPS4CNnaaJd2x{-?sQxw%?iAJArtK(WRqssx;8pDj6F|Fk(r(qjS>a z-Y0B>s!aJOqj~9a$Ni#0iL;nu>#|j-Lf|6v;DWS3i~NI%tYt+rViR*s1`gGEXiNm3 zOinIVSM);GeycjFq2Bn4;!W6r85KGQS|SJlkXI=TS02#~Dn3xwCV~f%bl-tjRk*}5 zZK)J>i%F1#I6#9EQG5G;v`_HcGmIlOvtp}cu&TyE7yGXxpLivWa8>;*5V}K4gl`@_ zdxE9vhOO;wG6{Hay?`?%_fxo^8bC^__!*2ks zg_q0`orWeVxCTpbY0rIPvF#h=-Vdl6aK_>J*MQj#jD#z#j74QDz(&Ad@U*J_>zUwL z38fe%uh)Zcq60MhyvRHxy&XF7YTyV_6`=g$VAYH3J~V=kE|{(>I}ZZ$0X*@HBHN+a zIuEmx1mj9Bt+1HQJz80^`d}+YboSQZycI8ZWT#srOSm1w*&N~qUxSvx>@|2vf9!}D zRI^Z7)PRC)nsFRxst}$Sc$D;vD=%(dXuSW->T_t?GQ^}!plx=tnTUR!8?wwLkf7%( z1uznvDC!%btpk{RgOT?$mQ(*6zj!PL2!FN5pG#se^x@v;TCq4>qUKC~vUlv(Q06Ue z^lE43T646toY3g6<6W8l;2TR1l=lm8$~!>Ul(^-}b=|g0*WbOU{O4&27gyE(6^3E!d(X!ZQdk;dAbg;PQ3@ZVRP3hzh*x z(@F>y*l|OnlH>dTy}2zzjv?eadk9S_x$mGUZ>8qFgg{vB#LN`cce}5o= z!9dIA9!RT!o_u0t4N?Z6rau5LXN2JaXz?oeGsI#UvVA|bY~Nqx4OG+NbMMO_zh{)u z_eGlD6%O~COfUJ)4azAMKuwwiEH6M&fi5r)5w1oQF!FtA(2llg3_4pX=-If(^KD~x zHIfESL=oRXmP0R*tLsmb%V<_^uR5zZDMB%Ib=0|yiBp!StEcl?9pNO2-qkI?JIO6y z}8 zr6g+{4k0lD1Lc|$$MqZ8XSm<%?jlqHp@|QLTBWkl_vTpt5<<=NvV++akFNt!hHZ=w zlC?sni0D5(QKGMfi24+g{E!m^prD`-Svgr;QXcK_4N^7YDb&#gvq)W9O{C`&@p^y$ zDVOw*8nsT3X%+SAsKK$|_VkIF(ik}0y_pM(+v!Pg>c!D3QnWmfE6+ZOMh^+A>figFeZ2*Gpp-$SlS(sa&C!HM9rf z0A#CqT{qnO;NLp!1T-JUGFI`!6%RQJGQ%6_IG0$J=x;z;u=3)`A#_9PTfa5R6UX^A zgH#?$+#1xY{|Z>)5R}%v0Jn68kz1XNu0gm{PUs#u}0J5(%>? zr(;>4QpL^D3QB7?VYp+0es845n+DZc3pRah+$@`pA3terR+VR`)y!X%KeR59kK3VS zjyl0lP**oDhqu~ByT9c_aKwUMtCvr`>T*-=py~!N$tQUE%-?54 zw4gr$xK*3s=Ks>qFt|(bjE8l(eUz_WV0-^a15ltEXY`v_yCh~u_sKLnWFM8>pz~xk zs@qFJojQ26l0D+%mijw|KJPq;HK(Nc_r{oitb!118S?C_o)mj=>~>^dP~}Os(W5R{ z#*&El1Ay_VB2yJHw~(pG;kLG_lEY9NyC)Y*UVV$jwrVrSM`T#)6Z?tiETL}N$>aUU zYFJveKO!pNp(hqnakH14D-G7ZV0jgvIi(#j;Ej~GOr-EX8F{HJ0Y829Yp=`E`jW=R z-3l7V52fp&!RSU#O%bF83@_9CDB~!h#W2xqL7sb)*?0|Sgx=~GpjdkFj`R#zPdv$@ z-JE&(;&+b>9y?BsjK@4Ypd(ph=Ub&X z8&n+DpQoFEbBtHq$DctbaQcxjOL-^rGmRWouZ{ZlfjdPH%u4Qr6XxICKgIUsTtcO< z)P~&3lf&z$2M#@vf3xetbrDzdzBMpkg|t~aAQ+~6+{#aA!hJPm6Q&^T2q~bjD=5l# zc8PAIU}OY3z?1D6<{z4$3BI`5pzMtLbXpl&}q}QYU}lA4kre7A2+Hjb@jSv>&jr?3#F+kWtM-p$AEHwnE4{^E$FyP zk1XfxyRfF40F$(`+F-s+lz>@iqXfOjLf)&a7U)7>k*OH07;mZ^4_s? 
zKa|fGy z{K>&x0>Nx4&~`d_h_yUvwRf023MS()e1dXZWraDWs&{a{igWvZIe%<=Hud=!!Wa$b zcHucX&u4473ws|jB}QqIlUuIp=*8`(hyYpg=!+-28D@Zxj@avoBx>Y^5Dd`=->9o& z$<7TtOVjc-7V6nfPr)$^zs=Q|Q2`n{s%U*S zE_p2d`*$iQfnMdqHGe|NM?Oj<;5=ITOx%>y)gA!ey z`JGF%faT+yn(^-SF@no-XtF7Gf*+{;v5Kn(Ucz*g8^%AOwgTMEvc!^~gLO@cuM}xMsXgQYuAN9h2bRNQbQ<#qwmKPdXOG!wl>KV`1KytABkcsUY z$79sT=vMCWtCA0Y0t7*$RD8ygXx|WTLh-w1&v)j|XSaVIis1izD6XTtbTm#1XYU<_ zXXeW#lCFf-$~{`I@AfsU_awAhc>=hP_dLyXcET6WHh7(VE=e}4Tg~v`h%$pUI%~dW z=7-^YmP?|sZv41U=o|Rs_J~I|3s}tMHvmJ`{?fAk-?enT2(z_{WiSZAF9^(Xu+_eE zEaW$BD_!fM_Gu-HdrJLGh?PrbH-1gmCF2(Lf z671}Q``1k0NuEKP?unuJGmk*^`niRft~_9UW4a~wOROq`Q?AH@92(St!rw*#zmd4w zq5bvI(v8bQ@8RUw__p@Q``;buA-^Pe2#1WRgLLw++tLUiqDss|vwIvC4uU#AXgXd2 zd#CTHmNI?>#?P0LYj7i-`*81`raeHrM&CJJ`5EYWYrE0uK=aw^b4nEX;3PJ5>v|2S ze4!WwmW(nS;lRf_grd4P8+wqNG7w@&xQr+mze8)8GI0Eb2)dkRX|z&MVFh%t#+y*) z*n!bYUA3PRlBeBXb00Vmpi{fw2o1wA#EwU!`qfF~T4gYZOB56_5;E2+tcN&+CNTlC zbsKcdo}2NChb+9CR9Mf;1ADhjegU>(bw=2Q&+YSg}tMEFTz^ zC(WLRn634q(y(4JTM8oEc>VG-L$YZlw@V!kZjBI1fHQS0Vc# z>V$ST{b%s;-c3qYhdn!CA4>h2@WTUW0t=r49M%5-VBBQ0RfOe%h+LI%cGvw1z@@n1 z0L+{EfMHJJukn7?tXc%S1YD(Kn8gqh>f`A}vpAj;3ia}1;A9580Q3!% zy7_PC0v*#;F0VLWEF%rOrquRaRLNu9lV)8ATyGA*pdzAS;dnhV>4`9=wWn!2J`J_d zr2wLGt$GOU@;Mmsdv?=Q^Fgyb_~C*ZJ`;r0DXBc6fyIuDpN84^dUt68 zE-YD}tiAJS{ddc+D=>}0WJb298)1?XHeYwgLfhJN0VhaQdV~@WI4s)xMbIi6c=7<{ zT@b#p&y=SB1t6X7=1xye!wjOekc1~1MmSvR&!8Z_LB_D5)eBHL?ZXXlPD(1Z2sh;K z6sTE&W>2~;XACjbe{g5m#z=%C+{9KHjV3_doF3TLpypZ^-_)B zW5Z+~ibe2UqI^zync4Km@<5DLop_4ANg;Ib0G2pIv_^I#NnP2e9DBY<+4oHu$wBcv zA?bUGK@fHB@dSuepa-tF4Q@@Z;oNc)Htm6=oLKtCJncPB4|I$>UD<<4P+niVd0E9| ziWQ#&#y$l{?adyXBqj0@Dg)Bh_a0A&Y@fiRZaq1w0bRmpWBQZ!I_6IIyDN`8L8oBv zJr|=Auhagh4Dd|*iL9)0HrL-$U=G zOw?!=)2DPEgmo9SQ_-G!3X>s~CNY9Mr?jO`rKzgmjzyiH>6+HMuAKN5wCZG}#VfBX zi~3o3dB`$ct$m};}WB4p2xmYFszVE;M;cZ|gXyMef|QKq!Zfj?(x zlX1R*Jw08O(dBy?94jgW7(a|X1#9FVSXuKG2DhIrz4rRy85<(q#$On)1=Eq0 zi$XKAml^Y@g&%}ybHsz-;#aOUwH{X?#(cY-o#0nhNX{@Tov)vdrTx}kuA5AXj#JI0 z0RJvqicenU8@jDV&9@$;&DM>jN{t&nsr%-{XJ3l;nucMNJ-(6qNvJW=CMv=YbvgpZ zS@P16LR*)r$jkdHUSzfcPVMwkA;7e%epFlJWBzjhLH~^%H?pH$37c|Xow$Xq(ZXM& zJZSML2n1*zrm}-H|zm^aBKj9lW2%qPRO^n`y6k;>~_u|a|dLs zmIOP`xM@*htT`q^xiiS9X+{Abjo9a`U^c8V==~E8%fK@#2~!C2*tfyqiU;;cX;?NM zLaMaonzscTGnB5IR7>hjRD1GQXpyYM3I_3ggDw!`O>3+$6oU1AA>af^{8JRH!OfekWFFY}72 zgG^-nl*UeyFU{TAgcSfj$lF5+K#tqvc=09^A6EySXtpa>Fubtdn5toUv^0XQn zbNV%;#`V5wXK8|%@W?(?7Ez5%QSr!E+K1%)Zp~~qCZ6lJHPvV%;%TO~1&BGSiG$L3 zqfnbF=3;`bCaI@;-_!yfp$7Nzg}@5x{qZv#^B^6xg$xg$HCs2d_nz*Vc;;Wdwqj3Z zS#dedG8)neRquo}$WV^75sKhx(_$Hrr^ZE+Q+RLU7~1a<60HO zyQBugkW8r0F6o({;C;_F1sW+_F>4CSHM23(;wtU$6Khvae7_nXzlVvOzG_kUhZi7- z3Okb}oQ(2vjUKgbFj&799zyh`4+0;E#qCy}B@$7z+K%ch$v zU3q5=--J_J1*ZR2amic?-WB`Yi2A4>t5WnT)nV}Vq+;~Lc1$Jy6ZdW5Iq=o2iD0Z+h#K9}fol*!1~X{QE=(U6}M zI+(KfFo!dXkP{DM^op?OYhDz+HB0s!CraQfYeVOAlloT3M>1p4LW9J18S+eb)Km+D z4q^JwR_Qs<)ZoS8JF#*4KY%(N)$SaFKOV$#Uxby6Q`_+D^{PwNNh%_Bc~kS6VnNp( zbq}}Qn>W5AL$L^8Dw54A4y?VpKov-4l(0FeNubs7+ z4+ZtT?tYwDcEcvG@<6ag?MXm&R@~*7EM=bSOT#OkVi?EF=QbHMi>Q}PVl_e0*Mug>|?~^BGUY!%6g|+ZL;wrUbh=YZn2uXAudaP9~jf0Mz|I0 zv;Q{+Q-nF%bG(CxOY-i?Qk&#+?()EuDZ2#xcY9zi^KoHaR$`nSEu*m5CAlkT8p^fb z_$*R7ldxA?2`dp8OjySD+;!BDdO3DhX-ZYc3!INT~MCi3x528T!Q{1Wl2FK@3h=(Akp`FyjV zGWVt^s@w1E=GCHaL0!@-OS|i>%f_%X$1*en>GnFMCiD`|H^_it6R#4NWPYRk{y<%a zw(u<%;$^Fd(v4 z+M8DOHP~A6Y8`ENMTcN#+Efu1Bm*fw{ z^=Hf%seH;Qy6+|`c5=jJ4gzC>>j5c@Yil7Rh8TrL4{ZsAwlP@SV`5b&`uCv9*d*+F z+0Fe;MrFd6rNFYH_FZ+J{Z$AI*;d{Zh~nifg-hq>1L$i4k8${G>-ZEqO0>a z9z{kp4bWKy{8|r@H4x7EVQ?Y!U48fkkV8Nx(*yNe^KBd&`yfk@2${a*(Fo+L6%|5e%I<`E}O-u`qr4GJE)OH7x6 z_O^%Q;@ZiQBCH~*VPfMk9U$RGaM5YuS3o!kV&Rlz7^$=oY7JB!-c&{ULl2{nf^9cd^K`=&m_v~!l2+|} 
zslh!(Oa-?kw=vcBk9_kV_E-UX|Fo%vqM5|^hM-Hd3gzH&xvijKwCerVd`8a(epXAf0LC8(XLu8QJ*A>x8XI$EP(8~dc2O*G94rtSE46;zH(pI4){l$_pBkSKoy?zcI~t!6>ldS&qi98XA;YurK2uz|pa3moO5Ebuq1hA= zwUUl>PPTpxFL|6mV_~ZGZIZ-rFgSnTTn=48;f@Wk8^{<$asv8~slGI{KZW0{ zOQW%O#G9ZJ7ePjwxg*H&WbL*;LmHcMK&d z($=jqW9kN{6*s^_OB^`8x8|S|Y@W0F01{`DPw-f8H4A;=z0%07afix|*XE^_qO(}f z`(rY5U?-C*er9NHGHCX8-GvQO;q*oP+@zWu4t9%1@_=1u~9kVu*{I2y4fwhKPdUmW5| zi`tpTxZmoDbQ9h>@7h|Uf3qGGHmBnDKWYHK0Xj=g9*Oa`X`}8;|K%V z3Af$RVg}jHq`U!94ibR~^E__E8nEukcAv41s#cIY?Tci$m{9YFKMX0Kzgt&|&)UHd z_grfRbT5PRKsYN3j&K<4e*F%Zs(@ge0V0O6>c})f8`#Cf zbmB3CcjDZ5s?8KB!o4(88`n0Z>4@yVtMJXVS{#w)y`_qzQ# zMi!D3_$G3U|2$MCaC_Po3!NSHrqXG5e4ms7Va;mPJFJERpD;almLIaP%R)2MGIMXI zv+CU}%`I%yzq|lOX;M&u6hSVnQYcMUc~5upt(zOYI@G?Mlh}vRd@{MnACXc95x`oLU z6lgwAmCRMRal`&o&P?B1oyS0aVbk|uXOIlrdg(dp_$*7&m*={p$2;y1a{c=sv#s_& zr8seR3k-yOzGM(?uFP4|^PVo93c6u=c1!L7U)YT&VZ>Mc2_iR8N>q%IbMSOaTl2K# z@F!m?In#D`H*yyQe$<%&leqpaDvxWV#GzRFu3ws+7YTcKGuXGtoZqMTOq^Su;HH`M zskwPY(}>kJWNB7x1bNV9{8Oh9IRk+(D-O?Dy96LQ0nef zl7ad+bP+L(>8He#Zeu4*&B>S*F0oOi3m9kMbI?Du$u_Q%IXGjut$kxJh)%*}pF-mS z8#1spWJhQ2<|eZDz!pDRVDPf%Y;#sh8n?(2n;%)|<<*3{27>Kq=Z9yM+OH1PPz(~j zGYFBN?u`-54V5)0k$+0!l6pMSmn=7gULhwssFOUYvhXfbD1K>=SId6-wk%4R;A*O; zy$`9kdXTKHQd>w8D!*y5O1Er~XUHwEW@sTwu)ki1$8-$Tv%&m2NzNig6btFoX`+I0 zNF&D+A_w4ak~!M7$s=2LdeYN_B2;4}v>AnRv2T93-@lPP# zry_3kCk|NtQZpr&Pma_mQX(EgdXwNEO<}a$=;0ag0p^PzBxWpQtGMO(tpAUGt4JWi zc)zEEitGykM{LhGEYWUR{RgwEb1T_swr`3)j0qWmfg|*>^P}|TA}K6(4ynyr%57_s zY+FAa+HcsA+L4-sr6%(vyS+xgK<7|xNP;WKwczG&L2WgO^`R%Ix=HjX@)|M+wi4&5 zNbf05B-fMy3{|}q%e|3pPo1`b1|Ymnl6>Vl(BpWp_5(|v(K=@d75$qBu(f@22808O zyplwc+fJdkJ`%7*`07v9;m#3MFX?=e9?I@ z>=$C13`wy7^_gM)&VsuiZE3I^sMKm{DWS2|3QvE#&YLt!il2X}_LN@8 z7H)Jn{l2#A#aB=3Wgj%`;p*hvi61w}%gr_4Cw2&}i|n#%+TfQXk`FdM-{LRaaw6o8 z8K&bGEW8q1a`Oq!U-5_0Mu|hHgT9ApF)iZ2soQ_*3>=(q`RwCAn@|UF<#WYIJHKLY z9wT6B+TYvC-A2a}Z&1a`Yt_W#e~?OFEB?fBW`_Iq*JXLub+o6jJxT2Ed{N5s9s>z{ zCgD%CqHXI%!si0qa-&w~z`PY@L(w7JV;O8uyEeU%zB4*9(hDF5x~e<2HIypNaj7>Voe=NWeG6**=fR zg;#@DTP1`CXs7gu&JZk$O?5|v2FZ|D+Zdv0RGW985We%2N>#UDY52Ytp`B}nl|<_s zYTLTK4dtj7rz1~=i}IFY`qq=Gp1CO239vZnSVfN$73yUS@xL=ug;#_2!&z|<@9@PC zJX=pntyE}EP*9Hue%>r-WoT_PUoe`GfWJ(6d(!@_Y$Zi|Ya^pTH}CEGyRp($xfJDVuy#UJ^Jb!mIwOR*mN7&N+L(RA>Alq0KM}7q?NrY6tzA>u-#ew6mNv+% zwFq~e{3DmI4*q9>Oiq3!U-GjuT|C7RCPNZQpEioL9ohhrnz1m6x()4TJUN(4REG9+ z5wB3|Qaz7TsVGpT7xqY|o%U@dB@T2(abYz(tN!+392dF@`bJ#p1zx8#%cbc zC@gH>H_MRIv#40ediewoqV6&LbOCe{T$z}l_YR^*+AcFzd9GCsdY>xlQRj9qCJ-T1 zRX0(8cr}QVkG`P%wnCs?|6ZFw2T zNOWDfo+`s}*Ik>nlZs5bcU2ti_09;SV~$E}Xhk=#9{*0FIy?U$DiYG(H1MBedf_C9FMGyfIN-piHF#rC<2Yy)}~;f^aXoPt1%@8*krgBc#lxs zXUE@5FQz&H zcz_>H*2mDs`)0nw#r=pa`L09tJ?vUdiQ(?%?5gcH??%ygrNYWC0m*B>_D>fy=eAK* z7C}+=Y;9S`@Z|n4u63#_GqaAX+~PmyB@4IGzSi_V#_b8bx?K7|YQgL!?HK}b0$G;e zMbhv!zt3D1z878chS(ji%^3IcpRit{Ri+xV4yw>}@Bt2Mt6?DHmY(v{y>EJpg#pq} z8U!pJ4+$d`!v+^Boh*(9CmA?VRVU1K-}}*ae)dgTk5damU_AC|MvZIVgx`^K<86GJ zXEO#v1Kre7x~cB?YZ**dl?w(m!|q2vtMevl3uoP+JJ9V-)h}&)EqA*6)K~lCBHZW$ z{>F1ACn5ugFzz3Ie@^~=cpw!`=(_7^#d~o|Lo|mtvBw96YIF2dtJ9u$P9?DF(seV` z)TA@Cu%M3H9l3mERsAUx;F2)Tx<$)#2aUmae*O-6^kLFsJ-5~;shNi8L%IslWXdPv zf9Prve6;q|jD)C2D#!hMt(S@hWsgz{aTcs+fk(zI0_ApQfkxCI0X?oPx2bv$hCnZz z%dWIlGP#VbL&n7!(;wqE0X;*-WnyaQ5$}YPm@kJk*K2_Fy3g zAv*P&pGL^Imazmwc-JFC*PR}E_b}Wix%Nl`cp80y)l#41WFloElNif$tCb!)gO8^d z6)9aC`*1!;$|yfvK|~}7-~zreiehsqQ~HBv<1urIT>9oB0{m$YtWYBZVVgkPWF@oMs!*y=P3j*=~Bp24qbHD8|{ z7b!AR{3!WGoOI^4j5fNCj#H|Lf}0uuyr`)uNcq!YnFf`KzA@Ja8}3v@9}G5|%*K=~ z(tI3h;4wWw!Wm+JQQ#c`3JdLO-0tP0?NpC!z<4SgfWSCvaEUfsXSqIf00kL1ppl0ItWcF_4PW!7D zu}mkpa0*GKLJq%lJaHZ)obmlUg(3g`dd{?p^Ql(vU!tHX6JBq+F 
z{QDmPAYV)Je{Vgqis?{jMlM$KQZbG0%%31hoFYa=)`{IKuZQWP*plURIzJebBkF`&KNrwrkSdRWZ4Ds)U&deG)meQo4-Df~ z=)K8oxzo&L+WIf=8nuoB<$$V9Lba_LoQ*)U4(iOsI$J+h1%Ut%`uBQ>l;UQltW zv;7k8!o==>1JFy7E2L(7sgH1&fRLEN2Bxr}1N)#I3ghjZnO~4+)EaRdu441>W^9Ce z#cnXL&?IO<>arWyrc#y5pqY%c#6Ex(ZjwDB?5TKt;%0IJaCer?&7;krlfdEhYr5g3~(+majZCA4os$jGX z*jKevYj`*0A|{-B3(hZy+m~5+W}<9_PW6)S`~E+ZWqxH7U>7F*=V&?XV9%|t1$VZc zv6}RAJrEmcxZ7az+*JY}I(@NS*5o@2>jNeK6+k|6nwKeJ0A-Oc2I;MrDYuEHCcqI) z89M9XF(2;k{9-#!?Q-#N6a4SD?axmp`0~u4()*pc7XL;06#)dl zk;j0!ysOs4o6u|55c{6$Wn_*jlnxF`6ewgG?C6h`7yv89{mu-wnLc@Ny)-_9P+A0T zzady4?qpX40jEG9UF$=sVjRvVtzI|InFjP>6`fDc0aCh4%XQ&&$Px1|JX2EWSBq-$ z+$#ylqO8%!{|+8Ql`(wiuoW&p*Of2xSTvDv`{vsYB7#ZN$!GFp@s+gM2{v(RNBAR} z(XQk{IzH*sIPM$WzNaq9NQL+WChjk(RWXUsSR?F7hUX$o&_uJMwgQY#Z(SIP!R)_j zA5vRy$(C?BxBW{ztGw?qV)%ER;MZ0NNAMg?u_eJG0FYmV7_yuX?}-JFd6pygc#M9t zyTG|`3u$}=K&WDCTd7)n#{HPxQ$Be&t*CQh3M&EYx9FB35%R+S=YL!t7#xjVK%d_E z&>svOm@X3U@J{mhhnlCyVa3vJV`W^s%7```?FS?TmN*0mIYTxgo^Aom)Yywx|C|#` z+QlQ83rzRBO1COO=dds7c+5rM4v0tMik?>gw|@vZuUaqwU~jb_@wUI%c=IX{*2;gdJFypbyHe@4Y%U zjNexV$@K`dC8u@?RIrXL`p@?uC!ysDPXu!<%(x%yiZxz#{C+$@{{Ns?gKa<%FVg@l zcUmffTfae%T6Wniz9rzvHIT-T#`-{N3M3;8w*WE9?--0Qb}xa#GLHz{C7EVsx#~=Z z_M#K-w>5Fu*+kH7rK-H6jui#7WllKg%n%tuh zV~89yP7L22oO)=cyv*lFSq*YM$P5yD@5JV$e~Ts|IT|jB>HQ66rl|y)mk@;&n~itD zNYO8EgY?3&R(07029-rur0LkfqebBUGVQ;PhCfFw?DvLfl>CYl4K5=O297kxQSzoX zFL{qLEBbK##BS2zdW9}fNs(cN_Cb?0Z8r@PhPRnAg6K%FOhsq`J-GN|Q%?jACl3cs zWq(D(`XZpkKDJT7Tg^Y9f+U+CJr>S1heNT(hy}@9r%$mh@4OaCsW|?0!XzFc1(@j z6DF@M8-OHg^t{3@&KYCt_J|c;OjQG}M;fDfgzTkM|6IVt|L+A5HdWxi{Dg4^g91ts zYyVoky^p_?d58`f*>FXE&4r-}=|zQA$~LDr6%$WNho@@K@U7}tQvRue-eER=x6r-c zYm%c0OSCvDtIoRpb^Kxvtryn(*UozQ;x5TfXGvFg{P`yv@vkNF$EQ-QOsD1BR|XgU89KrC4C2ydnwm(9v_GG{hy|xwcENT< zGAIT8VQ96?X%V~I-XN8nv^Uk!>7RQlMwd$-tvj*6VOQaN%(D$468_zWxK1G(p^?!% z@@oh#=exdExipCkD2YUr0MFgO6i z$?G}fE+4@cEltRlA`m7vIXsbf*>@p+1pEBleN9wzD4p^}pe+5diW1%fWipQ(uc~5s z4S162-S@th|6kGqQ*?#gzlz^T9Rglt&_y=C2Rf{frbr?i#(<^Xi0NsCj6r(RF(!5x zw7?$jj5m4e)yl1sU1oN)`gU%Jq*^&<0hA8%%n*F3) zT|5Eu(xqTw(9bdxbR&Q?@d)IfJO566O*wZqXugQ=r!+{aD)^<08@&WMl=lRX(_y0PLglsCnS%;Ty|;rIcFFT>0E>h%0>O>5-*12p$||)ff`{NwE-c8ZF5n>be2!4 z{weFZKxCMEjc`K`+c!S|Tg^sAY6^C5ATk}@NI(#>jOplT?&UJGX0XLm#ja|-0&|(DeJ&ci zMBx(ee|A2Q_kmyEF6I4c_-BhS5uioO@~tR-1(UK=<}%{tV03#LK_IG~B!3x+!!e3% zQ~gT%1sgBh#1Spnqq=1*Ep@EyG@;@rE|Od{e8oDeC4>n2&r7d~5ku{XRsC}#?P(i{ zuK`@+>N(VLX4l+8N-~1F*-Th)0Ause_2(!1^X}T-Gs1*7m_l$oOe4@dKKt1=&O3AZribVo|!~G>B)`Qbp4_UXO78F3@@s<)A`(rgwXn#A6u545Tk1`in>Z zDztqQ-1dVGbp!lA_>wUJgnW0ZKS8vDC*{8ykqrgZgiI3EhC@vC*;+rSxHD9P{dSY! 
zIm3uhLAIQf%E0~l2KpUMgd)vUbZer32!T5tGYX{u&M8)_A4^jD7~@X$HiU<-1cQpD zq#DA1K;7sGdy)RCMex7Qvj{rc>Nf77KfAId&mjwkfa5*DjmHy&PSBLA)Oo+GM6;xB!@NKf;5 zpGUIA8`y0VNtaAMb)?*_d-nG;>E-|1@$xTaEd|n6nf|-2`p=*2YQP(~r2e13_3xkm zzxYYkj}Huqjp(kA;CJ;$MwK=2;Z9 zLOyBE_vAq!<^pQ~$pgD;_VM2(fij{K4JAWr}~3Fo_0h_gFo6Ixkq#xL~=Z zT^M;RjFKNwuvi*qp0pMsD~azejNbWmb>j6z=m#aQ;sN!=waT3ryK|unv@T69+m*eTP{j^~`FWV{g?~Mn3ehQohh){BjXpcJo9{o43($Ew#M&It8 zElvBIx>zDm28<90V7dx?1|%2>y02oB?Di+Wz>0bSvSE)uK2r~Dn)AO_{X5vN04x{) zL?px$9XyqiPoz~z|ILn^y8{W0XCq(O{F5yTPRrVaszH*J$o>ek$ZxtQ)@D9wfn8Z)(it#qA&j0v(wVmW}wZZvoa(`kl={ubw zz=#y#WFVv&r-Qv->Ef1s@TIW?sUr}Ml4D($5hrm-Bhh?$2yr`oA_qV$FL*3*{>^@l ze@_XJOMq|=yHpp&dVdF4N&#U53dDh&Ft8qNv5jBn!uWuoRR4x<92_in>>zclsu)~f zfMoX-ln)~W&tV%A86^0aNE;u{LawX!)Y+1hZ{ICt13l~G@`kZ{9n#cNs%4*E`m`5cUi{_L ztIHQZ^~?IcHKPtE+XoECz4t4$DP~}d0E74%pkWV#iNrJk8)6INkpd2{dypW&2HwY% zyXUTqr_qt}_ymC8=h*_7Ko{!YMwA$Ex#}+O=(oK`D{iP(B^`6yMS_MQQh;1M#I7eC z_y{Zs6>lxH31!zxJ_BFNz@7JT7kIh~r@N!?zJ?U7k}W8#yM^<|kq0D02x`;3UHBb# zSlAf38WU5cuQjue__T0-etJCYZ0t&Ye5goimv_)1iGG}hQ9Ca)ChOiypnt)TV+MjRBe`wPouL2C6??Ia#StTWaQGX5hC+J3!IRcEWX}mQ_igkaKU#i_w5TQQ4JjFT z5CuU~R&*$@-}sUA#4gw*=HElB$7f*Yd&?v3NRgg4JM}#H#32v{8xL)hFt;1kV|uc; z8zvDf+cj_xVC8Ez6R}-vJe>ZCq#@Ira}g0SYvH~k-2dDBi>EW#L#Ps53>(^%!+1j%$M z#bni1W*VO9TL2xGnqw4X~nQKZ;E0f_y|3P)?5_poDQs^HM|kgl z2oDf<)pV95+B!N0Y$9EKp?h@8TOVFU$NNsv&GXT83*cVM__LA)(pGW_X)Z-9>3ila zt$Y`b9XHFmq7t&GpC)~_)8y|aP0Meu5vhCZbOL;_jq_;ma7{dAn`jdC@)m$uE^gp{ za=4oFWo#<<3wd~`wo_cYT@B7#@nvPv;kya&MHwO6BFpla01nj5#?9S^3fE8eSyy`= zkltN3Y-eW?xwl#NIhd#M$hE6IGA5d9gCj*A-5fvBl2Tf0`RoImAIMM0iBbqZDV?>_zllZB&P-5IgqbT0M25tJtx=kFul2Rge{!P;3| zM5D3TI@l(RJ2cDi`6L0UQ!TAz5_^zuC%M9q4wfpYdWG@+zy7OMc80z;R9xTQMk`4uob=6ZX+r%DkIyA-Bnj?CKo? z$-le6zE_%=Ogx$7m$7PT-Cr^%)IWLi93{doNwyqWaEEBg0 zJaLVgSYGS}pG(o&1^uyVH0t(37%M`_YiJpwvV~?q(HnF9R6y$W(=1ffO7R?pH^}#q9568nzzgfXR;J3$17?M+N^vqw@LeQ-2p~Iec;2RSys3i|eBV zH108N#YIcFb-ir)T=&G8;fi|xhi$6#jw7yFw?ZJM4T;jV_yomG<}dNEa!7q7W|;cz zweQJc!hs}UB_tDxDRfGr&Qk^8v-pM4mI&$S0QHr?fuh_ECH-dN75f% zOxdL{Zbqdvhs&Dz{!H*|qhmzodU6^zmQzYQ;*gA4Mbi_Bes)~Z)!+5M)cL`asP1#I z_aPu4B?M^>KTyObk~R3QfOLm+t#)!%IQ!zQ?we^vKiN2s5|!)CD}Ih@usg-+EYX7d zX8(mSJ!tCeRTR_~i(vFoKuC4B#~huVy7BWo=QHge5SYu?Od0TC+T4aa@)%ugqIn7! zO2iyC$|Tmtipy}HyeZyW^>^9ViY9H?;{GiA&MT;c@va?Z-7n5Vw?CX^v5T=h#wtFI zyUqCNL_A@!Qq$^BNy=-Atx;RwQx|aEdOrjDPXtL?t-E7Jr0zG{HOl?F893{ohVYnjahEO2IF?BsFy$YDI`Dz zcgoB@Jm&?q64W_o8POC(mh``=+#~3$$*i3Z4%I_{Mh(&_8u3Lk?N0Xs@O7+_C#N4U zL)dJSwH5Db5~$c_OFGYH*JysfM?5d%z@2=PL70fuA0!Q+EtALPX;K0A0 z3is6E=v?$;7ZGIr^0l1rw}{8LQ#>tMqH}k2!c^60FNRz-i1D5Jc+YlH$MiJ2XMbRU zQmd?Yk;jBN>p=AY-XVqBs-#byROV6T+upB_1m&A*Fc{HR*a zUTi;fsuVJ7r7n8Y1w>-Y?=*C? zxQsj#(Q|7P2;3L$NOwK`=VXcy`;U{^&jMeqc67SXELwQ9Mv}at&m$NL?g!mEqLCBt z*N~{JW0m1h(^j=CXLn~T;;!5empdvLEXXGQS^tda3MOybbV)$d8vWCo&UVV2d9C}J zQ`UDNQ_uYn5SI${Tc!}~-@dl;oZ3r%LTQU5%^aOidD`Xs?}c+x{!*h zRGOC`+eGGhOiR-nisAcaLzvx6DIyCg zg1IyXr{u)Mb+nJ@QetJ#`ghB*xwQT4eXFXYwT*y{<8-g zdZF4`C5x0^b!@0>d~(JrtkzeuJY_Q6%I2R5)-!u~VLOpsWgC!vr(=Pqr;I-{=4N#3 ze%z%#daagCuwN}=U9yooRfjF&cx(DGIqes=zmgINx~@0IP~@M|m0~`R`xcWWBc@u) z&3VSq(CzPa5cqp-47|HWqTHt2&|lZ^%KF6L!=MBcPs2AjTL!&cQY8BOg&rJJZ2hqX zuKT|T)V4EyB|mZYe>YwrIsggCJ9DXt!ODUovouRZCNB?#7AFNVY9!{SbHh`WzR$Rp z-*7jRy`otaLxt|DX7utzt8fg`4F2&?mr#3ttI+uPqmP|HZldrM$riJoRx~}sX zckgbU{`0DckRK9~>Z3Gh$!nQ%o`3 zvR0;zqszPMVw#PXL}K?lYoc4IRN{LHZm~*^;cZDPINf8z<06A(r!}&pn3xUZjy`87 zX61Ny5_Qk-*|*a*$Wg9|lhmI&s)Zu5G;eGuI^52e9>V{au(^_x`? 
zFSR9E-2Z%v`v3J)D)-Enjmlt(4`70uO!x)2yH<_Luz6*WGquTJ}|D4*y|J!emUu8x9K5XkF7AV))qT|g8=`(wOmy8fJ z_PhFmUdws|5JhJsqtrOWCOd&>G!Hc=JsC?Zx+VF7g89%TgPkljQTg2t7w_tXx zuDJUi`TEAmlHjJN`&5&mujr?EP45bR2^B#;Q3N`Jkolb%*br!u#$eI=3L`q+UjYy? zWR1-@k0k4WE!nR{jYC@?2p|9fe0#o2ufJ43YGkhq2>dGXiZKqNI+nuEJP8lAvGIjG z7gcE11V6D5>iT9{zujtU!n&J_nm} zqT05Nkj#z*dvOowt3rT01dNdfV6T>Ah?1)VVeQP^P7?Bd{8^Wp(pbeIE|$E=%kZ2c z#d|@ukL1D}sOgGt?~eNUr1lb=W6ry}QUZZ7epf-inJss58|J&J0D2&Q@tWUyzwd?5 z4^57wpP*oYVyYk941U0+!hj2!tK0tR|FH*svu+H-?5(a3GMIBEVwP?m0=j1B+r|K~ zC;6gbU%;KR+@{EFmVY3C}MgH47B^)sh1JQCpuyx=!qlb zw_r3{@a;^MyA-4jWeotAVw z7)3^@^*va2^jXS}@tYl6hVaN{Aw0Z^ips0-V3c7KE7^`9;ZXGGFRJ{Yo}GO=K|fAW z(IT%a4s>#WR(?Uw;FxNDP0HGC5ZUYV2YmVT1OA6tc|ZmO*2+69_ki*J8eAuA1M6Mo z4DyyauB|l@M9F+ml~KJ5HO0T4A0F)stE{zY6CP<%LL-27`jt5XLb)4zA^V#h0O$e< zN9l@T;avjMmpz!pf4P61Y9?28at-d}ui*7=K6DN<*H=Iq0c(5?%_sQPDnUQ??9I;= z7|i>{avabb4y4j1C@o|_LN_I+I)ZC07GQXrYr(COi0)VN!x!NnEetj?DHKMwtU+3*0em zXt|gfthvr5e^{s%q~>s@%g2rPs-YAX6zzL%h+P+81FLs72lR}JDmqAqnO~$e>fi5w zqdd{>neIgRb&?}`KjrRpg1er~ruzC^OPJj{1iS2~pSk3aa(Bfz+JI1n4T1WWvAA5* ziDz`t9lUL_pN^FYRy}Cpexm|7ZX?fspG#o*cwXEE!?uD-xECRSl$Eo1*1lN6F@aAB&X;bxb%IwxPmI?YL1LGvwXei2 zl`4{6QtIhrV>1ehPw3#vU2Zq1ZD`3p6J09f=BYGirdvmUs6 zF3?u7)$|m})R(Kfz2|eIm%wnzJ!W8i3V`XjvL&&`Ghf|PF!kR_)i#Xo>B!z;Nzrg> zk6$X_7L+0CeS7cjXNQyxOv+wg@9lPVc$V)}r|^Wx8ZJ9Fd#{hAFMkB%i*e=O1>|5I z7*qYc`te-yz5q0KEX4J9xij~d-Z3;;`qj7q+ejpJ3VF27oG{EU1B_Bf7)?{TO(Vk? zzn5{fOp>(9Yz~V=$3Mju>J{Xu#tM09|7n(xAGi|mzp*n@L5A>cjPT`O%XeA?tcau4 zEFx=(_axT|>VGoh<+$mLp@_v8ULZU9QvXC-Cp40q3c!4XvUczgA zV@3TFi@c1&cp=3mT`MmYxDbt~fi(RYD-{ZF2>T85 z5r&^kQMqR$t#^t@l_g1evh{A#J6ymo(bJ}CfO+i5%SmV6tfYK@tu^7z0r1IW;$=k& z$`McQuee`BYTNG7vM(FSs7hUMU%~;OAAWj<$tzly4US?NZ|Un?DeO_BoU_;{UzYEm zu>6vgx?=PyPMkB~I`I-@+puaCw*3I7>LQwsKU8ge*LBcYL|YzgjLa&mUIo!^o+xM>Ha6MTdb1YoVP6}-VTJgFz>_J?H*>pVko)F;m=tSNN{c@ zShT10TB&d224-Te!}WcVYb$ELjNUQ&x2Gk^DSu}@O^J@n%j24qM{B0gD_xf8%q_Rt znAfhP`d<@viNr5_Rc`sU=QC98BYpRn2V&%kyxFaEX1HTbD^MIbt7-eS)7Y0{EEtrv zHZqn9oBF9)9_rb%wgJhvS2Uvd7|BhD-u9g>w{~mB=3&Pb6mxNeRu+~K6v2*L{~&ir z8Mu2i;!D$m5 z;`l6a3H|8eY|V_3M4De+_c~NV*lyvav6&(LScoGI~{hZG?h zGDV?`4VnvytW?6P%sxXXp;8etB$^XViu%x?GHb1*l1heB2^lh-`^~u4KIb~uxvq1r z^UvOY>}zlG8{XghKEwUo&;88g<8u5|;<;PvhhehTRx-Z3nwc)isukUSg)fg^Io*18 zZ9u-CR3NiO?q}wo86(8ptdh*jbP~FfzS~N#>=<3z)~8+P5_^1ilG~XJ5VMRxbvp9T znWCjq!p>1e`gz-lMrOB>@tK4MH;-wQO zxPCtUYvRngL&dEGsgEC`Xz>i>sNXt+sd`_~InD?OD%{w@r#7@L&NV6f?Ftuq^bZY* za~qi}_WZ5y$PvaJS)vd)Lo}JOrcChkz4fl!vJ7M?C0ar2UF#cDFISeu1g0#NP3wHP zFSRn+cS}>9s^aJ3K>J0Ra^y`!^^O*t=hlHZpwlb1-HW{|b^LG1hp2y&71WQFRf+WE zSTVcy&rqt(4NdP*F=4U!M}1o!mFu20ND9`j$w-kG*y31u@%{RV=*75~O=CnXby%~4 zR9WP%7`NP7FC=Txpx3(R(R@rNR_HzZoJmue@2%sn5Rgwbo|wE)lnMIjB`h1Y)wNNo zVkrMru^v#sIhW&dCQnoF+@^-e;1w?H#Mb>}`%O^kuM-KmDUhTP-^d!HT;>`=`}t>~ zDSkf&3+zG&l~r-JciAk9JGkKq4%GJO`N4vwT ze&2TLztTfEMP-%Jc`N^ze2MWDig!d^n1lVM&crn){)qED%FvWwO_y(tn~+KO%pt74 zEdKxFO@x%wf8J`q^BV`yQNB_RVvi%4*5oH=#~XXFJN=$r2U;IMud;WFTi91c)v-upBs_;ec zIQBBRP@Hnt@QY>$Ct6LMtg-!IOfM9|8lGy{vf|&rXr6^%6jnVxtvv0O$c^Gp!8diS ze7%$WLRaKGs{b|%(jc9=8vH>-#u!w}83E;jlOIvlsTf#TJ5ny%#41;JW););r&T&Y zS<1QP#FgvhYL6+9Qj^hRVvJSWHJl$t?YTeTidm%{7r@%x8;)T7+=8mu#$Vvh?7tC} z*v+B&&?9LK=EBNwhA?cRhyBkpy}7;yE3L(szU!pl=$)Do=xhDZZB6NhvF_vs%+ zCsi#oaVby}{26D!Kk_mGas{@cE@-f-829b9ZE+qPvnG>eJ1Dw7^KjH zl&LJpnJ?v0E0IcgQ-CbwyTmg&(H}2;M+XTTSmO;OVQZ!bW6H8PZVA2#jZw&WA{bz2 zEWpsd-6!0sk_VcY1yB(q@<<%L4^YU*h%VYN%IJZxr-18!fPJwR?FvYXAbAtx$v~W5 zQDc*;2c7DjDAuol8XXd!OVE5E=@AI~-KbJ2)46ky0V7$ocyV#f7bwrC!s&v9vMfl0 zFN}z#AjZ)7Y(BYHXpdMs%;TMTU%a-*Qi+Hf1q}rS2R7W^Lt+h-MYkc#etowUX&}(z 
zBaO_v8g58ep{6#l1yxaalXeqhdko-~)nnNB2?tL@Fn$p5^;a%Ko~_)iRK))qUIZTyeH$?wH$(r=dKls}5c){iKZ2n6 zX}~Bq=Z;r-@`EG;k@Vw?zq;Y3%GUq^g3N<*M#g2@xM<4tOF{t;M#JTKAQ+`sh&gK|{3s02(HDGCD{M5b`J1q=R@3_{6_OaQ(6c4VG&uwN|?r#$naT zw2o+v#PPF=Rksj>Ntt+EedZ1QKAcUrFueBYr;12lP7Mj?GnDMvL@hpPRY){@;+c33 zE{$eoWNtSiE2*j%7y*$$@v1<1Uk9aF^-m@V#Z&pA@iVHZtVNP}`w=|89~m_34#uui zKhsfJpf~^=JW9_f@aga z3cH}hT*rG?Z(gCiMX)%2IO9`r$z$CG>*m6i&~`GT|FgF>zs^P<(@-#zrPA@{5(lMn zz5NdUggpL%Qf^(1vNli*^-oT12Xm3iUd|i+Zm6K<7{wfoaCJW_`i&0SeN)~@$cLWV z@139iF6`%8Y%mRKbx^O=ZEgiIZLutdQWz*@r&QLclxA=zbN@k8tdsKM{Gz%AjCe7P zta8_qU~LXH_3h`$cKrR-EH*HY#@IBNZ+nEtZKd3*@Y;Q@C zxBb~M8T~Ogs!aL1Yr;1Oa0yv{xVwHKxfp#HlNNm3H$gK!7lGntH-Ze$t1j~$JJ zoJ3aMVikP7OtygFjI2h2i-H--cL)3D1Ofse@Y6>>0}Jl zJm)`ri6Hi~u6?(}#A}S*5$D<#=-_9ARo{`iO`UeK3@O;9WznZwr<~i^|BCm+&RF?9 z$~Wuj+;V)Rr9&lN50+ugn%*T*|nMngTNq%wrW$ zQRhoNNNcYjuY!GjGJ?@r4?17E!GcM@hIjf7E)ue!S@SmZp!dkR6^J}0O5ca2e|ce5 zD|Y9)8$0Yh=5|Y?GtkKIleS zk*_qdgfw2VNoJI7At|zj9v!9Kl{tV5tuL@~7!KaPGlmmRfP z9Pf~ax})hN8b%~Me`t)dK)!Xugw5l@*Mw;>+P3XQYTNayK++itNTEGkvlG^NQay%85KE0 z!joyyFceu24p8AG9@`PT6g_e&!{uB1K~~}yPgHlJBqJ7aHVg$As+pQnxOQUQjOoGk zaSSOsu|fNRka=tU*eaiZv4^^HL0VQX*lLK$ZxLCjU!$q>O ze?Ilw3M)FFg75UnfpCz3+ne{5|G}08MjnYscC_9S$aug2UjH}R1t3i*8z=Vzop!sw z2u(9@Sy?#c2?3WI6A)7zHOAI0cn#kS9D6_V>a11-4A65HkZ^XTuR9EIR3P-^IFnRW)Xlcy2l6iG(+|Ce_m%DW@GkYxG{N zd=L~UK17a?*1B}iBHy2k)BAiLp@qwxVH}vR0FGlE;GDNZp9w8R8V5QZyNJ$@nAhED z>(D^fVQ|22Kkgz-0e>^wU~85Izn@_EcAmb5o!2n01hPqtoZYyx9{7y^`eQf z*8Zq!bMd^*gU7!_Ji&=YYq)HG7nQOv7WSPR_i;~2(AVrM{P%yD;UQvX(|&E)_-~Wg z%#U9zt|~eGA2u-!OV+jO-OkCwF>V{<7gLrV%Tf7#4~ulBx<~T4T8F1uLE0RWCzh`2 zm7HZ^PS@Hrs6I!^d&&1#cc5`QZz6}_%tC3Gvw2&7?f6wEEv_7??UYzf66z)_IGv*G z7z0*^_S`Fz9)0YOKsrQDlk#Gm(mWOMD8=Iy{vX_gUMToK%Ul|YmoNU+c2;N;cKICD z9LH42ZAl?6#dr7byfpE|G&?gkVi@S5^X(A(WU{<@^dG4U+wlFa`y)t(teci{Xv$4r z+%988tiB-~^5hb4W)Vv8!3T!0nV-R>5*frG2#7f*-CKLpXUh5{5J%+|_hR-+Bsjw3 z840rz32TqZhdqQgqkr>JbL`OT*o$OjCWuv~BOMY)D%__qMcOAUan_J*FFR^dKiSAI zy!F9>r1@cnb z!8f=VM&;cF%^m$4Hc1+vpqfqsR>&Sk6J55G+&)Tjj2XuJZ(2-7V+F1Mj2tcavx9B85gX6x(W1Tsz8ryJsdvDcKXNz zvK1W_HTlDtnA>wtN_Wo|Fib?FZpzsVtp{Tacd~hFmou)jPk4YpGKT||wiAOX3$`sI z3e>sv%%w9+Qm?JM9?JUtHiyJRBk``msWTDpfK6Zfd9QyyY{dr(#1fqyO5pF32D6?s z{Rl;_t$sPAfV(u+JHZUgi{|+`qdN|*@b#ZMi0e53E6knMM11E(U+% zA6KI%5QlRt?yux&p5gZzI^M(WJr*S?k_#2gBPmv@vmwZ1q=1F_phfWUU<{z;PWcYh zJN4JubhH8;HsZk5Ni$)5kyFU>ui4%A$L%mYG3jdrs)PO%9C8sd{15tyq6Z3O8pxCY*hKd5sA163quPr)Yl{nkFitDDb8h5hv{V zW(=-dCc}++bUT*#Tq5NF6w%Q}y%qzuNFNKpg7+jsLLufSOnKe_V%g)aRT$01W7yxc zdEcu~fTK`)|AdII9-S(w`x^@oe%0IrV&sh&WtC&JfIJG7#wQGiI!oKUrb8X3c-{*? 
zB#O8YR0h%y&%fISfVRbuaEf4Gj^D>@gnEp8P?6>DCj*U9Ep+5%4pp!|J%kZh2s*dT z2dASy9eRpjOQ1LE3+~^D!K<0&qZl6SqmyD(oojGuHyF3%1#*71Yo%(HwMmittW>a&IxW!%&R?m_0MVyw3)&viv_ z`nRdvKVvne+8b3 z=Xr#kU}!J%xK;0iFIlQ!z`sQ1vY#Cw&Rj7WF5pN-UvIqzn85Yf4)J!qz_f~6Rt$;w z$5F&POm)Iukyb8c+z}Y5qFAAF5N;u^XxB;YtdGVB)*Xg^azu&>F)WyC&FB2N+M}f- zN9?&Ls%-+QcOX#fFm3&Qa1#QQc+HAL7mx#e+Ethd8KckbjqBnzjdHTkZqPY3pFQ z1Ca|tB&A&lUv%f!O|wzQ>>P7ca|&!q4sDPpc5;xy-C%n>_|1;nq|pjzA~stzW{q=2 z37olrs^hWH`eEG_q;(9SuKvsAf9Z0Kbu#Jtd!CteE-qVXn@3_Q??>q(5M*QZNIu6f zEg#%y@jDf1DT*GakQ&o{1U)aC+%?$yeeJ(J!)4fim#-hTOs>uNKd80FNBKYei*$>- zZ*%*`!fVDY*2L|e8zV`1c+8Gug`tFB`(D{U4jbzp;jQKDDsU_g)j4$mg$J~z^DCOh z@@{YaEMy1}5yOsJ`c>S$WLCH;bRJjO@jE3Y<|vEwGOqSJ-FTS7*&H{WZC3dS!y`a8 z8`cr?3Q*Eto72f5r~qdn`(*78v&r|P`|#CAizu{vrZt|$)lP(bsdWo_L%>uVPYB4g zmPfu394~|!Do^|w?KT_mme(aG@j8~$omIgjr1z}|^HoWsFW@8ul%r&*tt)AMJkmc!CY4 zLI&2NCSdA^K=DNu7=7qn-SPU?SAhH_LHEZfVJ}c!2Yz)MNF<61TR|ciEZ{q(gV#Ho zVlUJCnGAu&q=zl=rWm(-3h{OA9W}4oiub4x41-3aC)EzBk%As zkXG;=U&4DLTbJ1Tkbfcl($`HsK{GmN}7}zrsrDd9I=KjWBN6y?J9Upv}I)0j7;F~*p;gPbh zlh6tEDMS_edOi*AIb_PW^)DXJr46lnihc@5#~J=oYW77+*6zbMf5Vjx3;pKijt;b@ zDONQ4$l`L}M*!-=Dqg9-Wl4r0I8xJ<^R5_$I637A8~AZq-;L|#7VoWDDpF%Y6CX6*L{sSwVd3dV|81$l z0v_>!nv7lYcyy!3%Hd~UM4GKGxn;BW!DQGGLNQ^8(jTxQND`qV413i02abTRrc`gI zcv>YTtNr#UD9>HbQg8RYH#HBzi9607oVcELi?6X>^D^z0m9|)=9R}4o@X_u-OyDDW zgrB(5iU7TypDD!?Vci>P-C{!JxbLmrzPY14L&Ev7J^ddWU)C{(#uE!JDD5086yb#Z zjXX5AJJt)zC+>{r?dGYzWYk;A7skg!)ubNGw?OgmCA?|uypx7h1X(=vgAi$^uWBn~ zNQ7S(i=wITGU+&zn^dq%D*4IuedbHUZFu90P`{CjFxFC|-OO;ZO$)yJ$?d$@_peFE z;!jZ8{*p3mcKK$^PU%3A!=>eNTLPjNSe-*hsWp=G3FAt8k|UI)>+PCAXZ?hc*7_U1 zhZszEFd5Y`Yn2M9q6$hA~IVnHn9?XBR z{^Dxo?9EwmrNBP(f=|M=jb(ekQ*lv~4F0zNn9QIyq;OASkncHemHeklI{^CeNQD3Oui!^Y+;4pyr6 znPvx5eC9T-S&Rh>H#unPbo@R`jCGSrv(uJR)$Uwx?Dujr<9mS{#onYnp@5H~zm6U* zawqKPk4a3aX%E|>p(*EuH!iHUq4Lg&y-Dzua5d>r3FOXH&|}nm&XUFz-y7NC?QC;`f&A zR(YFDDjn`~n{+yF1or5AeH=qe=M?ByjrDNiYIy_(?NCaZTA;D-Nq>R#)CZdk!iy_{^Eh^U{2yi;OqdL79E!t{RwbBsEQ#H`($74YOS|@Z% zpoVbl)Ii~1h&VrD420AyNETEBk2|5fDjVkx8{#E=Y-t8kojjC#ku6__t_PAT!uV`U zP`VD^3?io&SnZBXC7@gtKs3kmJo0BGh_Dh{U!_2$zTNXYOgAX!pLW2hVYtQ{Op-uI zeKazb9r5yaMA9aeh;+?AB~P9Jn4&2z34Ms4c*=j$oJs19mOzP%UO=(W!0)2lRZPRB z4E2q@$Vz!7bmC152Xb1Cu7}|%XNsyfVPaMrg0YqcDI6Unh_Y6m$T?SafO>}Pc6nY! 
zn$alC!U)#Q68Z3Hts2%^XewC3zOUJ6;mn^9==(va-3^7k-yy%_kIZ|CSvn@m<0LN{tkU_&y`i6# z>nf}b3IfNI=Cfy9B@_*$A*V7LF$qvWA2{Ada2a?~F+QTlOjOsad8x?e9;EBFbJCG$ ziP<42;B2@L3GLWMluI(NDc7T_2=y#$F3!f z?^l^0)67cwcyyYWLb)DNMQa%7q5!ANG;+uBAy7M(-f*9yxWWn(qRxYnEt{Fyx|W34 zvm-o_HD+|uN%Diht4D|o`zyW7N({e>Jt~kI`Ih-^6V38IuDhM8)*yq&$jbsrcQTpWoq=8Ymd4vNp2yDLD?EQ!4-gHI0+OJ z=cUAO&q;m!hAu6^=Q@*ntztzkmdwTWp+_l19Ac+@?h=!JbwTM>qRYgrrr#)++~{)a_FYJZ z6e(zSnf*nmI>e2Yd6<%@;fTc~8XlM^zjOw(u&Sdo<*sOMc=0M*gFn&2CMe?Q$*jLV z#uKUw~{;t3G5E$aqUt z&vXLbzBkWzs<4WB0D&g`Z=FOZ*;5%tN(2-eIuR{_H?jR$LOay@=JH7X`NICicR<L6X_`JTguJsZlEC8}0qk za&g)}dP>5q@t&cFT)9CeAGmHlOb}MTnot|}GI^Y`NB1CL`$j9*f8(UnOD>Zp?YD6& zJBcUqSde5nVd%z99zB7$x30MQ*;Clc*so}W5}x{k@KFZ!pzvO z@K~Tq2p>0@#%fgp{(qqV{g$x~e$G00_t8koN-rBV>7K82=?NKqnh)o=3CqNUv z-n!R*GZTK}dhyM8j$=d2>kZ)X)#$L7Yhd32Y5IDj723v>zV; zRLkjq5z|El@IZb_Nv!_+5Py-XtZ zSbgI&zN~a2NQUXXQKHQ5h8Zb#=bDGIk40ZuSHflG@l(}$VrQ1ABGMThe*PD6dhs_X z1aGd+Fm2~DZVFjWpbx;oU$Kn9j32k4WzDqo%wj8Vi!f4WgFqt3Foh~p-7<=SZe`s- z)4KwZyvB)Xwt$hi>&=}X_;OFR0W$;7jxvpu@G65Fe|dA~+Qy+(8b2t0g7LoK5K3$} zW_|bmc5ij6zQ#{r&XY#F1!Nefw~xA9cy4A;oJMcEU7#84tLiTcU@GEqL$@WbQ5xK1el;k zCgYscbgVYfG3Xt{|5rd4vLk%AYMwd(=h!^OXLvTVqz!moaW|$&t3mISZ%J z1H;L|>C9M1ki4c*wzlh1+L=GtoT$cHzd|IPdlYC@X#b0YUJl|WSKd^%{-Bm5H&Oju z%#9_%i`dL{dh|2YjB@UG;iAxG-0_-M%5ZUSd2ZUP83fhuWNtB(Sonv~oDs~mwxEU_SmNPY zON4e%A6x^Nfn7s=V=?UcQ#p!QmrJnsK-VYx7WE&}Z~0)+rwq(7I}|A_`|U0t!)D11 zq&USgFh)H35m&>=PidJ>!kr2#2Ah*mR0zH3_OJ5w3~H!)k0YF6&BDybLy*|2CmOnc zf(rsnhD&6AOjk3l*e(KL;$E4zV#Kk9Uf}bxv)OW{M1qrJsCj!u(a8yW5zj|*knox% z#L^52HKmRsrZjRY2?uVqzoY)>{NeKD44J5~8N9AOOlP?X1$O@CyVYHzeGuwWs1xH` znEp89EOR$aW0w;STAK;O#&xcX2c=Gej4JeKIZ(00)yjpUtV5+{O%{k zRoYi87)LyspY0Ydnfa>X55ox~P17ZeD~YP&-`-1KfziNRzP+qG0+F%KTjp53&q^h% zPQREO<~q7Cv&@7B?Z+dFG&V|W3WZy;+JZ2HewJGGGpHI?`smznlqVC%rLw~rUUCe_Aa(cTf&fObp)NZLgvhni7w&3GHDJoNa z>PY-{W$DHnP+zt9^5lEPzX>vY96D}JW=a9B~bgVf6sAw#z4hq z7#8RK{dEnU5h7b0zm}@*@y4m(%NVFgHV)xfWzuEy71+u(t_)07$Cez;=_rc zNS-Y4s4k#0NlDa`YFj10O7BIcZm-bV6$pip>#ZoJs}TM|L{6Il zD*pj=45cSJ$4K?DOp7@*hzcoy3Hf5VU^7;#UxuY64NMza`y zC+dn`%aO+;EFfJ9n=!BvxjBd10yT!oj5%hZ!>p=_TJI0dz3OpbDJ{8$f#RM~7OCUYUj&$thu&%ahnSVquJ%I0e?%XKQ=V zc*RkqJa_9&J7l0Z`FVh&zpqAs5Y&F|l%!XgG`NC%i!6k~@}qJ*%cinx>@>E@Hy{{{QB|=6FL{K%~<}P zFTy*@lPUQ{qyVZU?M=qikXw50r+Y{~wl+F;8OL&+?sCn;LIdyKZ}SOG|HxD%mzk3C zC@54&+UV_!#H@I#MLoH!?sA_H>#JKPB=^LjW*AA`h${V|-E0|M@Ajpz&K4hHbf`!IAE<=P3T<~5-mt!TVRfKbA3S_d%bAujhSAuB05}=BoC7*`lCMNzvO12)M zgLgafo;&xMwWEe`4T-4JaeukuBM$&TNptgW|EOZ$Fol(&uK>36+2ZAUz-aVRV(1UG zS#8O!YmC)C%X;l^%er$$4T1tzoMk<%S?%JCWZw<_(r*}t_h%bLiNBWCOxC(}Rv>XN zZFwZCwO{t^^kC++dQ+#oka`1%lE_rwqx1f7T((U8?~wZ(2;j?7k{0l+e=J^>RN#>C z_yvcAOo+g&M{->MdY-91hxwdfm_*bjIn&HoGv_W4plitvL1-?h&U(>;Uo?r#QlpH> z>V>pt<*tbrX${AvO>$TaQKyhCVshqPATF~{)70<7!{``($Im9N%eW(>d^Ul70wXWA ziruA5LOW@{`t#w}4VFE0Qy-PMgbk@JEbvklOky4ASiE_}pBv9=a*Q>5fW0`qc%U$5>)08Gvjp~Ncs--HL z%+fzXJH?H>M{>~y)ayiJfFRD7Og}a1;{tK(eI1n&IR$|S7h9dyQdXfy3uNAyCJvK8 zD5M&3wX@O3bAVFFwDr0(+sr{AjgOw+KC;bfCHNO>9i-Azt0Y(h`Z``6w;XZ*tL|Ld z`r-L7e=VxiZfbnGC=}XUm@f%*K1=21ZgMMQ`XaLXBj@PtK}7vT@JDQCC(yvEB+bYY z#M`I{u<-P?*G1kjZMn(I!OCY&TT)!hVr6SlPDB$n#r`FDUQ@Al5q*#FCb?go!KYqT zj>3Z3fb4$$HJJ4mW_=WKR&=PZEGhpj!7`Sd7Lpjd@6|d@L*sl%oaw z#zS*8uDwUblVl#kJZhz9!TkXsA2o+>hoeSG`A#XXgioUWKR-W`In2F?CSLL@_C_-j z2YJ6mo(atSs-{H96K-$*$X_2N=0I6hRva9M;F>zRyGWwU(Q?4RG2Vx!DTi>TFl>6v z=XsvERe~JYIDq-|-vvE!vr|Xgc^JPaKj!lJCwyW@@$r7qDkjfa4ZYX9l^>r>+J0Jc z20~?*DCSGj9&2ajtIpfkisDnE6}>EGxj@_-O61sVr2?tUS>pXOwf2Tcs)B{bb z#W!!Zj3?W6cC21ss@g)mM*l|mv2rEj)_P~DxeF+N@1!YtBBXF$QP_kv%&MYX(v|C7 zGO%&OSiehfU?I_3B3}UI`URmr2cDry?yl(gTJ?Qg%76$-i~MG?a*aT}l=Ry`1HtkI 
z=51WtRmo{J#yaS@zj(3(vFtvs(XlZJ&%hZIvMc}1fz>gpi5Sv-Tur@*A~-j<)@1i4 zd8y~)e<~Y$YP_3(u0?jS0&m0X)wCkQYk1jhQ~h=^MQ=}!14&M1b9U&IP@52^r?L9G_2dWffHf$E~%nLIB^3CouDdI zm$Z_L*+rvo@Q~{$f=1sm>Btx8)I=HM0E+C}zVxARD}`eR1k)Yy9*8aoDBqnm=iM-9 ze+D`0Dq5trM?K%y!lqEJ`d0uFa@`QiVWhY{;6l`SAB6GfMLC)QmLzq3rC=Tbgg!g& z?B=QbDKz=w^l79{;|7ix;0q{ej36W?odFQ4IlX`IArF-qJRFsE2r9!IhEYc_8GL}` zn(zdndt(w#sB#FDuW#|;BP`Z!eGjuS-@F{$gX*yKzqY0!XC+z*oAx5|yZ&lA#p()n zEJ|68P@|GN%mCN+{4*3nAm+UFKIm$!)F^6hM<9e$ zTI;7+43Uin2<39;2+6KIO%JAetSmtR+F~)7fU8D-1z->YM!J$(zfDS_?gRhvErpL> z3&k@NaYLUy>zUb(tq9r6y867!(_jn09ydu1f152(uKPt0jYkST&=ZFA;jsgVj%Jk&|FPem-%sdmW za%OZ>=18H|L?pjgIr^NO=7i+$RsC*&V zR%x9Tiz84;^SW%0o)180L9Po>9&-S^U7DeZcm;m}>al7VHCE?CRQ=FQaVWBg~X*)}RVw(wt=#a)L*!W1$A%Oq=BmzfR_VyCS4ypi(GC>e3s&`2+ z>yWn)G*6%V5S)%$WY5`a=@FQ>K62Z#X1yl+Gy ziDvtJPaMU%5OexOZxs#<$Eg=4KzkIpH zFYqg#S5wtiW|eimFLfWv4=F+4u^-s38Y^VN#EJHJWznaT8>xKg()0wA`XN^$AAJ~; z%A1pmspb1Eg0_qZilisA=$o!GN|}AXy@o+!_v4F^GCH$&rWv$}t3pXq@oZTeoeTB9 z=-Pi&eU6bXx?_u}mykra&t=F(sLt<%Gk^^_=*vn+wqcaGBV_kzRw)znK~Wc`L%reL zHTE`WN%-&ps?}Is^#(Bk#S~A<_g-l7&t4+Xk;O|%W!rM#JzK5xI`9V6{Fv>IK;!Nw zN}ug0>T|GI}G*1p|1 z=OA~^y=m=2-zj3FZRLHOI1@;sf3k|I{T_qFcz_p~LyN14h zBwQ4P<#Oj4+)6Z4Aw$Q{%**Dd1pR6+-^)0MSuUkrhs>lfctQ+HeM-6>8s+c7W1;TU zsx^}FmdsvVR#;~+Wz#KX;Bzb`!~cTGUM-wx7^t_aquQs{@T@&kIQxO)TYp{S{obnk z&?P8lRMCo0N5`*;KI2%+@ZyT^xtalyQ7p1l2t?jX>j7cvNjask+buS%jZd28s^HJg zYnbNd-$pZLzq;fJvv_Zo?%b3TQReIw&8EAEl9IDThZ_gGpOVMi`X9(AG~P0h)!Te$ z*`3tRu|7aLK{)Vr;B@e0I=;HnnpT*y74VZmXZzi%^uTZNOw89liH+LIrtp!TbBwA# z=-H~3Z<>kx9>WGC_O!fmkq#p~+gGe*R*Qm8s=KrT*t!Rs7g1ibsFaWN#Az2e+&Ip` zuX{9N7SZ9)J^rP(NwH#0KhN4|8UAnI7yfES zTXz*-V9UTZ8dEA|$A(};tuBs7LOL7LesYO2s%DRDi`6}NdpKM`RNR>Tpj@>`h*?V& z4C*s=`gtRYSkywhBhUcT5R?8vRWOlrES8Iq$c$ror`5r$-=W?@TUX?_a2wwilyLVQ zoFz;cr>9?WioIm090X{inVi8RmuXw}m=*;?5;sCA&*kIUE-KIej~;U$O5ye{63ApF zGL61acs(Z&ZlUlg@2He9?+^~*8SUWJmg0XQM~oVg3O&I?%s$AW%f#Bk7%L-u45+@( zPR?7g%1GOEN7_5=l^W$6qotk94s*Hnlg=~&?1tCI>JmdkuDzcEx? z;ns4rL|^|aUu(iy)AAWM9h||p2S~$9Rt!^Jhx~g`U2eO7R@-jk;#oxmjeA?8|`(|4UH^$b0M1eqMdoZY;7raid6N{IM}*qgB^ZWURpp~hb*>%Xo_ zV@`FU`NX@)fi^JedQJ z0Z<|k-pKU49o=aJqVfX8F?`iXPSHJ%R||GFF*WvY;9hmQ>sb8L64&L*9;!+SD-+C0 zRm=LzUR!C09Z~hj+8SXP4TXPKso}2XpUnGQ_!J?N3%^cv<9Cs4p`!TFif*I#06j5 zRFDVCDWo#U>w$-ogf>~y7p_<}M5O2mMhBX!5CSMA5syPFPtP6!dPo``Idva^_=9fq zzB_rQ6jmx9%{YXKnIa`^mpA+PE&TufQK7P6R{cF4?@J?5aH;h6_xB=7D)~SHQ`V!K z?-sYm;jZv^z=CnlW~(z#HQUqYK`VDlZMX!66m}u(*;(Z#SqxfqN-1i~F0Lj^BD0kW zj9V03+#X68B4EaYBps|4`g0`t(kEA7>VKsK0F3QvuT!*2j4uVH?_3)|E^B+&xmL;A z^Sn8`K0)yWKPH{v4J+i$%9i~Cu346Zuy6OPymRxq&dsJvKG1%vWgm%mAunZ@6ynP| z5WGZ+!m1*o16dOiG@PL9GU_%T?DapxP_#XhqRdjExRVta&3@FcvzK_$Z?&KXAh@X6 z6XjP93L1kFZWVN)l9C@U-RiA_($Nd|`G2CZNz2kRJ`S&n>JkfGsyCTI@047 zt02EKL*3_tQ3PY1vA!l=mtt)%6GFHawZ&+#th?|ePU_nF{Olr|4Pax@4vRNv-XH$} zWaX9U%u;JMZpRA*#8MC-KilQ@F{HO8KV%g7RKOSm z7MwIlV!Yza$Y0}~*sWYK5YDyB{hJFWfK?3iW~ohX3)Z_L@%%n}&M7cc(k_t<$?FS9 zS!)N~hRXlivrnpAJUjqUc*0k9y?cGAw(pnh9T}QRDSOT8PnXuoMLyMU;MDElUf9q*ZCTpT zauT*X;f1tuQ@3zg!-!SWp4-1xxY?xWkScV*(-$Fy{7oMo9roR2(*1NF(^D>uqz7?^ zBzhWT%U;sBH5Zvdp&hN8=NP}s_G0XOLA{|q=g~6P$X8A2aVr)seIZ&+9u!L0RyaTRp?1!S%DrKB=rs#0|G!MtE)_1#_? 
z85G_<^>u#<=l-3(N9r6ab+t?`F1&cCwS5?g^?-h`uOd44tv@u<1@#TE7<(s>^sBASE79v zDeDf1%aq7g+Q`)WDWd3bqrbu)EsNue&&7!@$TW}eh)7gsuP~$<9QvhWbqQ5n?eL;K zYRbl45tj|y-Y$M+a#FW>dt}n4>U%GezOfnh?5rD}^+N8qjl{A1M)^MauC1zQ?YA^0 zy4og2X`{#pSEu{izMb9fyEpKU+0um^g{^At3;(p;I`z!IPdR-)n+xmaB^QdvZ&WTU|r&PZ4n~ay)4KRI?{O6C`RvIs>mQVO&>Z;`4>0ZoY zCZAX?`~U05EAZn}b{EH=cl?+1XL!03yt&{0_2d655AA6DX}HdS!g}9=eyuU^e%aM8 zJvm&>w_ECF@3b6$?ej~TF5TYq&Y^}sb!;7{9{LdL<%LQg>JtC4Y}D1VG}-g6N`IoK zd)jJJg*eKnhx)Y?#=qIPef^7CM!hJue2(-#f4r2U5h`PNgk#DI8Ss@c-+_ax~fhKJxgo{&$t9cC_7pH^$US`R~T~@5cDQ eV3!#Wm;;tBmOrnYo5h6xST40OySl_N^nU;(ZqPgc literal 0 HcmV?d00001 diff --git a/format/Layout.md b/format/Layout.md index 815c47f2c93..5eaefeebf21 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -78,7 +78,14 @@ Base requirements ## Byte Order ([Endianness][3]) -The Arrow format is little endian. +The Arrow format is little endian by default. +The Schema metadata has an endianness field indicating endianness of RecordBatches. +Typically this is the endianness of the system where the RecordBatch was generated. +The main use case is exchanging RecordBatches between systems with the same Endianness. +At first we will return an error when trying to read a Schema with an endianness +that does not match the underlying system. The reference implementation is focused on +Little Endian and provides tests for it. Eventually we may provide automatic conversion +via byte swapping. ## Alignment and Padding diff --git a/format/Message.fbs b/format/Message.fbs index 6a351b9dbf0..3f688c156e3 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -87,10 +87,21 @@ table Field { children: [Field]; } +/// ---------------------------------------------------------------------- +/// Endianness of the platform that produces the RecordBatch + +enum Endianness:int { Little, Big } + /// ---------------------------------------------------------------------- /// A Schema describes the columns in a row batch table Schema { + + /// endianness of the buffer + /// it is Little Endian by default + /// if endianness doesn't match the underlying system then the vectors need to be converted + endianness: Endianness=Little; + fields: [Field]; } From 268e108c2d9101eccf2624fccf1fddf6f7f97b8b Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 15 Aug 2016 22:08:56 -0700 Subject: [PATCH 111/210] ARROW-251: Expose APIs for getting code and message of the status Author: Jihoon Son Closes #114 from jihoonson/ARROW-251 and squashes the following commits: d1186bf [Jihoon Son] Fix compilation failure 4275c70 [Jihoon Son] Add tests for status 1162084 [Jihoon Son] Merge branch 'master' of https://github.com/apache/arrow into ARROW-251 a76b888 [Jihoon Son] Make code() public and add message() --- cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/status-test.cc | 38 +++++++++++++++++++++++++++++++ cpp/src/arrow/util/status.h | 16 +++++++++---- 3 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 cpp/src/arrow/util/status-test.cc diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 4e941fb5f5c..13c0d7514fe 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -70,3 +70,4 @@ endif() ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(buffer-test) ADD_ARROW_TEST(memory-pool-test) +ADD_ARROW_TEST(status-test) \ No newline at end of file diff --git a/cpp/src/arrow/util/status-test.cc b/cpp/src/arrow/util/status-test.cc new file mode 100644 index 00000000000..45e0ff361ac --- /dev/null +++ 
From 268e108c2d9101eccf2624fccf1fddf6f7f97b8b Mon Sep 17 00:00:00 2001
From: Jihoon Son
Date: Mon, 15 Aug 2016 22:08:56 -0700
Subject: [PATCH 111/210] ARROW-251: Expose APIs for getting code and message
 of the status

Author: Jihoon Son

Closes #114 from jihoonson/ARROW-251 and squashes the following commits:

d1186bf [Jihoon Son] Fix compilation failure
4275c70 [Jihoon Son] Add tests for status
1162084 [Jihoon Son] Merge branch 'master' of https://github.com/apache/arrow into ARROW-251
a76b888 [Jihoon Son] Make code() public and add message()
---
 cpp/src/arrow/util/CMakeLists.txt |  1 +
 cpp/src/arrow/util/status-test.cc | 38 +++++++++++++++++++++++++++++++
 cpp/src/arrow/util/status.h       | 16 +++++++++----
 3 files changed, 51 insertions(+), 4 deletions(-)
 create mode 100644 cpp/src/arrow/util/status-test.cc

diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt
index 4e941fb5f5c..13c0d7514fe 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -70,3 +70,4 @@ endif()
 ADD_ARROW_TEST(bit-util-test)
 ADD_ARROW_TEST(buffer-test)
 ADD_ARROW_TEST(memory-pool-test)
+ADD_ARROW_TEST(status-test)
\ No newline at end of file

diff --git a/cpp/src/arrow/util/status-test.cc b/cpp/src/arrow/util/status-test.cc
new file mode 100644
index 00000000000..45e0ff361ac
--- /dev/null
+++ b/cpp/src/arrow/util/status-test.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gtest/gtest.h"
+
+#include "arrow/util/status.h"
+#include "arrow/test-util.h"
+
+namespace arrow {
+
+TEST(StatusTest, TestCodeAndMessage) {
+  Status ok = Status::OK();
+  ASSERT_EQ(StatusCode::OK, ok.code());
+  Status file_error = Status::IOError("file error");
+  ASSERT_EQ(StatusCode::IOError, file_error.code());
+  ASSERT_EQ("file error", file_error.message());
+}
+
+TEST(StatusTest, TestToString) {
+  Status file_error = Status::IOError("file error");
+  ASSERT_EQ("IOError: file error", file_error.ToString());
+}
+
+}  // namespace arrow

diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h
index 6ba2035bcd3..d5585313c72 100644
--- a/cpp/src/arrow/util/status.h
+++ b/cpp/src/arrow/util/status.h
@@ -138,6 +138,18 @@ class ARROW_EXPORT Status {
   // Get the POSIX code associated with this Status, or -1 if there is none.
   int16_t posix_code() const;

+  StatusCode code() const {
+    return ((state_ == NULL) ? StatusCode::OK : static_cast<StatusCode>(state_[4]));
+  }
+
+  std::string message() const {
+    uint32_t length;
+    memcpy(&length, state_, sizeof(length));
+    std::string msg;
+    msg.append((state_ + 7), length);
+    return msg;
+  }
+
  private:
   // OK status has a NULL state_. Otherwise, state_ is a new[] array
   // of the following form:
@@ -147,10 +159,6 @@ class ARROW_EXPORT Status {
   //    state_[7..]  == message
   const char* state_;

-  StatusCode code() const {
-    return ((state_ == NULL) ? StatusCode::OK : static_cast<StatusCode>(state_[4]));
-  }
-
   Status(StatusCode code, const std::string& msg, int16_t posix_code);
   static const char* CopyState(const char* s);
 };
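For context, a minimal caller-side sketch of the accessors this patch makes public. It only uses members exercised by the test above plus `ok()`, which is assumed to exist on `Status`; the `Report()` helper itself is hypothetical, not part of the patch:

```cpp
// Sketch of typical caller code for the newly exposed accessors (ARROW-251).
// Assumes arrow::Status::ok(); code(), message(), and ToString() are shown
// in the patch above. Report() is a hypothetical helper.
#include <iostream>
#include "arrow/util/status.h"

void Report(const arrow::Status& s) {
  if (s.ok()) { return; }
  // code() lets callers branch on the error class without parsing strings.
  if (s.code() == arrow::StatusCode::IOError) {
    std::cerr << "I/O failure: " << s.message() << std::endl;
  } else {
    // ToString() combines the code name and the message, e.g. "IOError: ...".
    std::cerr << s.ToString() << std::endl;
  }
}
```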
From 246a126b23dc20bca7b665ec76d75ca4a68cd1f1 Mon Sep 17 00:00:00 2001
From: Micah Kornfield
Date: Mon, 15 Aug 2016 23:04:46 -0700
Subject: [PATCH 112/210] ARROW-107: [C++] Implement IPC for structs

Some other changes (I tried to isolate each in their own commit):

1. Changed NumericTypes to be its own templated type instead of separate
   macros (this made debugging easier)
2. Fixed an existing IPC unit test whose row counts were inconsistent with the
   row batch size.
3. Some minor make-format changes.

Author: Micah Kornfield

Closes #117 from emkornfield/emk_struct_ipc and squashes the following commits:

777e338 [Micah Kornfield] fix formatting
9008046 [Micah Kornfield] use TypeClass::c_type
e46b0d8 [Micah Kornfield] add skip for memory pool test
fc63bff [Micah Kornfield] make lint and make format
9aa972b [Micah Kornfield] change macro to templates instead (makes debugging easier)
3e01e7f [Micah Kornfield] Implement struct round-trip. Fix unit test for non null to have consistent batch sizes
8eaf1e7 [Micah Kornfield] fix formatting
---
 cpp/src/.clang-tidy-ignore             |  1 +
 cpp/src/arrow/ipc/adapter.cc           | 24 ++++++-
 cpp/src/arrow/ipc/ipc-adapter-test.cc  | 46 ++++++++++--
 cpp/src/arrow/ipc/metadata-internal.cc | 10 +--
 cpp/src/arrow/types/primitive.h        | 97 +++++++++++-------------
 cpp/src/arrow/util/memory-pool-test.cc |  2 +-
 6 files changed, 115 insertions(+), 65 deletions(-)

diff --git a/cpp/src/.clang-tidy-ignore b/cpp/src/.clang-tidy-ignore
index a128c388896..5ab4d20d619 100644
--- a/cpp/src/.clang-tidy-ignore
+++ b/cpp/src/.clang-tidy-ignore
@@ -1 +1,2 @@
 ipc-adapter-test.cc
+memory-pool-test.cc
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 84f7830092c..3259980058b 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -34,6 +34,7 @@
 #include "arrow/types/list.h"
 #include "arrow/types/primitive.h"
 #include "arrow/types/string.h"
+#include "arrow/types/struct.h"
 #include "arrow/util/buffer.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/status.h"
@@ -118,8 +119,11 @@ Status VisitArray(const Array* arr, std::vector<flatbuf::FieldNode>* field_nodes
     RETURN_NOT_OK(VisitArray(
         list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1));
   } else if (arr->type_enum() == Type::STRUCT) {
-    // TODO(wesm)
-    return Status::NotImplemented("Struct type");
+    const auto struct_arr = static_cast<const StructArray*>(arr);
+    for (auto& field : struct_arr->fields()) {
+      RETURN_NOT_OK(
+          VisitArray(field.get(), field_nodes, buffers, max_recursion_depth - 1));
+    }
   } else {
     return Status::NotImplemented("Unrecognized type");
   }
@@ -313,6 +317,22 @@ class RowBatchReader::Impl {
       return MakeListArray(type, field_meta.length, offsets, values_array,
           field_meta.null_count, null_bitmap, out);
     }
+
+    if (type->type == Type::STRUCT) {
+      const int num_children = type->num_children();
+      std::vector<std::shared_ptr<Array>> fields;
+      fields.reserve(num_children);
+      for (int child_idx = 0; child_idx < num_children; ++child_idx) {
+        std::shared_ptr<Array> field_array;
+        RETURN_NOT_OK(NextArray(
+            type->child(child_idx).get(), max_recursion_depth - 1, &field_array));
+        fields.push_back(field_array);
+      }
+      out->reset(new StructArray(
+          type, field_meta.length, fields, field_meta.null_count, null_bitmap));
+      return Status::OK();
+    }
+
     return Status::NotImplemented("Non-primitive types not complete yet");
   }
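[Editor's note: the adapter.cc change above serializes a struct column by
visiting its children depth-first; the struct node itself contributes only its
validity bitmap. A self-contained sketch of that traversal shape, independent
of the Arrow classes (Node and Flatten are illustrative stand-ins):]

    #include <memory>
    #include <vector>

    struct Node {                                   // stand-in for an Array
      std::vector<bool> validity;                   // null bitmap
      std::vector<std::shared_ptr<Node>> children;  // non-empty for structs
    };

    // Depth-first flattening, mirroring the struct case in VisitArray above:
    // emit this node's validity buffer, then recurse into each child in order.
    void Flatten(const Node& node, std::vector<const std::vector<bool>*>* buffers) {
      buffers->push_back(&node.validity);
      for (const auto& child : node.children) {
        Flatten(*child, buffers);
      }
    }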
diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc
index 2bfb459d6e0..6740e0fc5ac 100644
--- a/cpp/src/arrow/ipc/ipc-adapter-test.cc
+++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc
@@ -32,6 +32,7 @@
 #include "arrow/types/list.h"
 #include "arrow/types/primitive.h"
 #include "arrow/types/string.h"
+#include "arrow/types/struct.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/buffer.h"
 #include "arrow/util/memory-pool.h"
@@ -205,15 +206,16 @@ Status MakeNonNullRowBatch(std::shared_ptr<RowBatch>* out) {
   // Example data
   MemoryPool* pool = default_memory_pool();
-  const int length = 200;
+  const int length = 50;
   std::shared_ptr<Array> leaf_values, list_array, list_list_array, flat_array;
   RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values));
   bool include_nulls = false;
-  RETURN_NOT_OK(MakeRandomListArray(leaf_values, 50, include_nulls, pool, &list_array));
   RETURN_NOT_OK(
-      MakeRandomListArray(list_array, 50, include_nulls, pool, &list_list_array));
-  RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array));
+      MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array));
+  RETURN_NOT_OK(
+      MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array));
+  RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array));
   out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array}));
   return Status::OK();
 }
@@ -238,10 +240,40 @@ Status MakeDeeplyNestedList(std::shared_ptr<RowBatch>* out) {
   return Status::OK();
 }

-INSTANTIATE_TEST_CASE_P(
-    RoundTripTests, TestWriteRowBatch,
+Status MakeStruct(std::shared_ptr<RowBatch>* out) {
+  // reuse constructed list columns
+  std::shared_ptr<RowBatch> list_batch;
+  RETURN_NOT_OK(MakeListRowBatch(&list_batch));
+  std::vector<ArrayPtr> columns = {
+      list_batch->column(0), list_batch->column(1), list_batch->column(2)};
+  auto list_schema = list_batch->schema();
+
+  // Define schema
+  std::shared_ptr<DataType> type(new StructType(
+      {list_schema->field(0), list_schema->field(1), list_schema->field(2)}));
+  auto f0 = std::make_shared<Field>("non_null_struct", type);
+  auto f1 = std::make_shared<Field>("null_struct", type);
+  std::shared_ptr<Schema> schema(new Schema({f0, f1}));
+
+  // construct individual nullable/non-nullable struct arrays
+  ArrayPtr no_nulls(new StructArray(type, list_batch->num_rows(), columns));
+  std::vector<uint8_t> null_bytes(list_batch->num_rows(), 1);
+  null_bytes[0] = 0;
+  std::shared_ptr<Buffer> null_bitmask;
+  RETURN_NOT_OK(util::bytes_to_bits(null_bytes, &null_bitmask));
+  ArrayPtr with_nulls(
+      new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask));
+
+  // construct batch
+  std::vector<ArrayPtr> arrays = {no_nulls, with_nulls};
+  out->reset(new RowBatch(schema, list_batch->num_rows(), arrays));
+  return Status::OK();
+}
+
+INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch,
     ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch,
-        &MakeZeroLengthRowBatch, &MakeDeeplyNestedList, &MakeStringTypesRowBatch));
+        &MakeZeroLengthRowBatch, &MakeDeeplyNestedList,
+        &MakeStringTypesRowBatch, &MakeStruct));

 void TestGetRowBatchSize(std::shared_ptr<RowBatch> batch) {
   MockMemorySource mock_source(1 << 16);
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 1d3edf0117f..8cd416ff585 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -265,11 +265,8 @@ Status MessageBuilder::SetSchema(const Schema* schema) {
     field_offsets.push_back(offset);
   }

-  header_ = flatbuf::CreateSchema(
-      fbb_,
-      endianness(),
-      fbb_.CreateVector(field_offsets))
-                .Union();
+  header_ =
+      flatbuf::CreateSchema(fbb_, endianness(), fbb_.CreateVector(field_offsets)).Union();
   body_length_ = 0;
   return Status::OK();
 }
@@ -278,8 +275,7 @@ Status MessageBuilder::SetRecordBatch(int32_t length, int64_t body_length,
     const std::vector<flatbuf::FieldNode>& nodes,
     const std::vector<flatbuf::Buffer>& buffers) {
   header_type_ = flatbuf::MessageHeader_RecordBatch;
-  header_ = flatbuf::CreateRecordBatch(fbb_, length,
-      fbb_.CreateVectorOfStructs(nodes),
+  header_ = flatbuf::CreateRecordBatch(fbb_, length, fbb_.CreateVectorOfStructs(nodes),
       fbb_.CreateVectorOfStructs(buffers))
                 .Union();
   body_length_ = body_length;
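[Editor's note: the primitive.h diff that follows replaces the
NUMERIC_ARRAY_DECL macro family with a single NumericArray<TypeClass> template.
A minimal standalone sketch of that macro-to-template pattern (the type names
here are illustrative, not the Arrow classes):]

    #include <cstdint>

    // Each type class carries its C value type, as TypeClass::c_type does in
    // the diff below.
    struct Int32Type { using c_type = int32_t; };
    struct DoubleType { using c_type = double; };

    // One template replaces N nearly identical macro-generated classes, and
    // the debugger can now step into it.
    template <typename TypeClass>
    class NumericArray {
     public:
      using value_type = typename TypeClass::c_type;
      explicit NumericArray(const value_type* data) : data_(data) {}
      value_type Value(int i) const { return data_[i]; }

     private:
      const value_type* data_;
    };

    using Int32Array = NumericArray<Int32Type>;  // what NUMERIC_ARRAY_DECL expands to
    using DoubleArray = NumericArray<DoubleType>;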
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index 18f954adc08..770de765f1f 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -53,54 +53,55 @@ class ARROW_EXPORT PrimitiveArray : public Array {
   const uint8_t* raw_data_;
 };

-#define NUMERIC_ARRAY_DECL(NAME, TypeClass, T)                                    \
-  class ARROW_EXPORT NAME : public PrimitiveArray {                               \
-   public:                                                                        \
-    using value_type = T;                                                         \
-                                                                                  \
-    NAME(int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count = 0, \
-        const std::shared_ptr<Buffer>& null_bitmap = nullptr)                     \
-        : PrimitiveArray(                                                         \
-              std::make_shared<TypeClass>(), length, data, null_count, null_bitmap) {} \
-    NAME(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data, \
-        int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr) \
-        : PrimitiveArray(type, length, data, null_count, null_bitmap) {}          \
-                                                                                  \
-    bool EqualsExact(const NAME& other) const {                                   \
-      return PrimitiveArray::EqualsExact(*static_cast<const PrimitiveArray*>(&other)); \
-    }                                                                             \
-                                                                                  \
-    bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, \
-        const ArrayPtr& arr) const override {                                     \
-      if (this == arr.get()) { return true; }                                     \
-      if (!arr) { return false; }                                                 \
-      if (this->type_enum() != arr->type_enum()) { return false; }                \
-      const auto other = static_cast<NAME*>(arr.get());                           \
-      for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) { \
-        const bool is_null = IsNull(i);                                           \
-        if (is_null != arr->IsNull(o_i) ||                                        \
-            (!is_null && Value(i) != other->Value(o_i))) {                        \
-          return false;                                                           \
-        }                                                                         \
-      }                                                                           \
-      return true;                                                                \
-    }                                                                             \
-                                                                                  \
-    const T* raw_data() const { return reinterpret_cast<const T*>(raw_data_); }   \
-                                                                                  \
-    T Value(int i) const { return raw_data()[i]; }                                \
-  };
-
-NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type, uint8_t);
-NUMERIC_ARRAY_DECL(Int8Array, Int8Type, int8_t);
-NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type, uint16_t);
-NUMERIC_ARRAY_DECL(Int16Array, Int16Type, int16_t);
-NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type, uint32_t);
-NUMERIC_ARRAY_DECL(Int32Array, Int32Type, int32_t);
-NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type, uint64_t);
-NUMERIC_ARRAY_DECL(Int64Array, Int64Type, int64_t);
-NUMERIC_ARRAY_DECL(FloatArray, FloatType, float);
-NUMERIC_ARRAY_DECL(DoubleArray, DoubleType, double);
+template <typename TypeClass>
+class ARROW_EXPORT NumericArray : public PrimitiveArray {
+ public:
+  using value_type = typename TypeClass::c_type;
+  NumericArray(int32_t length, const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
+      : PrimitiveArray(
+            std::make_shared<TypeClass>(), length, data, null_count, null_bitmap) {}
+  NumericArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
+      : PrimitiveArray(type, length, data, null_count, null_bitmap) {}
+
+  bool EqualsExact(const NumericArray& other) const {
+    return PrimitiveArray::EqualsExact(*static_cast<const PrimitiveArray*>(&other));
+  }
+
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const ArrayPtr& arr) const override {
+    if (this == arr.get()) { return true; }
+    if (!arr) { return false; }
+    if (this->type_enum() != arr->type_enum()) { return false; }
+    const auto other = static_cast<NumericArray<TypeClass>*>(arr.get());
+    for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+      const bool is_null = IsNull(i);
+      if (is_null != arr->IsNull(o_i) || (!is_null && Value(i) != other->Value(o_i))) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  const value_type* raw_data() const {
+    return reinterpret_cast<const value_type*>(raw_data_);
+  }
+
+  value_type Value(int i) const { return raw_data()[i]; }
+};
+
+#define NUMERIC_ARRAY_DECL(NAME, TypeClass) using NAME = NumericArray<TypeClass>;
+
+NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type);
+NUMERIC_ARRAY_DECL(Int8Array, Int8Type);
+NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type);
+NUMERIC_ARRAY_DECL(Int16Array, Int16Type);
+NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type);
+NUMERIC_ARRAY_DECL(Int32Array, Int32Type);
+NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type);
+NUMERIC_ARRAY_DECL(Int64Array, Int64Type);
+NUMERIC_ARRAY_DECL(FloatArray,
FloatType); +NUMERIC_ARRAY_DECL(DoubleArray, DoubleType); template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index 919f3740982..deb7ffd03ba 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -54,7 +54,7 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { #ifndef NDEBUG EXPECT_EXIT(pool->Free(data, 120), ::testing::ExitedWithCode(1), - ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); + ".*Check failed: \\(bytes_allocated_\\) >= \\(size\\)"); #endif pool->Free(data, 100); From e7e399db5fc6913e67426514279f81766a0778d2 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Tue, 24 May 2016 13:38:09 -0700 Subject: [PATCH 113/210] ARROW-259: Use Flatbuffer Field type instead of MaterializedField Remove MaterializedField, MajorType, RepeatedTypes Add code to convert from FlatBuf representation to Pojo also adds tests to test the conversion --- format/Message.fbs | 22 +- header | 16 + java/format/pom.xml | 163 ++++ java/memory/pom.xml | 2 +- java/pom.xml | 3 +- java/vector/pom.xml | 7 +- java/vector/src/main/codegen/config.fmpp | 1 + .../src/main/codegen/data/ArrowTypes.tdd | 80 ++ .../main/codegen/data/ValueVectorTypes.tdd | 59 +- .../src/main/codegen/includes/vv_imports.ftl | 4 + .../templates/AbstractFieldReader.java | 8 +- .../templates/AbstractFieldWriter.java | 10 +- .../AbstractPromotableFieldWriter.java | 4 - .../src/main/codegen/templates/ArrowType.java | 129 +++ .../main/codegen/templates/BaseReader.java | 5 +- .../main/codegen/templates/BaseWriter.java | 3 +- .../codegen/templates/BasicTypeHelper.java | 539 ------------ .../main/codegen/templates/ComplexCopier.java | 18 +- .../codegen/templates/ComplexReaders.java | 72 +- .../codegen/templates/ComplexWriters.java | 30 +- .../codegen/templates/FixedValueVectors.java | 94 +- .../codegen/templates/HolderReaderImpl.java | 98 +-- .../main/codegen/templates/ListWriters.java | 234 ----- .../main/codegen/templates/MapWriters.java | 42 +- .../main/codegen/templates/NullReader.java | 23 +- .../templates/NullableValueVectors.java | 104 ++- .../templates/RepeatedValueVectors.java | 421 --------- .../codegen/templates/UnionListWriter.java | 23 +- .../main/codegen/templates/UnionReader.java | 28 +- .../main/codegen/templates/UnionVector.java | 105 ++- .../main/codegen/templates/UnionWriter.java | 16 +- .../main/codegen/templates/ValueHolders.java | 43 +- .../templates/VariableLengthVectors.java | 73 +- .../arrow/vector/BaseDataValueVector.java | 5 +- .../apache/arrow/vector/BaseValueVector.java | 31 +- .../org/apache/arrow/vector/BitVector.java | 43 +- .../org/apache/arrow/vector/ObjectVector.java | 220 ----- .../arrow/vector/ValueHolderHelper.java | 203 ----- .../org/apache/arrow/vector/ValueVector.java | 10 +- .../apache/arrow/vector/VectorDescriptor.java | 83 -- .../org/apache/arrow/vector/ZeroVector.java | 30 +- .../complex/AbstractContainerVector.java | 49 +- .../vector/complex/AbstractMapVector.java | 47 +- .../complex/BaseRepeatedValueVector.java | 63 +- .../vector/complex/ContainerVectorLike.java | 43 - .../arrow/vector/complex/ListVector.java | 89 +- .../arrow/vector/complex/MapVector.java | 97 +-- .../vector/complex/RepeatedListVector.java | 427 ---------- .../vector/complex/RepeatedMapVector.java | 584 ------------- .../vector/complex/RepeatedValueVector.java | 2 +- .../complex/impl/AbstractBaseReader.java | 19 +- .../complex/impl/AbstractBaseWriter.java | 16 +- 
.../complex/impl/ComplexWriterImpl.java | 22 +- .../vector/complex/impl/PromotableWriter.java | 48 +- .../complex/impl/RepeatedListReaderImpl.java | 145 ---- .../complex/impl/RepeatedMapReaderImpl.java | 192 ----- .../impl/SingleLikeRepeatedMapReaderImpl.java | 89 -- .../complex/impl/SingleListReaderImpl.java | 14 +- .../complex/impl/SingleMapReaderImpl.java | 10 +- .../vector/complex/impl/UnionListReader.java | 19 +- .../arrow/vector/holders/ObjectHolder.java | 38 - .../arrow/vector/holders/UnionHolder.java | 7 +- .../arrow/vector/types/MaterializedField.java | 217 ----- .../org/apache/arrow/vector/types/Types.java | 596 ++++++++++--- .../apache/arrow/vector/types/pojo/Field.java | 105 +++ .../arrow/vector/types/pojo/Schema.java | 74 ++ .../vector/util/ByteFunctionHelpers.java | 50 -- .../arrow/vector/util/CoreDecimalUtility.java | 91 -- .../arrow/vector/util/DecimalUtility.java | 802 +++++++++--------- .../arrow/vector/util/MapWithOrdinal.java | 12 + .../arrow/vector/TestDecimalVector.java | 63 ++ ...TestOversizedAllocationForValueVector.java | 11 +- .../apache/arrow/vector/TestUnionVector.java | 5 +- .../apache/arrow/vector/TestValueVector.java | 137 +-- .../complex/impl/TestPromotableWriter.java | 7 +- .../complex/writer/TestComplexWriter.java | 270 ++++++ .../apache/arrow/vector/pojo/TestConvert.java | 80 ++ 77 files changed, 2464 insertions(+), 5180 deletions(-) create mode 100644 header create mode 100644 java/format/pom.xml create mode 100644 java/vector/src/main/codegen/data/ArrowTypes.tdd create mode 100644 java/vector/src/main/codegen/templates/ArrowType.java delete mode 100644 java/vector/src/main/codegen/templates/BasicTypeHelper.java delete mode 100644 java/vector/src/main/codegen/templates/ListWriters.java delete mode 100644 java/vector/src/main/codegen/templates/RepeatedValueVectors.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java delete mode 100644 java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java diff --git a/format/Message.fbs 
b/format/Message.fbs index 3f688c156e3..2928207db8c 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -1,10 +1,13 @@ -namespace apache.arrow.flatbuf; +namespace org.apache.arrow.flatbuf; /// ---------------------------------------------------------------------- /// Logical types and their metadata (if any) /// /// These are stored in the flatbuffer in the Type union below +table Null { +} + /// A Tuple in the flatbuffer metadata is the same as an Arrow Struct /// (according to the physical memory layout). We used Tuple here as Struct is /// a reserved word in Flatbuffers @@ -45,10 +48,22 @@ table Decimal { scale: int; } +table Date { +} + +table Time { +} + table Timestamp { timezone: string; } +table IntervalDay { +} + +table IntervalYear { +} + table JSONScalar { dense:bool=true; } @@ -58,13 +73,18 @@ table JSONScalar { /// add new logical types to Type without breaking backwards compatibility union Type { + Null, Int, FloatingPoint, Binary, Utf8, Bool, Decimal, + Date, + Time, Timestamp, + IntervalDay, + IntervalYear, List, Tuple, Union, diff --git a/header b/header new file mode 100644 index 00000000000..70665d1a262 --- /dev/null +++ b/header @@ -0,0 +1,16 @@ +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + diff --git a/java/format/pom.xml b/java/format/pom.xml new file mode 100644 index 00000000000..ea27a3072bc --- /dev/null +++ b/java/format/pom.xml @@ -0,0 +1,163 @@ + + + +4.0.0 + + + arrow-java-root + org.apache.arrow + 0.1-decimal + + +arrow-format +jar +Arrow Format + + + 1.2.0-3f79e055 + 3.3 + 2.10 + 1.5.0.Final + + + + + com.vlkan + flatbuffers + ${fbs.version} + + + + + + + + kr.motd.maven + os-maven-plugin + ${os-maven-plugin.version} + + + + + + org.apache.maven.plugins + maven-dependency-plugin + ${maven-dependency-plugin.version} + + + copy-flatc + generate-sources + + copy + + + + + com.vlkan + flatc-${os.detected.classifier} + ${fbs.version} + exe + true + ${project.build.directory} + + + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.4.0 + + + script-chmod + + exec + + generate-sources + + chmod + + +x + ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe + + + + + + exec + + generate-sources + + ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe + + -j + -o + target/generated-sources/ + ../../format/Message.fbs + + + + + + + com.mycila + license-maven-plugin + 2.3 + +
+          <header>${basedir}/../../header</header>
+          <includes>
+            <include>**/*.java</include>
+          </includes>
+        </configuration>
+        <executions>
+          <execution>
+            <phase>process-sources</phase>
+            <goals>
+              <goal>format</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <version>1.9.1</version>
+        <executions>
+          <execution>
+            <id>add-sources-as-resources</id>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>add-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>${project.build.directory}/generated-sources</source>
+              </sources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-checkstyle-plugin</artifactId>
+        <configuration>
+          <skip>true</skip>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+ diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 44332f5ed14..12ff4c81d86 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -15,7 +15,7 @@ org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1-decimal arrow-memory arrow-memory diff --git a/java/pom.xml b/java/pom.xml index 71f59caf279..92ab109f939 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1-decimal pom Apache Arrow Java Root POM @@ -465,6 +465,7 @@ + format memory vector diff --git a/java/vector/pom.xml b/java/vector/pom.xml index df5389261ba..fac788cef14 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -15,13 +15,18 @@ org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1-decimal vector vectors + + org.apache.arrow + arrow-format + 0.1-decimal + org.apache.arrow arrow-memory diff --git a/java/vector/src/main/codegen/config.fmpp b/java/vector/src/main/codegen/config.fmpp index 663677cbb5a..6d92ba830ee 100644 --- a/java/vector/src/main/codegen/config.fmpp +++ b/java/vector/src/main/codegen/config.fmpp @@ -17,6 +17,7 @@ data: { # TODO: Rename to ~valueVectorModesAndTypes for clarity. vv: tdd(../data/ValueVectorTypes.tdd), + arrowTypes: tdd(../data/ArrowTypes.tdd) } freemarkerLinks: { diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd new file mode 100644 index 00000000000..4ab7f8562f9 --- /dev/null +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -0,0 +1,80 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http:# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +{ + types: [ + { + name: "Null", + fields: [] + }, + { + name: "Tuple", + fields: [] + }, + { + name: "List", + fields: [] + }, + { + name: "Union", + fields: [] + }, + { + name: "Int", + fields: [{name: "bitWidth", type: int}, {name: "isSigned", type: boolean}] + }, + { + name: "FloatingPoint", + fields: [{name: precision, type: int}] + }, + { + name: "Utf8", + fields: [] + }, + { + name: "Binary", + fields: [] + }, + { + name: "Bool", + fields: [] + }, + { + name: "Decimal", + fields: [{name: "precision", type: int}, {name: "scale", type: int}] + }, + { + name: "Date", + fields: [] + }, + { + name: "Time", + fields: [] + }, + { + name: "Timestamp", + fields: [{name: "timezone", type: "String"}] + }, + { + name: "IntervalDay", + fields: [] + }, + { + name: "IntervalYear", + fields: [] + } + ] +} diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index e747c30c5d1..421dd7ef92e 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -17,8 +17,7 @@ { modes: [ {name: "Optional", prefix: "Nullable"}, - {name: "Required", prefix: ""}, - {name: "Repeated", prefix: "Repeated"} + {name: "Required", prefix: ""} ], types: [ { @@ -61,9 +60,8 @@ { class: "Int", valueHolder: "IntHolder"}, { class: "UInt4", valueHolder: "UInt4Holder" }, { class: "Float4", javaType: "float" , boxedType: "Float", fields: [{name: "value", type: "float"}]}, - { class: "Time", javaType: "int", friendlyType: "DateTime" }, { class: "IntervalYear", javaType: "int", friendlyType: "Period" } - { class: "Decimal9", maxPrecisionDigits: 9, friendlyType: "BigDecimal", fields: [{name:"value", type:"int"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] }, + { class: "Time", javaType: "int", friendlyType: "DateTime" } ] }, { @@ -78,15 +76,11 @@ { class: "Float8", javaType: "double" , boxedType: "Double", fields: [{name: "value", type: "double"}], }, { class: "Date", javaType: "long", friendlyType: "DateTime" }, { class: "TimeStamp", javaType: "long", friendlyType: "DateTime" } - { class: "Decimal18", maxPrecisionDigits: 18, friendlyType: "BigDecimal", fields: [{name:"value", type:"long"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] }, - <#-- - { class: "Money", maxPrecisionDigits: 2, scale: 1, }, - --> ] }, { major: "Fixed", - width: 12, + width: 8, javaType: "ArrowBuf", boxedType: "ArrowBuf", minor: [ @@ -96,51 +90,11 @@ { major: "Fixed", width: 16, - javaType: "ArrowBuf" - boxedType: "ArrowBuf", - minor: [ - { class: "Interval", daysOffset: 4, millisecondsOffset: 8, friendlyType: "Period", fields: [ {name: "months", type: "int"}, {name: "days", type:"int"}, {name: "milliseconds", type:"int"}] } - ] - }, - { - major: "Fixed", - width: 12, - javaType: "ArrowBuf", - boxedType: "ArrowBuf", - minor: [ - <#-- - { class: "TimeTZ" }, - { class: "Interval" } - --> - { class: "Decimal28Dense", maxPrecisionDigits: 28, nDecimalDigits: 3, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } - ] - }, - { - major: "Fixed", - width: 16, - javaType: "ArrowBuf", - boxedType: "ArrowBuf", - - minor: [ - { class: "Decimal38Dense", maxPrecisionDigits: 38, nDecimalDigits: 4, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: 
"ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } - ] - }, - { - major: "Fixed", - width: 24, - javaType: "ArrowBuf", - boxedType: "ArrowBuf", - minor: [ - { class: "Decimal38Sparse", maxPrecisionDigits: 38, nDecimalDigits: 6, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } - ] - }, - { - major: "Fixed", - width: 20, javaType: "ArrowBuf", boxedType: "ArrowBuf", + minor: [ - { class: "Decimal28Sparse", maxPrecisionDigits: 28, nDecimalDigits: 5, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } + { class: "Decimal", maxPrecisionDigits: 38, nDecimalDigits: 4, friendlyType: "BigDecimal", fields: [{name: "start", type: "int"}, {name: "buffer", type: "ArrowBuf"}, {name: "scale", type: "int", include: false}, {name: "precision", type: "int", include: false}] } ] }, { @@ -151,8 +105,7 @@ fields: [{name: "start", type: "int"}, {name: "end", type: "int"}, {name: "buffer", type: "ArrowBuf"}], minor: [ { class: "VarBinary" , friendlyType: "byte[]" }, - { class: "VarChar" , friendlyType: "Text" }, - { class: "Var16Char" , friendlyType: "String" } + { class: "VarChar" , friendlyType: "Text" } ] }, { diff --git a/java/vector/src/main/codegen/includes/vv_imports.ftl b/java/vector/src/main/codegen/includes/vv_imports.ftl index 2d808b1b3cb..9b4b79bfd7b 100644 --- a/java/vector/src/main/codegen/includes/vv_imports.ftl +++ b/java/vector/src/main/codegen/includes/vv_imports.ftl @@ -17,6 +17,8 @@ import com.google.common.collect.ObjectArrays; import com.google.common.base.Charsets; import com.google.common.collect.ObjectArrays; +import com.google.flatbuffers.FlatBufferBuilder; + import com.google.common.base.Preconditions; import io.netty.buffer.*; @@ -25,6 +27,8 @@ import org.apache.commons.lang3.ArrayUtils; import org.apache.arrow.memory.*; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.*; +import org.apache.arrow.vector.types.pojo.*; +import org.apache.arrow.vector.types.pojo.ArrowType.*; import org.apache.arrow.vector.types.*; import org.apache.arrow.vector.*; import org.apache.arrow.vector.holders.*; diff --git a/java/vector/src/main/codegen/templates/AbstractFieldReader.java b/java/vector/src/main/codegen/templates/AbstractFieldReader.java index b83dba28791..e0d0fc9715b 100644 --- a/java/vector/src/main/codegen/templates/AbstractFieldReader.java +++ b/java/vector/src/main/codegen/templates/AbstractFieldReader.java @@ -41,7 +41,13 @@ public boolean isSet() { return true; } - <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", + @Override + public Field getField() { + fail("getField"); + return null; + } + + <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", "Character", "DateTime", "Period", "Double", "Float", "Text", "String", "Byte", "Short", "byte[]"] as friendlyType> <#assign safeType=friendlyType /> diff --git a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java index 6ee9dad44e9..de076fc46ff 100644 --- a/java/vector/src/main/codegen/templates/AbstractFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractFieldWriter.java @@ -31,10 +31,6 @@ */ @SuppressWarnings("unused") abstract class 
AbstractFieldWriter extends AbstractBaseWriter implements FieldWriter { - AbstractFieldWriter(FieldWriter parent) { - super(parent); - } - @Override public void start() { throw new IllegalStateException(String.format("You tried to start when you are using a ValueWriter of type %s.", this.getClass().getSimpleName())); @@ -62,9 +58,15 @@ public void write(${name}Holder holder) { fail("${name}"); } + <#if minor.class == "Decimal"> + public void writeDecimal(int start, ArrowBuf buffer) { + fail("${name}"); + } + <#else> public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { fail("${name}"); } + diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java index 549dbf107ea..7e60320cfb8 100644 --- a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -37,10 +37,6 @@ */ @SuppressWarnings("unused") abstract class AbstractPromotableFieldWriter extends AbstractFieldWriter { - AbstractPromotableFieldWriter(FieldWriter parent) { - super(parent); - } - /** * Retrieve the FieldWriter, promoting if it is not a FieldWriter of the specified type * @param type diff --git a/java/vector/src/main/codegen/templates/ArrowType.java b/java/vector/src/main/codegen/templates/ArrowType.java new file mode 100644 index 00000000000..6dfaf216ad0 --- /dev/null +++ b/java/vector/src/main/codegen/templates/ArrowType.java @@ -0,0 +1,129 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.arrow.flatbuf.Field; +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; + +import java.util.Objects; + +<@pp.dropOutputFile /> +<@pp.changeOutputFile name="/org/apache/arrow/vector/types/pojo/ArrowType.java" /> + + +<#include "/@includes/license.ftl" /> +package org.apache.arrow.vector.types.pojo; + +import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.flatbuf.Type; + +import java.util.Objects; + +public abstract class ArrowType { + + public abstract byte getTypeType(); + public abstract int getType(FlatBufferBuilder builder); + + + <#list arrowTypes.types as type> + <#assign name = type.name> + <#assign fields = type.fields> + public static class ${name} extends ArrowType { + public static final byte TYPE_TYPE = Type.${name}; + <#if type.fields?size == 0> + public static final ${name} INSTANCE = new ${name}(); + + + <#list fields as field> + <#assign fieldName = field.name> + <#assign fieldType = field.type> + ${fieldType} ${fieldName}; + + + <#if type.fields?size != 0> + public ${type.name}(<#list type.fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + <#list type.fields as field> + this.${field.name} = ${field.name}; + + } + + + @Override + public byte getTypeType() { + return TYPE_TYPE; + } + + @Override + public int getType(FlatBufferBuilder builder) { + org.apache.arrow.flatbuf.${type.name}.start${type.name}(builder); + <#list type.fields as field> + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, <#if field.type == "String">builder.createString(${field.name})<#else>${field.name}); + + return org.apache.arrow.flatbuf.${type.name}.end${type.name}(builder); + } + + <#list fields as field> + public ${field.type} get${field.name?cap_first}() { + return ${field.name}; + } + + + @Override + public int hashCode() { + return Objects.hash(<#list type.fields as field>${field.name}<#if field_has_next>, ); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof ${type.name})) { + return false; + } + <#if type.fields?size == 0> + return true; + <#else> + ${type.name} that = (${type.name}) obj; + return + <#list type.fields as field>Objects.equals(this.${field.name}, that.${field.name}) <#if field_has_next>&&<#else>; + + + } + } + + + public static org.apache.arrow.vector.types.pojo.ArrowType getTypeForField(org.apache.arrow.flatbuf.Field field) { + switch(field.typeType()) { + <#list arrowTypes.types as type> + <#assign name = type.name> + <#assign nameLower = type.name?lower_case> + <#assign fields = type.fields> + case Type.${type.name}: + org.apache.arrow.flatbuf.${type.name} ${nameLower}Type = (org.apache.arrow.flatbuf.${type.name}) field.type(new org.apache.arrow.flatbuf.${type.name}()); + return new ${type.name}(<#list type.fields as field>${nameLower}Type.${field.name}()<#if field_has_next>, ); + + default: + throw new UnsupportedOperationException("Unsupported type: " + field.typeType()); + } + } + + public static Int getInt(org.apache.arrow.flatbuf.Field field) { + org.apache.arrow.flatbuf.Int intType = (org.apache.arrow.flatbuf.Int) field.type(new org.apache.arrow.flatbuf.Int()); + return new Int(intType.bitWidth(), intType.isSigned()); + } +} + + diff --git a/java/vector/src/main/codegen/templates/BaseReader.java b/java/vector/src/main/codegen/templates/BaseReader.java index 8f12b1da804..72fea58d0bc 100644 --- a/java/vector/src/main/codegen/templates/BaseReader.java +++ b/java/vector/src/main/codegen/templates/BaseReader.java 
@@ -30,8 +30,8 @@ @SuppressWarnings("unused") public interface BaseReader extends Positionable{ - MajorType getType(); - MaterializedField getField(); + Field getField(); + MinorType getMinorType(); void reset(); void read(UnionHolder holder); void read(int index, UnionHolder holder); @@ -60,7 +60,6 @@ public interface RepeatedListReader extends ListReader{ public interface ScalarReader extends <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> ${name}Reader, - <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> Repeated${name}Reader, BaseReader {} interface ComplexReader{ diff --git a/java/vector/src/main/codegen/templates/BaseWriter.java b/java/vector/src/main/codegen/templates/BaseWriter.java index 299b2389bb3..08bd39eae23 100644 --- a/java/vector/src/main/codegen/templates/BaseWriter.java +++ b/java/vector/src/main/codegen/templates/BaseWriter.java @@ -31,12 +31,11 @@ */ @SuppressWarnings("unused") public interface BaseWriter extends AutoCloseable, Positionable { - FieldWriter getParent(); int getValueCapacity(); public interface MapWriter extends BaseWriter { - MaterializedField getField(); + Field getField(); /** * Whether this writer is a map writer and is empty (has no children). diff --git a/java/vector/src/main/codegen/templates/BasicTypeHelper.java b/java/vector/src/main/codegen/templates/BasicTypeHelper.java deleted file mode 100644 index 0bae715e352..00000000000 --- a/java/vector/src/main/codegen/templates/BasicTypeHelper.java +++ /dev/null @@ -1,539 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -<@pp.dropOutputFile /> -<@pp.changeOutputFile name="/org/apache/arrow/vector/util/BasicTypeHelper.java" /> - -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector.util; - -<#include "/@includes/vv_imports.ftl" /> -import org.apache.arrow.vector.complex.UnionVector; -import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.util.CallBack; - -public class BasicTypeHelper { - static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BasicTypeHelper.class); - - private static final int WIDTH_ESTIMATE = 50; - - // Default length when casting to varchar : 65536 = 2^16 - // This only defines an absolute maximum for values, setting - // a high value like this will not inflate the size for small values - public static final int VARCHAR_DEFAULT_CAST_LEN = 65536; - - protected static String buildErrorMessage(final String operation, final MinorType type, final DataMode mode) { - return String.format("Unable to %s for minor type [%s] and mode [%s]", operation, type, mode); - } - - protected static String buildErrorMessage(final String operation, final MajorType type) { - return buildErrorMessage(operation, type.getMinorType(), type.getMode()); - } - - public static int getSize(MajorType major) { - switch (major.getMinorType()) { -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - return ${type.width}<#if minor.class?substring(0, 3) == "Var" || - minor.class?substring(0, 3) == "PRO" || - minor.class?substring(0, 3) == "MSG"> + WIDTH_ESTIMATE; - - -// case FIXEDCHAR: return major.getWidth(); -// case FIXED16CHAR: return major.getWidth(); -// case FIXEDBINARY: return major.getWidth(); - } - throw new UnsupportedOperationException(buildErrorMessage("get size", major)); - } - - public static ValueVector getNewVector(String name, BufferAllocator allocator, MajorType type, CallBack callback){ - MaterializedField field = MaterializedField.create(name, type); - return getNewVector(field, allocator, callback); - } - - - public static Class getValueVectorClass(MinorType type, DataMode mode){ - switch (type) { - case UNION: - return UnionVector.class; - case MAP: - switch (mode) { - case OPTIONAL: - case REQUIRED: - return MapVector.class; - case REPEATED: - return RepeatedMapVector.class; - } - - case LIST: - switch (mode) { - case REPEATED: - return RepeatedListVector.class; - case REQUIRED: - case OPTIONAL: - return ListVector.class; - } - -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - switch (mode) { - case REQUIRED: - return ${minor.class}Vector.class; - case OPTIONAL: - return Nullable${minor.class}Vector.class; - case REPEATED: - return Repeated${minor.class}Vector.class; - } - - - case GENERIC_OBJECT : - return ObjectVector.class ; - default: - break; - } - throw new UnsupportedOperationException(buildErrorMessage("get value vector class", type, mode)); - } - public static Class getReaderClassName( MinorType type, DataMode mode, boolean isSingularRepeated){ - switch (type) { - case MAP: - switch (mode) { - case REQUIRED: - if (!isSingularRepeated) - return SingleMapReaderImpl.class; - else - return SingleLikeRepeatedMapReaderImpl.class; - case REPEATED: - return RepeatedMapReaderImpl.class; - } - case LIST: - switch (mode) { - case REQUIRED: - return SingleListReaderImpl.class; - case REPEATED: - return RepeatedListReaderImpl.class; - } - -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - switch (mode) { - 
case REQUIRED: - return ${minor.class}ReaderImpl.class; - case OPTIONAL: - return Nullable${minor.class}ReaderImpl.class; - case REPEATED: - return Repeated${minor.class}ReaderImpl.class; - } - - - default: - break; - } - throw new UnsupportedOperationException(buildErrorMessage("get reader class name", type, mode)); - } - - public static Class getWriterInterface( MinorType type, DataMode mode){ - switch (type) { - case UNION: return UnionWriter.class; - case MAP: return MapWriter.class; - case LIST: return ListWriter.class; -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: return ${minor.class}Writer.class; - - - default: - break; - } - throw new UnsupportedOperationException(buildErrorMessage("get writer interface", type, mode)); - } - - public static Class getWriterImpl( MinorType type, DataMode mode){ - switch (type) { - case UNION: - return UnionWriter.class; - case MAP: - switch (mode) { - case REQUIRED: - case OPTIONAL: - return SingleMapWriter.class; - case REPEATED: - return RepeatedMapWriter.class; - } - case LIST: - switch (mode) { - case REQUIRED: - case OPTIONAL: - return UnionListWriter.class; - case REPEATED: - return RepeatedListWriter.class; - } - -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - switch (mode) { - case REQUIRED: - return ${minor.class}WriterImpl.class; - case OPTIONAL: - return Nullable${minor.class}WriterImpl.class; - case REPEATED: - return Repeated${minor.class}WriterImpl.class; - } - - - default: - break; - } - throw new UnsupportedOperationException(buildErrorMessage("get writer implementation", type, mode)); - } - - public static Class getHolderReaderImpl( MinorType type, DataMode mode){ - switch (type) { -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - switch (mode) { - case REQUIRED: - return ${minor.class}HolderReaderImpl.class; - case OPTIONAL: - return Nullable${minor.class}HolderReaderImpl.class; - case REPEATED: - return Repeated${minor.class}HolderReaderImpl.class; - } - - - default: - break; - } - throw new UnsupportedOperationException(buildErrorMessage("get holder reader implementation", type, mode)); - } - - public static ValueVector getNewVector(MaterializedField field, BufferAllocator allocator){ - return getNewVector(field, allocator, null); - } - public static ValueVector getNewVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ - field = field.clone(); - MajorType type = field.getType(); - - switch (type.getMinorType()) { - - case UNION: - return new UnionVector(field, allocator, callBack); - - case MAP: - switch (type.getMode()) { - case REQUIRED: - case OPTIONAL: - return new MapVector(field, allocator, callBack); - case REPEATED: - return new RepeatedMapVector(field, allocator, callBack); - } - case LIST: - switch (type.getMode()) { - case REPEATED: - return new RepeatedListVector(field, allocator, callBack); - case OPTIONAL: - case REQUIRED: - return new ListVector(field, allocator, callBack); - } -<#list vv. types as type> - <#list type.minor as minor> - case ${minor.class?upper_case}: - switch (type.getMode()) { - case REQUIRED: - return new ${minor.class}Vector(field, allocator); - case OPTIONAL: - return new Nullable${minor.class}Vector(field, allocator); - case REPEATED: - return new Repeated${minor.class}Vector(field, allocator); - } - - - case GENERIC_OBJECT: - return new ObjectVector(field, allocator) ; - default: - break; - } - // All ValueVector types have been handled. 
- throw new UnsupportedOperationException(buildErrorMessage("get new vector", type)); - } - - public static ValueHolder getValue(ValueVector vector, int index) { - MajorType type = vector.getField().getType(); - ValueHolder holder; - switch(type.getMinorType()) { -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - <#if minor.class?starts_with("Var") || minor.class == "IntervalDay" || minor.class == "Interval" || - minor.class?starts_with("Decimal28") || minor.class?starts_with("Decimal38")> - switch (type.getMode()) { - case REQUIRED: - holder = new ${minor.class}Holder(); - ((${minor.class}Vector) vector).getAccessor().get(index, (${minor.class}Holder)holder); - return holder; - case OPTIONAL: - holder = new Nullable${minor.class}Holder(); - ((Nullable${minor.class}Holder)holder).isSet = ((Nullable${minor.class}Vector) vector).getAccessor().isSet(index); - if (((Nullable${minor.class}Holder)holder).isSet == 1) { - ((Nullable${minor.class}Vector) vector).getAccessor().get(index, (Nullable${minor.class}Holder)holder); - } - return holder; - } - <#else> - switch (type.getMode()) { - case REQUIRED: - holder = new ${minor.class}Holder(); - ((${minor.class}Holder)holder).value = ((${minor.class}Vector) vector).getAccessor().get(index); - return holder; - case OPTIONAL: - holder = new Nullable${minor.class}Holder(); - ((Nullable${minor.class}Holder)holder).isSet = ((Nullable${minor.class}Vector) vector).getAccessor().isSet(index); - if (((Nullable${minor.class}Holder)holder).isSet == 1) { - ((Nullable${minor.class}Holder)holder).value = ((Nullable${minor.class}Vector) vector).getAccessor().get(index); - } - return holder; - } - - - - case GENERIC_OBJECT: - holder = new ObjectHolder(); - ((ObjectHolder)holder).obj = ((ObjectVector) vector).getAccessor().getObject(index) ; - break; - } - - throw new UnsupportedOperationException(buildErrorMessage("get value", type)); - } - - public static void setValue(ValueVector vector, int index, ValueHolder holder) { - MajorType type = vector.getField().getType(); - - switch(type.getMinorType()) { -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - switch (type.getMode()) { - case REQUIRED: - ((${minor.class}Vector) vector).getMutator().setSafe(index, (${minor.class}Holder) holder); - return; - case OPTIONAL: - if (((Nullable${minor.class}Holder) holder).isSet == 1) { - ((Nullable${minor.class}Vector) vector).getMutator().setSafe(index, (Nullable${minor.class}Holder) holder); - } - return; - } - - - case GENERIC_OBJECT: - ((ObjectVector) vector).getMutator().setSafe(index, (ObjectHolder) holder); - return; - default: - throw new UnsupportedOperationException(buildErrorMessage("set value", type)); - } - } - - public static void setValueSafe(ValueVector vector, int index, ValueHolder holder) { - MajorType type = vector.getField().getType(); - - switch(type.getMinorType()) { - <#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - switch (type.getMode()) { - case REQUIRED: - ((${minor.class}Vector) vector).getMutator().setSafe(index, (${minor.class}Holder) holder); - return; - case OPTIONAL: - if (((Nullable${minor.class}Holder) holder).isSet == 1) { - ((Nullable${minor.class}Vector) vector).getMutator().setSafe(index, (Nullable${minor.class}Holder) holder); - } else { - ((Nullable${minor.class}Vector) vector).getMutator().isSafe(index); - } - return; - } - - - case GENERIC_OBJECT: - ((ObjectVector) vector).getMutator().setSafe(index, 
(ObjectHolder) holder); - default: - throw new UnsupportedOperationException(buildErrorMessage("set value safe", type)); - } - } - - public static boolean compareValues(ValueVector v1, int v1index, ValueVector v2, int v2index) { - MajorType type1 = v1.getField().getType(); - MajorType type2 = v2.getField().getType(); - - if (type1.getMinorType() != type2.getMinorType()) { - return false; - } - - switch(type1.getMinorType()) { -<#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - if ( ((${minor.class}Vector) v1).getAccessor().get(v1index) == - ((${minor.class}Vector) v2).getAccessor().get(v2index) ) - return true; - break; - - - default: - break; - } - return false; - } - - /** - * Create a ValueHolder of MajorType. - * @param type - * @return - */ - public static ValueHolder createValueHolder(MajorType type) { - switch(type.getMinorType()) { - <#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - - switch (type.getMode()) { - case REQUIRED: - return new ${minor.class}Holder(); - case OPTIONAL: - return new Nullable${minor.class}Holder(); - case REPEATED: - return new Repeated${minor.class}Holder(); - } - - - case GENERIC_OBJECT: - return new ObjectHolder(); - default: - throw new UnsupportedOperationException(buildErrorMessage("create value holder", type)); - } - } - - public static boolean isNull(ValueHolder holder) { - MajorType type = getValueHolderType(holder); - - switch(type.getMinorType()) { - <#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - - switch (type.getMode()) { - case REQUIRED: - return true; - case OPTIONAL: - return ((Nullable${minor.class}Holder) holder).isSet == 0; - case REPEATED: - return true; - } - - - default: - throw new UnsupportedOperationException(buildErrorMessage("check is null", type)); - } - } - - public static ValueHolder deNullify(ValueHolder holder) { - MajorType type = getValueHolderType(holder); - - switch(type.getMinorType()) { - <#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - - switch (type.getMode()) { - case REQUIRED: - return holder; - case OPTIONAL: - if( ((Nullable${minor.class}Holder) holder).isSet == 1) { - ${minor.class}Holder newHolder = new ${minor.class}Holder(); - - <#assign fields = minor.fields!type.fields /> - <#list fields as field> - newHolder.${field.name} = ((Nullable${minor.class}Holder) holder).${field.name}; - - - return newHolder; - } else { - throw new UnsupportedOperationException("You can not convert a null value into a non-null value!"); - } - case REPEATED: - return holder; - } - - - default: - throw new UnsupportedOperationException(buildErrorMessage("deNullify", type)); - } - } - - public static ValueHolder nullify(ValueHolder holder) { - MajorType type = getValueHolderType(holder); - - switch(type.getMinorType()) { - <#list vv.types as type> - <#list type.minor as minor> - case ${minor.class?upper_case} : - switch (type.getMode()) { - case REQUIRED: - Nullable${minor.class}Holder newHolder = new Nullable${minor.class}Holder(); - newHolder.isSet = 1; - <#assign fields = minor.fields!type.fields /> - <#list fields as field> - newHolder.${field.name} = ((${minor.class}Holder) holder).${field.name}; - - return newHolder; - case OPTIONAL: - return holder; - case REPEATED: - throw new UnsupportedOperationException("You can not convert repeated type " + type + " to nullable type!"); - } - - - default: - throw new 
UnsupportedOperationException(buildErrorMessage("nullify", type)); - } - } - - public static MajorType getValueHolderType(ValueHolder holder) { - - if (0 == 1) { - return null; - } - <#list vv.types as type> - <#list type.minor as minor> - else if (holder instanceof ${minor.class}Holder) { - return ((${minor.class}Holder) holder).TYPE; - } else if (holder instanceof Nullable${minor.class}Holder) { - return ((Nullable${minor.class}Holder) holder).TYPE; - } - - - - throw new UnsupportedOperationException("ValueHolder is not supported for 'getValueHolderType' method."); - - } - -} diff --git a/java/vector/src/main/codegen/templates/ComplexCopier.java b/java/vector/src/main/codegen/templates/ComplexCopier.java index 3614231c834..a5756a47ad7 100644 --- a/java/vector/src/main/codegen/templates/ComplexCopier.java +++ b/java/vector/src/main/codegen/templates/ComplexCopier.java @@ -42,13 +42,7 @@ public static void copy(FieldReader input, FieldWriter output) { } private static void writeValue(FieldReader reader, FieldWriter writer) { - final DataMode m = reader.getType().getMode(); - final MinorType mt = reader.getType().getMinorType(); - - switch(m){ - case OPTIONAL: - case REQUIRED: - + final MinorType mt = reader.getMinorType(); switch (mt) { @@ -89,12 +83,10 @@ private static void writeValue(FieldReader reader, FieldWriter writer) { } - break; - } } private static FieldWriter getMapWriterForReader(FieldReader reader, MapWriter writer, String name) { - switch (reader.getType().getMinorType()) { + switch (reader.getMinorType()) { <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> <#assign uncappedName = name?uncap_first/> @@ -108,12 +100,12 @@ private static FieldWriter getMapWriterForReader(FieldReader reader, MapWriter w case LIST: return (FieldWriter) writer.list(name); default: - throw new UnsupportedOperationException(reader.getType().toString()); + throw new UnsupportedOperationException(reader.getMinorType().toString()); } } private static FieldWriter getListWriterForReader(FieldReader reader, ListWriter writer) { - switch (reader.getType().getMinorType()) { + switch (reader.getMinorType()) { <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> <#assign uncappedName = name?uncap_first/> @@ -127,7 +119,7 @@ private static FieldWriter getListWriterForReader(FieldReader reader, ListWriter case LIST: return (FieldWriter) writer.list(); default: - throw new UnsupportedOperationException(reader.getType().toString()); + throw new UnsupportedOperationException(reader.getMinorType().toString()); } } } diff --git a/java/vector/src/main/codegen/templates/ComplexReaders.java b/java/vector/src/main/codegen/templates/ComplexReaders.java index 34c65712601..74a19a605e2 100644 --- a/java/vector/src/main/codegen/templates/ComplexReaders.java +++ b/java/vector/src/main/codegen/templates/ComplexReaders.java @@ -27,10 +27,10 @@ <@pp.dropOutputFile /> <#list vv.types as type> <#list type.minor as minor> -<#list ["", "Repeated"] as mode> +<#list [""] as mode> <#assign lowerName = minor.class?uncap_first /> <#if lowerName == "int" ><#assign lowerName = "integer" /> -<#assign name = mode + minor.class?cap_first /> +<#assign name = minor.class?cap_first /> <#assign javaType = (minor.javaType!type.javaType) /> <#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> <#assign safeType=friendlyType /> @@ -38,9 +38,9 @@ <#assign 
hasFriendly = minor.friendlyType!"no" == "no" /> -<#list ["", "Nullable"] as nullMode> -<#if (mode == "Repeated" && nullMode == "") || mode == "" > -<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${nullMode}${name}ReaderImpl.java" /> +<#list ["Nullable"] as nullMode> +<#if mode == "" > +<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${name}ReaderImpl.java" /> <#include "/@includes/license.ftl" /> package org.apache.arrow.vector.complex.impl; @@ -48,20 +48,20 @@ <#include "/@includes/vv_imports.ftl" /> @SuppressWarnings("unused") -public class ${nullMode}${name}ReaderImpl extends AbstractFieldReader { +public class ${name}ReaderImpl extends AbstractFieldReader { private final ${nullMode}${name}Vector vector; - public ${nullMode}${name}ReaderImpl(${nullMode}${name}Vector vector){ + public ${name}ReaderImpl(${nullMode}${name}Vector vector){ super(); this.vector = vector; } - public MajorType getType(){ - return vector.getField().getType(); + public MinorType getMinorType(){ + return vector.getMinorType(); } - public MaterializedField getField(){ + public Field getField(){ return vector.getField(); } @@ -73,50 +73,13 @@ public boolean isSet(){ } - - - - <#if mode == "Repeated"> - - public void copyAsValue(${minor.class?cap_first}Writer writer){ - Repeated${minor.class?cap_first}WriterImpl impl = (Repeated${minor.class?cap_first}WriterImpl) writer; - impl.vector.copyFromSafe(idx(), impl.idx(), vector); - } - - public void copyAsField(String name, MapWriter writer){ - Repeated${minor.class?cap_first}WriterImpl impl = (Repeated${minor.class?cap_first}WriterImpl) writer.list(name).${lowerName}(); - impl.vector.copyFromSafe(idx(), impl.idx(), vector); - } - - public int size(){ - return vector.getAccessor().getInnerValueCountAt(idx()); - } - - public void read(int arrayIndex, ${minor.class?cap_first}Holder h){ - vector.getAccessor().get(idx(), arrayIndex, h); - } - public void read(int arrayIndex, Nullable${minor.class?cap_first}Holder h){ - vector.getAccessor().get(idx(), arrayIndex, h); - } - - public ${friendlyType} read${safeType}(int arrayIndex){ - return vector.getAccessor().getSingleObject(idx(), arrayIndex); - } - - - public List readObject(){ - return (List) (Object) vector.getAccessor().getObject(idx()); - } - - <#else> - public void copyAsValue(${minor.class?cap_first}Writer writer){ - ${nullMode}${minor.class?cap_first}WriterImpl impl = (${nullMode}${minor.class?cap_first}WriterImpl) writer; + ${minor.class?cap_first}WriterImpl impl = (${minor.class?cap_first}WriterImpl) writer; impl.vector.copyFromSafe(idx(), impl.idx(), vector); } public void copyAsField(String name, MapWriter writer){ - ${nullMode}${minor.class?cap_first}WriterImpl impl = (${nullMode}${minor.class?cap_first}WriterImpl) writer.${lowerName}(name); + ${minor.class?cap_first}WriterImpl impl = (${minor.class?cap_first}WriterImpl) writer.${lowerName}(name); impl.vector.copyFromSafe(idx(), impl.idx(), vector); } @@ -141,9 +104,6 @@ public void copyValue(FieldWriter w){ public Object readObject(){ return vector.getAccessor().getObject(idx()); } - - - } @@ -156,18 +116,10 @@ public Object readObject(){ @SuppressWarnings("unused") public interface ${name}Reader extends BaseReader{ - <#if mode == "Repeated"> - public int size(); - public void read(int arrayIndex, ${minor.class?cap_first}Holder h); - public void read(int arrayIndex, Nullable${minor.class?cap_first}Holder h); - public Object readObject(int arrayIndex); - public ${friendlyType} read${safeType}(int arrayIndex); - <#else> public 
void read(${minor.class?cap_first}Holder h); public void read(Nullable${minor.class?cap_first}Holder h); public Object readObject(); public ${friendlyType} read${safeType}(); - public boolean isSet(); public void copyAsValue(${minor.class}Writer writer); public void copyAsField(String name, ${minor.class}Writer writer); diff --git a/java/vector/src/main/codegen/templates/ComplexWriters.java b/java/vector/src/main/codegen/templates/ComplexWriters.java index 8f9a6e7b971..3457545cea5 100644 --- a/java/vector/src/main/codegen/templates/ComplexWriters.java +++ b/java/vector/src/main/codegen/templates/ComplexWriters.java @@ -19,8 +19,8 @@ <@pp.dropOutputFile /> <#list vv.types as type> <#list type.minor as minor> -<#list ["", "Nullable", "Repeated"] as mode> -<#assign name = mode + minor.class?cap_first /> +<#list ["Nullable"] as mode> +<#assign name = minor.class?cap_first /> <#assign eName = name /> <#assign javaType = (minor.javaType!type.javaType) /> <#assign fields = minor.fields!type.fields /> @@ -38,17 +38,16 @@ @SuppressWarnings("unused") public class ${eName}WriterImpl extends AbstractFieldWriter { - private final ${name}Vector.Mutator mutator; - final ${name}Vector vector; + private final Nullable${name}Vector.Mutator mutator; + final Nullable${name}Vector vector; - public ${eName}WriterImpl(${name}Vector vector, AbstractFieldWriter parent) { - super(parent); + public ${eName}WriterImpl(Nullable${name}Vector vector) { this.mutator = vector.getMutator(); this.vector = vector; } @Override - public MaterializedField getField() { + public Field getField() { return vector.getField(); } @@ -89,12 +88,10 @@ public void write(Nullable${minor.class?cap_first}Holder h) { vector.getMutator().setValueCount(idx()+1); } - <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { mutator.addSafe(idx(), <#list fields as field>${field.name}<#if field_has_next>, ); vector.getMutator().setValueCount(idx()+1); } - public void setPosition(int idx) { super.setPosition(idx); @@ -114,11 +111,17 @@ public void write(Nullable${minor.class}Holder h) { vector.getMutator().setValueCount(idx()+1); } - <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + <#if minor.class == "Decimal"> + public void writeDecimal(int start, ArrowBuf buffer) { + mutator.setSafe(idx(), 1, start, buffer); + vector.getMutator().setValueCount(idx()+1); + } + <#else> public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { - mutator.setSafe(idx(), <#if mode == "Nullable">1, <#list fields as field>${field.name}<#if field_has_next>, ); + mutator.setSafe(idx()<#if mode == "Nullable">, 1<#list fields as field><#if field.include!true >, ${field.name}); vector.getMutator().setValueCount(idx()+1); } + <#if mode == "Nullable"> @@ -128,7 +131,6 @@ public void writeNull() { } - } <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/writer/${eName}Writer.java" /> @@ -141,7 +143,9 @@ public void writeNull() { public interface ${eName}Writer extends BaseWriter { public void write(${minor.class}Holder h); - <#if !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == 
"Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> + <#if minor.class == "Decimal"> + public void writeDecimal(int start, ArrowBuf buffer); + <#else> public void write${minor.class}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ); } diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java index 18fcac93bb6..fe2b5c5b5bc 100644 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -43,20 +43,42 @@ public final class ${minor.class}Vector extends BaseDataValueVector implements FixedWidthVector{ private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${minor.class}Vector.class); - private final FieldReader reader = new ${minor.class}ReaderImpl(${minor.class}Vector.this); private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION * ${type.width}; private int allocationMonitor = 0; - public ${minor.class}Vector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); + <#if minor.class == "Decimal"> + + private int precision; + private int scale; + + public ${minor.class}Vector(String name, BufferAllocator allocator, int precision, int scale) { + super(name, allocator); + this.precision = precision; + this.scale = scale; + } + <#else> + public ${minor.class}Vector(String name, BufferAllocator allocator) { + super(name, allocator); + } + + + + @Override + public MinorType getMinorType() { + return MinorType.${minor.class?upper_case}; + } + + @Override + public Field getField() { + throw new UnsupportedOperationException("internal vector"); } @Override public FieldReader getReader(){ - return reader; + throw new UnsupportedOperationException("non-nullable vectors cannot be used in readers"); } @Override @@ -162,7 +184,7 @@ public void reAlloc() { throw new OversizedAllocationException("Unable to expand the buffer. Max allowed buffer size is reached."); } - logger.debug("Reallocating vector [{}]. # of bytes: [{}] -> [{}]", field, allocationSizeInBytes, newAllocationSize); + logger.debug("Reallocating vector [{}]. 
# of bytes: [{}] -> [{}]", name, allocationSizeInBytes, newAllocationSize); final ArrowBuf newBuf = allocator.buffer((int)newAllocationSize); newBuf.setBytes(0, data, 0, data.capacity()); final int halfNewCapacity = newBuf.capacity() / 2; @@ -181,30 +203,13 @@ public void zeroVector() { data.setZero(0, data.capacity()); } -// @Override -// public void load(SerializedField metadata, ArrowBuf buffer) { -// Preconditions.checkArgument(this.field.getPath().equals(metadata.getNamePart().getName()), "The field %s doesn't match the provided metadata %s.", this.field, metadata); -// final int actualLength = metadata.getBufferLength(); -// final int valueCount = metadata.getValueCount(); -// final int expectedLength = valueCount * ${type.width}; -// assert actualLength == expectedLength : String.format("Expected to load %d bytes but actually loaded %d bytes", expectedLength, actualLength); -// -// clear(); -// if (data != null) { -// data.release(1); -// } -// data = buffer.slice(0, actualLength); -// data.retain(1); -// data.writerIndex(actualLength); -// } - public TransferPair getTransferPair(BufferAllocator allocator){ - return new TransferImpl(getField(), allocator); + return new TransferImpl(name, allocator); } @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(getField().withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -230,8 +235,12 @@ public void splitAndTransferTo(int startIndex, int length, ${minor.class}Vector private class TransferImpl implements TransferPair{ private ${minor.class}Vector to; - public TransferImpl(MaterializedField field, BufferAllocator allocator){ - to = new ${minor.class}Vector(field, allocator); + public TransferImpl(String name, BufferAllocator allocator){ + <#if minor.class == "Decimal"> + to = new ${minor.class}Vector(name, allocator, precision, scale); + <#else> + to = new ${minor.class}Vector(name, allocator); + } public TransferImpl(${minor.class}Vector to) { @@ -260,7 +269,7 @@ public void copyValueSafe(int fromIndex, int toIndex) { } public void copyFrom(int fromIndex, int thisIndex, ${minor.class}Vector from){ - <#if (type.width > 8)> + <#if (type.width > 8 || minor.class == "IntervalDay")> from.data.getBytes(fromIndex * ${type.width}, data, thisIndex * ${type.width}, ${type.width}); <#else> <#-- type.width <= 8 --> data.set${(minor.javaType!type.javaType)?cap_first}(thisIndex * ${type.width}, @@ -298,7 +307,7 @@ public boolean isNull(int index){ return false; } - <#if (type.width > 8)> + <#if (type.width > 8 || minor.class == "IntervalDay")> public ${minor.javaType!type.javaType} get(int index) { return data.slice(index * ${type.width}, ${type.width}); @@ -416,31 +425,30 @@ public StringBuilder getAsStringBuilder(int index) { append(millis)); } - <#elseif (minor.class == "Decimal28Sparse") || (minor.class == "Decimal38Sparse") || (minor.class == "Decimal28Dense") || (minor.class == "Decimal38Dense")> + <#elseif minor.class == "Decimal"> public void get(int index, ${minor.class}Holder holder) { holder.start = index * ${type.width}; holder.buffer = data; - holder.scale = getField().getScale(); - holder.precision = getField().getPrecision(); + holder.scale = scale; + holder.precision = precision; } public void get(int index, Nullable${minor.class}Holder holder) { holder.isSet = 1; holder.start = index * ${type.width}; holder.buffer = data; - holder.scale = getField().getScale(); - holder.precision = getField().getPrecision(); + holder.scale = scale; + 
holder.precision = precision; } - @Override - public ${friendlyType} getObject(int index) { - <#if (minor.class == "Decimal28Sparse") || (minor.class == "Decimal38Sparse")> - // Get the BigDecimal object - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromSparse(data, index * ${type.width}, ${minor.nDecimalDigits}, getField().getScale()); - <#else> - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(data, index * ${type.width}, ${minor.nDecimalDigits}, getField().getScale(), ${minor.maxPrecisionDigits}, ${type.width}); - + @Override + public ${friendlyType} getObject(int index) { + byte[] bytes = new byte[${type.width}]; + int start = ${type.width} * index; + data.getBytes(start, bytes, 0, ${type.width}); + ${friendlyType} value = new BigDecimal(new BigInteger(bytes), scale); + return value; } <#else> @@ -581,7 +589,7 @@ public final class Mutator extends BaseDataValueVector.BaseMutator { * @param index position of the bit to set * @param value value to set */ - <#if (type.width > 8)> + <#if (type.width > 8) || minor.class == "IntervalDay"> public void set(int index, <#if (type.width > 4)>${minor.javaType!type.javaType}<#else>int value) { data.setBytes(index * ${type.width}, value, 0, ${type.width}); } @@ -653,7 +661,7 @@ public void setSafe(int index, Nullable${minor.class}Holder holder){ setSafe(index, holder.days, holder.milliseconds); } - <#elseif (minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse") || (minor.class == "Decimal28Dense") || (minor.class == "Decimal38Dense")> + <#elseif minor.class == "Decimal"> public void set(int index, ${minor.class}Holder holder){ set(index, holder.start, holder.buffer); diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java index 3005fca0385..1ed9287b00e 100644 --- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java +++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java @@ -19,9 +19,8 @@ <@pp.dropOutputFile /> <#list vv.types as type> <#list type.minor as minor> -<#list ["", "Nullable", "Repeated"] as holderMode> +<#list ["", "Nullable"] as holderMode> <#assign nullMode = holderMode /> -<#if holderMode == "Repeated"><#assign nullMode = "Nullable" /> <#assign lowerName = minor.class?uncap_first /> <#if lowerName == "int" ><#assign lowerName = "integer" /> @@ -50,42 +49,18 @@ public class ${holderMode}${name}HolderReaderImpl extends AbstractFieldReader { private ${nullMode}${name}Holder holder; -<#if holderMode == "Repeated" > - private int index = -1; - private ${holderMode}${name}Holder repeatedHolder; - - public ${holderMode}${name}HolderReaderImpl(${holderMode}${name}Holder holder) { -<#if holderMode == "Repeated" > - this.holder = new ${nullMode}${name}Holder(); - this.repeatedHolder = holder; -<#else> this.holder = holder; - } @Override public int size() { -<#if holderMode == "Repeated"> - return repeatedHolder.end - repeatedHolder.start; -<#else> throw new UnsupportedOperationException("You can't call size on a Holder value reader."); - } @Override public boolean next() { -<#if holderMode == "Repeated"> - if(index + 1 < repeatedHolder.end) { - index++; - repeatedHolder.vector.getAccessor().get(repeatedHolder.start + index, holder); - return true; - } else { - return false; - } -<#else> throw new UnsupportedOperationException("You can't call next on a single value reader."); - } @@ -95,19 +70,13 @@ public void setPosition(int index) { } @Override - public MajorType getType() { -<#if 
holderMode == "Repeated"> - return this.repeatedHolder.TYPE; -<#else> - return this.holder.TYPE; - + public MinorType getMinorType() { + return MinorType.${name?upper_case}; } @Override public boolean isSet() { - <#if holderMode == "Repeated"> - return this.repeatedHolder.end!=this.repeatedHolder.start; - <#elseif nullMode == "Nullable"> + <#if holderMode == "Nullable"> return this.holder.isSet == 1; <#else> return true; @@ -115,7 +84,6 @@ public boolean isSet() { } -<#if holderMode != "Repeated"> @Override public void read(${name}Holder h) { <#list fields as field> @@ -130,19 +98,7 @@ public void read(Nullable${name}Holder h) { h.isSet = isSet() ? 1 : 0; } - -<#if holderMode == "Repeated"> - @Override - public ${friendlyType} read${safeType}(int index){ - repeatedHolder.vector.getAccessor().get(repeatedHolder.start + index, holder); - ${friendlyType} value = read${safeType}(); - if (this.index > -1) { - repeatedHolder.vector.getAccessor().get(repeatedHolder.start + this.index, holder); - } - return value; - } - @Override public ${friendlyType} read${safeType}(){ @@ -176,29 +132,10 @@ public void read(Nullable${name}Holder h) { Period p = new Period(); return p.plusDays(holder.days).plusMillis(holder.milliseconds); -<#elseif minor.class == "Decimal9" || - minor.class == "Decimal18" > - BigInteger value = BigInteger.valueOf(holder.value); - return new BigDecimal(value, holder.scale); - -<#elseif minor.class == "Decimal28Dense" || - minor.class == "Decimal38Dense"> - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(holder.buffer, - holder.start, - holder.nDecimalDigits, - holder.scale, - holder.maxPrecision, - holder.WIDTH); - -<#elseif minor.class == "Decimal28Sparse" || - minor.class == "Decimal38Sparse"> - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromSparse(holder.buffer, - holder.start, - holder.nDecimalDigits, - holder.scale); - <#elseif minor.class == "Bit" > return new Boolean(holder.value != 0); +<#elseif minor.class == "Decimal" > + return (BigDecimal) readSingleObject(); <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; @@ -208,15 +145,7 @@ public void read(Nullable${name}Holder h) { @Override public Object readObject() { -<#if holderMode == "Repeated" > - List valList = Lists.newArrayList(); - for (int i = repeatedHolder.start; i < repeatedHolder.end; i++) { - valList.add(repeatedHolder.vector.getAccessor().getObject(i)); - } - return valList; -<#else> return readSingleObject(); - } private Object readSingleObject() { @@ -239,6 +168,9 @@ private Object readSingleObject() { Text text = new Text(); text.set(value); return text; +<#elseif minor.class == "Decimal" > + return new BigDecimal(new BigInteger(value), holder.scale); + <#elseif minor.class == "Interval"> @@ -249,11 +181,6 @@ private Object readSingleObject() { Period p = new Period(); return p.plusDays(holder.days).plusMillis(holder.milliseconds); -<#elseif minor.class == "Decimal9" || - minor.class == "Decimal18" > - BigInteger value = BigInteger.valueOf(holder.value); - return new BigDecimal(value, holder.scale); - <#elseif minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense"> return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromDense(holder.buffer, @@ -272,13 +199,18 @@ private Object readSingleObject() { <#elseif minor.class == "Bit" > return new Boolean(holder.value != 0); +<#elseif minor.class == "Decimal"> + byte[] bytes = new byte[${type.width}]; + holder.buffer.getBytes(holder.start, bytes, 0, 
${type.width}); + ${friendlyType} value = new BigDecimal(new BigInteger(bytes), holder.scale); + return value; <#else> ${friendlyType} value = new ${friendlyType}(this.holder.value); return value; } -<#if holderMode != "Repeated" && nullMode != "Nullable"> +<#if nullMode != "Nullable"> public void copyAsValue(${minor.class?cap_first}Writer writer){ writer.write(holder); } diff --git a/java/vector/src/main/codegen/templates/ListWriters.java b/java/vector/src/main/codegen/templates/ListWriters.java deleted file mode 100644 index 94b812b83dc..00000000000 --- a/java/vector/src/main/codegen/templates/ListWriters.java +++ /dev/null @@ -1,234 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -<@pp.dropOutputFile /> - -<#list ["Single", "Repeated"] as mode> -<@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}ListWriter.java" /> - - -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector.complex.impl; -<#if mode == "Single"> - <#assign containerClass = "AbstractContainerVector" /> - <#assign index = "idx()"> -<#else> - <#assign containerClass = "RepeatedListVector" /> - <#assign index = "currentChildIndex"> - - - -<#include "/@includes/vv_imports.ftl" /> - -/* - * This class is generated using FreeMarker and the ${.template_name} template. - */ -@SuppressWarnings("unused") -public class ${mode}ListWriter extends AbstractFieldWriter { - private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${mode}ListWriter.class); - - static enum Mode { INIT, IN_MAP, IN_LIST <#list vv.types as type><#list type.minor as minor>, IN_${minor.class?upper_case} } - - private final String name; - protected final ${containerClass} container; - private Mode mode = Mode.INIT; - private FieldWriter writer; - protected RepeatedValueVector innerVector; - - <#if mode == "Repeated">private int currentChildIndex = 0; - public ${mode}ListWriter(String name, ${containerClass} container, FieldWriter parent){ - super(parent); - this.name = name; - this.container = container; - } - - public ${mode}ListWriter(${containerClass} container, FieldWriter parent){ - super(parent); - this.name = null; - this.container = container; - } - - @Override - public void allocate() { - if(writer != null) { - writer.allocate(); - } - - <#if mode == "Repeated"> - container.allocateNew(); - - } - - @Override - public void clear() { - if (writer != null) { - writer.clear(); - } - } - - @Override - public void close() { - clear(); - container.close(); - if (innerVector != null) { - innerVector.close(); - } - } - - @Override - public int getValueCapacity() { - return innerVector == null ? 
0 : innerVector.getValueCapacity(); - } - - public void setValueCount(int count){ - if(innerVector != null) innerVector.getMutator().setValueCount(count); - } - - @Override - public MapWriter map() { - switch(mode) { - case INIT: - int vectorCount = container.size(); - final RepeatedMapVector vector = container.addOrGet(name, RepeatedMapVector.TYPE, RepeatedMapVector.class); - innerVector = vector; - writer = new RepeatedMapWriter(vector, this); - if(vectorCount != container.size()) { - writer.allocate(); - } - writer.setPosition(${index}); - mode = Mode.IN_MAP; - return writer; - case IN_MAP: - return writer; - } - - throw new RuntimeException(getUnsupportedErrorMsg("MAP", mode.name())); - - } - - @Override - public ListWriter list() { - switch(mode) { - case INIT: - final int vectorCount = container.size(); - final RepeatedListVector vector = container.addOrGet(name, RepeatedListVector.TYPE, RepeatedListVector.class); - innerVector = vector; - writer = new RepeatedListWriter(null, vector, this); - if(vectorCount != container.size()) { - writer.allocate(); - } - writer.setPosition(${index}); - mode = Mode.IN_LIST; - return writer; - case IN_LIST: - return writer; - } - - throw new RuntimeException(getUnsupportedErrorMsg("LIST", mode.name())); - - } - - <#list vv.types as type><#list type.minor as minor> - <#assign lowerName = minor.class?uncap_first /> - <#assign upperName = minor.class?upper_case /> - <#assign capName = minor.class?cap_first /> - <#if lowerName == "int" ><#assign lowerName = "integer" /> - - private static final MajorType ${upperName}_TYPE = Types.repeated(MinorType.${upperName}); - - @Override - public ${capName}Writer ${lowerName}() { - switch(mode) { - case INIT: - final int vectorCount = container.size(); - final Repeated${capName}Vector vector = container.addOrGet(name, ${upperName}_TYPE, Repeated${capName}Vector.class); - innerVector = vector; - writer = new Repeated${capName}WriterImpl(vector, this); - if(vectorCount != container.size()) { - writer.allocate(); - } - writer.setPosition(${index}); - mode = Mode.IN_${upperName}; - return writer; - case IN_${upperName}: - return writer; - } - - throw new RuntimeException(getUnsupportedErrorMsg("${upperName}", mode.name())); - - } - - - public MaterializedField getField() { - return container.getField(); - } - - <#if mode == "Repeated"> - - public void startList() { - final RepeatedListVector list = (RepeatedListVector) container; - final RepeatedListVector.RepeatedMutator mutator = list.getMutator(); - - // make sure that the current vector can support the end position of this list. - if(container.getValueCapacity() <= idx()) { - mutator.setValueCount(idx()+1); - } - - // update the repeated vector to state that there is current+1 objects. - final RepeatedListHolder h = new RepeatedListHolder(); - list.getAccessor().get(idx(), h); - if (h.start >= h.end) { - mutator.startNewValue(idx()); - } - currentChildIndex = container.getMutator().add(idx()); - if(writer != null) { - writer.setPosition(currentChildIndex); - } - } - - public void endList() { - // noop, we initialize state at start rather than end. - } - <#else> - - public void setPosition(int index) { - super.setPosition(index); - if(writer != null) { - writer.setPosition(index); - } - } - - public void startList() { - // noop - } - - public void endList() { - // noop - } - - - private String getUnsupportedErrorMsg(String expected, String found) { - final String f = found.substring(3); - return String.format("In a list of type %s, encountered a value of type %s. 
"+ - "Arrow does not support lists of different types.", - f, expected - ); - } -} - diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 42f39820393..af2922826ec 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -17,7 +17,7 @@ */ <@pp.dropOutputFile /> -<#list ["Single", "Repeated"] as mode> +<#list ["Single"] as mode> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}MapWriter.java" /> <#if mode == "Single"> <#assign containerClass = "MapVector" /> @@ -51,16 +51,8 @@ public class ${mode}MapWriter extends AbstractFieldWriter { private final Map fields = Maps.newHashMap(); <#if mode == "Repeated">private int currentChildIndex = 0; - private final boolean unionEnabled; - - public ${mode}MapWriter(${containerClass} container, FieldWriter parent, boolean unionEnabled) { - super(parent); + public ${mode}MapWriter(${containerClass} container) { this.container = container; - this.unionEnabled = unionEnabled; - } - - public ${mode}MapWriter(${containerClass} container, FieldWriter parent) { - this(container, parent, false); } @Override @@ -74,7 +66,7 @@ public boolean isEmptyMap() { } @Override - public MaterializedField getField() { + public Field getField() { return container.getField(); } @@ -83,12 +75,8 @@ public MapWriter map(String name) { FieldWriter writer = fields.get(name.toLowerCase()); if(writer == null){ int vectorCount=container.size(); - MapVector vector = container.addOrGet(name, MapVector.TYPE, MapVector.class); - if(!unionEnabled){ - writer = new SingleMapWriter(vector, this); - } else { - writer = new PromotableWriter(vector, container); - } + MapVector vector = container.addOrGet(name, MinorType.MAP, MapVector.class); + writer = new PromotableWriter(vector, container); if(vectorCount != container.size()) { writer.allocate(); } @@ -125,11 +113,7 @@ public ListWriter list(String name) { FieldWriter writer = fields.get(name.toLowerCase()); int vectorCount = container.size(); if(writer == null) { - if (!unionEnabled){ - writer = new SingleListWriter(name,container,this); - } else{ - writer = new PromotableWriter(container.addOrGet(name, Types.optional(MinorType.LIST), ListVector.class), container); - } + writer = new PromotableWriter(container.addOrGet(name, MinorType.LIST, ListVector.class), container); if (container.size() > vectorCount) { writer.allocate(); } @@ -206,9 +190,7 @@ public void end() { } public ${minor.class}Writer ${lowerName}(String name, int scale, int precision) { - final MajorType ${upperName}_TYPE = new MajorType(MinorType.${upperName}, DataMode.OPTIONAL, precision, scale, 0, null); <#else> - private static final MajorType ${upperName}_TYPE = Types.optional(MinorType.${upperName}); @Override public ${minor.class}Writer ${lowerName}(String name) { @@ -216,15 +198,9 @@ public void end() { if(writer == null) { ValueVector vector; ValueVector currentVector = container.getChild(name); - if (unionEnabled){ - ${vectName}Vector v = container.addOrGet(name, ${upperName}_TYPE, ${vectName}Vector.class); - writer = new PromotableWriter(v, container); - vector = v; - } else { - ${vectName}Vector v = container.addOrGet(name, ${upperName}_TYPE, ${vectName}Vector.class); - writer = new ${vectName}WriterImpl(v, this); - vector = v; - } + ${vectName}Vector v = container.addOrGet(name, MinorType.${upperName}, ${vectName}Vector.class); + writer = new PromotableWriter(v, container); + vector = v; if 
(currentVector == null || currentVector != vector) { vector.allocateNewSafe(); } diff --git a/java/vector/src/main/codegen/templates/NullReader.java b/java/vector/src/main/codegen/templates/NullReader.java index 3ef6c7dcc49..ba0c088add7 100644 --- a/java/vector/src/main/codegen/templates/NullReader.java +++ b/java/vector/src/main/codegen/templates/NullReader.java @@ -16,6 +16,9 @@ * limitations under the License. */ +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.Field; + <@pp.dropOutputFile /> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/NullReader.java" /> @@ -31,25 +34,31 @@ public class NullReader extends AbstractBaseReader implements FieldReader{ public static final NullReader INSTANCE = new NullReader(); - public static final NullReader EMPTY_LIST_INSTANCE = new NullReader(Types.repeated(MinorType.NULL)); - public static final NullReader EMPTY_MAP_INSTANCE = new NullReader(Types.required(MinorType.MAP)); - private MajorType type; + public static final NullReader EMPTY_LIST_INSTANCE = new NullReader(MinorType.NULL); + public static final NullReader EMPTY_MAP_INSTANCE = new NullReader(MinorType.MAP); + private MinorType type; private NullReader(){ super(); - type = Types.required(MinorType.NULL); + type = MinorType.NULL; } - private NullReader(MajorType type){ + private NullReader(MinorType type){ super(); this.type = type; } @Override - public MajorType getType() { + public MinorType getMinorType() { return type; } - + + + @Override + public Field getField() { + return new Field("", true, new Null(), null); + } + public void copyAsValue(MapWriter writer) {} public void copyAsValue(ListWriter writer) {} diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index b0029f7ad4c..df508979c48 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -42,19 +42,79 @@ public final class ${className} extends BaseDataValueVector implements <#if type.major == "VarLen">VariableWidth<#else>FixedWidthVector, NullableVector{ private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); - private final FieldReader reader = new Nullable${minor.class}ReaderImpl(Nullable${minor.class}Vector.this); + private final FieldReader reader = new ${minor.class}ReaderImpl(Nullable${minor.class}Vector.this); - private final MaterializedField bitsField = MaterializedField.create("$bits$", new MajorType(MinorType.UINT1, DataMode.REQUIRED)); - private final MaterializedField valuesField = MaterializedField.create("$values$", new MajorType(field.getType().getMinorType(), DataMode.REQUIRED, field.getPrecision(), field.getScale())); + private final String bitsField = "$bits$"; + private final String valuesField = "$values$"; + private final Field field; final UInt1Vector bits = new UInt1Vector(bitsField, allocator); - final ${valuesName} values = new ${minor.class}Vector(valuesField, allocator); + final ${valuesName} values; - private final Mutator mutator = new Mutator(); - private final Accessor accessor = new Accessor(); + private final Mutator mutator; + private final Accessor accessor; - public ${className}(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); + <#if minor.class == "Decimal"> + private final int precision; + private final int scale; + + public ${className}(String name, 
BufferAllocator allocator, int precision, int scale) { + super(name, allocator); + values = new ${minor.class}Vector(valuesField, allocator, precision, scale); + this.precision = precision; + this.scale = scale; + mutator = new Mutator(); + accessor = new Accessor(); + field = new Field(name, true, new Decimal(precision, scale), null); + } + <#else> + public ${className}(String name, BufferAllocator allocator) { + super(name, allocator); + values = new ${minor.class}Vector(valuesField, allocator); + mutator = new Mutator(); + accessor = new Accessor(); + <#if minor.class == "TinyInt" || + minor.class == "SmallInt" || + minor.class == "Int" || + minor.class == "BigInt"> + field = new Field(name, true, new Int(${type.width} * 8, true), null); + <#elseif minor.class == "UInt1" || + minor.class == "UInt2" || + minor.class == "UInt4" || + minor.class == "UInt8"> + field = new Field(name, true, new Int(${type.width} * 8, false), null); + <#elseif minor.class == "Date"> + field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Date(), null); + <#elseif minor.class == "Time"> + field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Time(), null); + <#elseif minor.class == "Float4"> + field = new Field(name, true, new FloatingPoint(0), null); + <#elseif minor.class == "Float8"> + field = new Field(name, true, new FloatingPoint(1), null); + <#elseif minor.class == "TimeStamp"> + field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(""), null); + <#elseif minor.class == "IntervalDay"> + field = new Field(name, true, new IntervalDay(), null); + <#elseif minor.class == "IntervalYear"> + field = new Field(name, true, new IntervalYear(), null); + <#elseif minor.class == "VarChar"> + field = new Field(name, true, new Utf8(), null); + <#elseif minor.class == "VarBinary"> + field = new Field(name, true, new Binary(), null); + <#elseif minor.class == "Bit"> + field = new Field(name, true, new Bool(), null); + + } + + + @Override + public Field getField() { + return field; + } + + @Override + public MinorType getMinorType() { + return MinorType.${minor.class?upper_case}; } @Override @@ -240,12 +300,13 @@ public void zeroVector() { @Override public TransferPair getTransferPair(BufferAllocator allocator){ - return new TransferImpl(getField(), allocator); + return new TransferImpl(name, allocator); + } @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(getField().withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -273,8 +334,12 @@ public void splitAndTransferTo(int startIndex, int length, Nullable${minor.class private class TransferImpl implements TransferPair { Nullable${minor.class}Vector to; - public TransferImpl(MaterializedField field, BufferAllocator allocator){ - to = new Nullable${minor.class}Vector(field, allocator); + public TransferImpl(String name, BufferAllocator allocator){ + <#if minor.class == "Decimal"> + to = new Nullable${minor.class}Vector(name, allocator, precision, scale); + <#else> + to = new Nullable${minor.class}Vector(name, allocator); + } public TransferImpl(Nullable${minor.class}Vector to){ @@ -312,17 +377,6 @@ public Mutator getMutator(){ return mutator; } - public ${minor.class}Vector convertToRequiredVector(){ - ${minor.class}Vector v = new ${minor.class}Vector(getField().getOtherNullableVersion(), allocator); - if (v.data != null) { - v.data.release(1); - } - v.data = values.data; - v.data.retain(1); - 
clear(); - return v; - } - public void copyFrom(int fromIndex, int thisIndex, Nullable${minor.class}Vector from){ final Accessor fromAccessor = from.getAccessor(); if (!fromAccessor.isNull(fromIndex)) { @@ -389,8 +443,8 @@ public void get(int index, Nullable${minor.class}Holder holder){ holder.isSet = bAccessor.get(index); <#if minor.class.startsWith("Decimal")> - holder.scale = getField().getScale(); - holder.precision = getField().getPrecision(); + holder.scale = scale; + holder.precision = precision; } diff --git a/java/vector/src/main/codegen/templates/RepeatedValueVectors.java b/java/vector/src/main/codegen/templates/RepeatedValueVectors.java deleted file mode 100644 index ceae53bbf58..00000000000 --- a/java/vector/src/main/codegen/templates/RepeatedValueVectors.java +++ /dev/null @@ -1,421 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -<@pp.dropOutputFile /> -<#list vv.types as type> -<#list type.minor as minor> -<#assign friendlyType = (minor.friendlyType!minor.boxedType!type.boxedType) /> -<#assign fields = minor.fields!type.fields /> - -<@pp.changeOutputFile name="/org/apache/arrow/vector/Repeated${minor.class}Vector.java" /> -<#include "/@includes/license.ftl" /> - -package org.apache.arrow.vector; - -<#include "/@includes/vv_imports.ftl" /> - -/** - * Repeated${minor.class} implements a vector with multiple values per row (e.g. JSON array or - * repeated protobuf field). The implementation uses two additional value vectors; one to convert - * the index offset to the underlying element offset, and another to store the number of values - * in the vector. - * - * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. - */ - -public final class Repeated${minor.class}Vector extends BaseRepeatedValueVector implements Repeated<#if type.major == "VarLen">VariableWidth<#else>FixedWidthVectorLike { - //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(Repeated${minor.class}Vector.class); - - // we maintain local reference to concrete vector type for performance reasons.
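// Illustrative aside (editorial sketch, not part of the deleted template; data hypothetical):
// for the rows [1, 2], [], [3] the offsets/values layout described above would hold
//   offsets: 0 2 2 3   (row i spans values[offsets[i] .. offsets[i+1]))
//   values:  1 2 3
// so getInnerValueCountAt(i) == offsets.get(i+1) - offsets.get(i), and an empty row is simply
// two equal consecutive offsets.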
- ${minor.class}Vector values; - private final FieldReader reader = new Repeated${minor.class}ReaderImpl(Repeated${minor.class}Vector.this); - private final Mutator mutator = new Mutator(); - private final Accessor accessor = new Accessor(); - - public Repeated${minor.class}Vector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); - addOrGetVector(VectorDescriptor.create(new MajorType(field.getType().getMinorType(), DataMode.REQUIRED))); - } - - @Override - public Mutator getMutator() { - return mutator; - } - - @Override - public Accessor getAccessor() { - return accessor; - } - - @Override - public FieldReader getReader() { - return reader; - } - - @Override - public ${minor.class}Vector getDataVector() { - return values; - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - return new TransferImpl(getField(), allocator); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(getField().withPath(ref), allocator); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new TransferImpl((Repeated${minor.class}Vector) to); - } - - @Override - public AddOrGetResult<${minor.class}Vector> addOrGetVector(VectorDescriptor descriptor) { - final AddOrGetResult<${minor.class}Vector> result = super.addOrGetVector(descriptor); - if (result.isCreated()) { - values = result.getVector(); - } - return result; - } - - public void transferTo(Repeated${minor.class}Vector target) { - target.clear(); - offsets.transferTo(target.offsets); - values.transferTo(target.values); - clear(); - } - - public void splitAndTransferTo(final int startIndex, final int groups, Repeated${minor.class}Vector to) { - final UInt4Vector.Accessor a = offsets.getAccessor(); - final UInt4Vector.Mutator m = to.offsets.getMutator(); - - final int startPos = a.get(startIndex); - final int endPos = a.get(startIndex + groups); - final int valuesToCopy = endPos - startPos; - - values.splitAndTransferTo(startPos, valuesToCopy, to.values); - to.offsets.clear(); - to.offsets.allocateNew(groups + 1); - int normalizedPos = 0; - for (int i=0; i < groups + 1;i++ ) { - normalizedPos = a.get(startIndex+i) - startPos; - m.set(i, normalizedPos); - } - m.setValueCount(groups == 0 ? 
0 : groups + 1); - } - - private class TransferImpl implements TransferPair { - final Repeated${minor.class}Vector to; - - public TransferImpl(MaterializedField field, BufferAllocator allocator) { - this.to = new Repeated${minor.class}Vector(field, allocator); - } - - public TransferImpl(Repeated${minor.class}Vector to) { - this.to = to; - } - - @Override - public Repeated${minor.class}Vector getTo() { - return to; - } - - @Override - public void transfer() { - transferTo(to); - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - splitAndTransferTo(startIndex, length, to); - } - - @Override - public void copyValueSafe(int fromIndex, int toIndex) { - to.copyFromSafe(fromIndex, toIndex, Repeated${minor.class}Vector.this); - } - } - - public void copyFrom(int inIndex, int outIndex, Repeated${minor.class}Vector v) { - final Accessor vAccessor = v.getAccessor(); - final int count = vAccessor.getInnerValueCountAt(inIndex); - mutator.startNewValue(outIndex); - for (int i = 0; i < count; i++) { - mutator.add(outIndex, vAccessor.get(inIndex, i)); - } - } - - public void copyFromSafe(int inIndex, int outIndex, Repeated${minor.class}Vector v) { - final Accessor vAccessor = v.getAccessor(); - final int count = vAccessor.getInnerValueCountAt(inIndex); - mutator.startNewValue(outIndex); - for (int i = 0; i < count; i++) { - mutator.addSafe(outIndex, vAccessor.get(inIndex, i)); - } - } - - public boolean allocateNewSafe() { - /* boolean to keep track if all the memory allocations were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; - try { - if(!offsets.allocateNewSafe()) return false; - if(!values.allocateNewSafe()) return false; - success = true; - } finally { - if (!success) { - clear(); - } - } - offsets.zeroVector(); - mutator.reset(); - return true; - } - - @Override - public void allocateNew() { - try { - offsets.allocateNew(); - values.allocateNew(); - } catch (OutOfMemoryException e) { - clear(); - throw e; - } - offsets.zeroVector(); - mutator.reset(); - } - - <#if type.major == "VarLen"> -// @Override -// protected SerializedField.Builder getMetadataBuilder() { -// return super.getMetadataBuilder() -// .setVarByteLength(values.getVarByteLength()); -// } - - public void allocateNew(int totalBytes, int valueCount, int innerValueCount) { - try { - offsets.allocateNew(valueCount + 1); - values.allocateNew(totalBytes, innerValueCount); - } catch (OutOfMemoryException e) { - clear(); - throw e; - } - offsets.zeroVector(); - mutator.reset(); - } - - public int getByteCapacity(){ - return values.getByteCapacity(); - } - - <#else> - - @Override - public void allocateNew(int valueCount, int innerValueCount) { - clear(); - /* boolean to keep track if all the memory allocations were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors.
If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; - try { - offsets.allocateNew(valueCount + 1); - values.allocateNew(innerValueCount); - } catch(OutOfMemoryException e){ - clear(); - throw e; - } - offsets.zeroVector(); - mutator.reset(); - } - - - - // This is declared a subclass of the accessor declared inside of FixedWidthVector, this is also used for - // variable length vectors, as they should have consistent interface as much as possible, if they need to diverge - // in the future, the interface should be declared in the respective value vector superclasses for fixed and variable - // and we should refer to each in the generation template - public final class Accessor extends BaseRepeatedValueVector.BaseRepeatedAccessor { - @Override - public List<${friendlyType}> getObject(int index) { - final List<${friendlyType}> vals = new JsonStringArrayList<>(); - final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor(); - final int start = offsetsAccessor.get(index); - final int end = offsetsAccessor.get(index + 1); - final ${minor.class}Vector.Accessor valuesAccessor = values.getAccessor(); - for(int i = start; i < end; i++) { - vals.add(valuesAccessor.getObject(i)); - } - return vals; - } - - public ${friendlyType} getSingleObject(int index, int arrayIndex) { - final int start = offsets.getAccessor().get(index); - return values.getAccessor().getObject(start + arrayIndex); - } - - /** - * Get a value for the given record. Each element in the repeated field is accessed by - * the positionIndex param. - * - * @param index record containing the repeated field - * @param positionIndex position within the repeated field - * @return element at the given position in the given record - */ - public <#if type.major == "VarLen">byte[] - <#else>${minor.javaType!type.javaType} - get(int index, int positionIndex) { - return values.getAccessor().get(offsets.getAccessor().get(index) + positionIndex); - } - - public void get(int index, Repeated${minor.class}Holder holder) { - holder.start = offsets.getAccessor().get(index); - holder.end = offsets.getAccessor().get(index+1); - holder.vector = values; - } - - public void get(int index, int positionIndex, ${minor.class}Holder holder) { - final int offset = offsets.getAccessor().get(index); - assert offset >= 0; - assert positionIndex < getInnerValueCountAt(index); - values.getAccessor().get(offset + positionIndex, holder); - } - - public void get(int index, int positionIndex, Nullable${minor.class}Holder holder) { - final int offset = offsets.getAccessor().get(index); - assert offset >= 0; - if (positionIndex >= getInnerValueCountAt(index)) { - holder.isSet = 0; - return; - } - values.getAccessor().get(offset + positionIndex, holder); - } - } - - public final class Mutator extends BaseRepeatedValueVector.BaseRepeatedMutator implements RepeatedMutator { - private Mutator() {} - - /** - * Add an element to the given record index. This is similar to the set() method in other - * value vectors, except that it permits setting multiple values for a single record.
- * - * @param index record of the element to add - * @param value value to add to the given row - */ - public void add(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { - int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().set(nextOffset, value); - offsets.getMutator().set(index+1, nextOffset+1); - } - - <#if type.major == "VarLen"> - public void addSafe(int index, byte[] bytes) { - addSafe(index, bytes, 0, bytes.length); - } - - public void addSafe(int index, byte[] bytes, int start, int length) { - final int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().setSafe(nextOffset, bytes, start, length); - offsets.getMutator().setSafe(index+1, nextOffset+1); - } - - <#else> - - public void addSafe(int index, ${minor.javaType!type.javaType} srcValue) { - final int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().setSafe(nextOffset, srcValue); - offsets.getMutator().setSafe(index+1, nextOffset+1); - } - - - - public void setSafe(int index, Repeated${minor.class}Holder h) { - final ${minor.class}Holder ih = new ${minor.class}Holder(); - final ${minor.class}Vector.Accessor hVectorAccessor = h.vector.getAccessor(); - mutator.startNewValue(index); - for(int i = h.start; i < h.end; i++){ - hVectorAccessor.get(i, ih); - mutator.addSafe(index, ih); - } - } - - public void addSafe(int index, ${minor.class}Holder holder) { - int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().setSafe(nextOffset, holder); - offsets.getMutator().setSafe(index+1, nextOffset+1); - } - - public void addSafe(int index, Nullable${minor.class}Holder holder) { - final int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().setSafe(nextOffset, holder); - offsets.getMutator().setSafe(index+1, nextOffset+1); - } - - <#if (fields?size > 1) && !(minor.class == "Decimal9" || minor.class == "Decimal18" || minor.class == "Decimal28Sparse" || minor.class == "Decimal38Sparse" || minor.class == "Decimal28Dense" || minor.class == "Decimal38Dense")> - public void addSafe(int arrayIndex, <#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { - int nextOffset = offsets.getAccessor().get(arrayIndex+1); - values.getMutator().setSafe(nextOffset, <#list fields as field>${field.name}<#if field_has_next>, ); - offsets.getMutator().setSafe(arrayIndex+1, nextOffset+1); - } - - - protected void add(int index, ${minor.class}Holder holder) { - int nextOffset = offsets.getAccessor().get(index+1); - values.getMutator().set(nextOffset, holder); - offsets.getMutator().set(index+1, nextOffset+1); - } - - public void add(int index, Repeated${minor.class}Holder holder) { - - ${minor.class}Vector.Accessor accessor = holder.vector.getAccessor(); - ${minor.class}Holder innerHolder = new ${minor.class}Holder(); - - for(int i = holder.start; i < holder.end; i++) { - accessor.get(i, innerHolder); - add(index, innerHolder); - } - } - - @Override - public void generateTestData(final int valCount) { - final int[] sizes = {1, 2, 0, 6}; - int size = 0; - int runningOffset = 0; - final UInt4Vector.Mutator offsetsMutator = offsets.getMutator(); - for(int i = 1; i < valCount + 1; i++, size++) { - runningOffset += sizes[size % sizes.length]; - offsetsMutator.set(i, runningOffset); - } - values.getMutator().generateTestData(valCount * 9); - setValueCount(size); - } - - @Override - public void reset() { - } - } -} - - diff --git 
a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 9a6b08fc561..49d57e716bc 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -43,7 +43,6 @@ public class UnionListWriter extends AbstractFieldWriter { private int lastIndex = 0; public UnionListWriter(ListVector vector) { - super(null); this.vector = vector; this.writer = new PromotableWriter(vector.getDataVector(), vector); this.offsets = vector.getOffsetVector(); @@ -64,10 +63,14 @@ public void clear() { } @Override - public MaterializedField getField() { + public Field getField() { return null; } + public void setValueCount(int count) { + vector.getMutator().setValueCount(count); + } + @Override public int getValueCapacity() { return vector.getValueCapacity(); @@ -78,6 +81,12 @@ public void close() throws Exception { } + @Override + public void setPosition(int index) { + super.setPosition(index); + startList(); + } + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> <#assign uncappedName = name?uncap_first/> @@ -91,7 +100,7 @@ public void close() throws Exception { @Override public ${name}Writer <#if uncappedName == "int">integer<#else>${uncappedName}(String name) { - assert inMap; +// assert inMap; mapName = name; final int nextOffset = offsets.getAccessor().get(idx() + 1); vector.getMutator().setNotNull(idx()); @@ -146,7 +155,7 @@ public void endList() { @Override public void start() { - assert inMap; +// assert inMap; final int nextOffset = offsets.getAccessor().get(idx() + 1); vector.getMutator().setNotNull(idx()); offsets.getMutator().setSafe(idx() + 1, nextOffset); @@ -155,11 +164,11 @@ public void start() { @Override public void end() { - if (inMap) { +// if (inMap) { inMap = false; final int nextOffset = offsets.getAccessor().get(idx() + 1); offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); - } +// } } <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> @@ -170,7 +179,7 @@ public void end() { @Override public void write${name}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { - assert !inMap; +// assert !inMap; final int nextOffset = offsets.getAccessor().get(idx() + 1); vector.getMutator().setNotNull(idx()); writer.setPosition(nextOffset); diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 44c3e55dcc6..7351ae3776f 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -17,6 +17,8 @@ */ +import org.apache.arrow.vector.types.Types.MinorType; + <@pp.dropOutputFile /> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/UnionReader.java" /> @@ -37,18 +39,18 @@ public UnionReader(UnionVector data) { this.data = data; } - private static MajorType[] TYPES = new MajorType[43]; + public MinorType getMinorType() { + return TYPES[data.getTypeValue(idx())]; + } + + private static MinorType[] TYPES = new MinorType[43]; static { for (MinorType minorType : MinorType.values()) { - TYPES[minorType.ordinal()] = new MajorType(minorType, DataMode.OPTIONAL); + TYPES[minorType.ordinal()] = minorType; } } - public MajorType getType() { - return TYPES[data.getTypeValue(idx())]; - } - public boolean isSet(){ return !data.getAccessor().isNull(idx()); } @@ 
-69,7 +71,7 @@ private FieldReader getReaderForIndex(int index) { return reader; } switch (MinorType.values()[typeValue]) { - case LATE: + case NULL: return NullReader.INSTANCE; case MAP: return (FieldReader) getMap(); @@ -119,9 +121,9 @@ public void copyAsValue(UnionWriter writer) { writer.data.copyFrom(idx(), writer.idx(), data); } - <#list ["Object", "BigDecimal", "Integer", "Long", "Boolean", - "Character", "DateTime", "Period", "Double", "Float", - "Text", "String", "Byte", "Short", "byte[]"] as friendlyType> + <#list ["Object", "Integer", "Long", "Boolean", + "Character", "DateTime", "Double", "Float", + "Text", "Byte", "Short", "byte[]"] as friendlyType> <#assign safeType=friendlyType /> <#if safeType=="byte[]"><#assign safeType="ByteArray" /> @@ -141,11 +143,11 @@ public void copyAsValue(UnionWriter writer) { <#if safeType=="byte[]"><#assign safeType="ByteArray" /> <#if !minor.class?starts_with("Decimal")> - private Nullable${name}ReaderImpl ${uncappedName}Reader; + private ${name}ReaderImpl ${uncappedName}Reader; - private Nullable${name}ReaderImpl get${name}() { + private ${name}ReaderImpl get${name}() { if (${uncappedName}Reader == null) { - ${uncappedName}Reader = new Nullable${name}ReaderImpl(data.get${name}Vector()); + ${uncappedName}Reader = new ${name}ReaderImpl(data.get${name}Vector()); ${uncappedName}Reader.setPosition(idx()); readers[MinorType.${name?upper_case}.ordinal()] = ${uncappedName}Reader; } diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 0f089b7e915..e2f19f4b33b 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -16,6 +16,16 @@ * limitations under the License. */ +import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.flatbuf.Field; +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.flatbuf.Union; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +import java.util.ArrayList; +import java.util.List; + <@pp.dropOutputFile /> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/UnionVector.java" /> @@ -29,7 +39,6 @@ import java.util.Iterator; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.util.BasicTypeHelper; /* * This class is generated using freemarker and the ${.template_name} template. 
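As a reading aid for the union rework: each slot of a UnionVector carries a one-byte type id in a UInt1Vector, and both UnionReader (above) and the UnionVector accessor (below) dispatch on MinorType.values()[typeId]. A minimal sketch of what the generated dispatch roughly expands to for one slot (editorial illustration only, not part of the patch; getBigIntVector() follows the get${name}Vector() pattern in the template, and the default branch is assumed):

    // Per-slot dispatch: typeVector holds one type byte per value.
    public Object getObject(int index) {
      int typeId = typeVector.getAccessor().get(index);
      switch (MinorType.values()[typeId]) {
        case NULL:   // was LATE before this patch
          return null;
        case MAP:
          return getMap().getAccessor().getObject(index);
        case BIGINT:
          return getBigIntVector().getAccessor().getObject(index);
        // ... one case per minor type held in the internal map
        default:
          throw new UnsupportedOperationException("Cannot support type: " + typeId);
      }
    }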
@@ -47,34 +56,30 @@ */ public class UnionVector implements ValueVector { - private MaterializedField field; + private String name; private BufferAllocator allocator; private Accessor accessor = new Accessor(); private Mutator mutator = new Mutator(); int valueCount; MapVector internalMap; - private UInt1Vector typeVector; + UInt1Vector typeVector; private MapVector mapVector; private ListVector listVector; private FieldReader reader; - private NullableBitVector bit; private int singleType = 0; private ValueVector singleVector; - private MajorType majorType; private final CallBack callBack; - public UnionVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { - this.field = field.clone(); + public UnionVector(String name, BufferAllocator allocator, CallBack callBack) { + this.name = name; this.allocator = allocator; this.internalMap = new MapVector("internal", allocator, callBack); - this.typeVector = internalMap.addOrGet("types", new MajorType(MinorType.UINT1, DataMode.REQUIRED), UInt1Vector.class); - this.field.addChild(internalMap.getField().clone()); - this.majorType = field.getType(); + this.typeVector = new UInt1Vector("types", allocator); this.callBack = callBack; } @@ -82,34 +87,20 @@ public BufferAllocator getAllocator() { return allocator; } - public List getSubTypes() { - return majorType.getSubTypes(); - } - - public void addSubType(MinorType type) { - if (majorType.getSubTypes().contains(type)) { - return; - } - List subTypes = this.majorType.getSubTypes(); - List newSubTypes = new ArrayList<>(subTypes); - newSubTypes.add(type); - majorType = new MajorType(this.majorType.getMinorType(), this.majorType.getMode(), this.majorType.getPrecision(), - this.majorType.getScale(), this.majorType.getTimezone(), newSubTypes); - field = MaterializedField.create(field.getName(), majorType); - if (callBack != null) { - callBack.doWork(); - } + @Override + public MinorType getMinorType() { + return MinorType.UNION; } - private static final MajorType MAP_TYPE = new MajorType(MinorType.MAP, DataMode.OPTIONAL); - public MapVector getMap() { if (mapVector == null) { int vectorCount = internalMap.size(); - mapVector = internalMap.addOrGet("map", MAP_TYPE, MapVector.class); - addSubType(MinorType.MAP); + mapVector = internalMap.addOrGet("map", MinorType.MAP, MapVector.class); if (internalMap.size() > vectorCount) { mapVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } } } return mapVector; @@ -121,15 +112,16 @@ public MapVector getMap() { <#if !minor.class?starts_with("Decimal")> private Nullable${name}Vector ${uncappedName}Vector; - private static final MajorType ${name?upper_case}_TYPE = new MajorType(MinorType.${name?upper_case}, DataMode.OPTIONAL); public Nullable${name}Vector get${name}Vector() { if (${uncappedName}Vector == null) { int vectorCount = internalMap.size(); - ${uncappedName}Vector = internalMap.addOrGet("${uncappedName}", ${name?upper_case}_TYPE, Nullable${name}Vector.class); - addSubType(MinorType.${name?upper_case}); + ${uncappedName}Vector = internalMap.addOrGet("${uncappedName}", MinorType.${name?upper_case}, Nullable${name}Vector.class); if (internalMap.size() > vectorCount) { ${uncappedName}Vector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } } } return ${uncappedName}Vector; @@ -139,15 +131,15 @@ public MapVector getMap() { - private static final MajorType LIST_TYPE = new MajorType(MinorType.LIST, DataMode.OPTIONAL); - public ListVector getList() { if (listVector == null) { int vectorCount = 
internalMap.size(); - listVector = internalMap.addOrGet("list", LIST_TYPE, ListVector.class); - addSubType(MinorType.LIST); + listVector = internalMap.addOrGet("list", MinorType.LIST, ListVector.class); if (internalMap.size() > vectorCount) { listVector.allocateNew(); + if (callBack != null) { + callBack.doWork(); + } } } return listVector; @@ -164,6 +156,7 @@ public UInt1Vector getTypeVector() { @Override public void allocateNew() throws OutOfMemoryException { internalMap.allocateNew(); + typeVector.allocateNew(); if (typeVector != null) { typeVector.zeroVector(); } @@ -172,6 +165,7 @@ public void allocateNew() throws OutOfMemoryException { @Override public boolean allocateNewSafe() { boolean safe = internalMap.allocateNewSafe(); + safe = safe && typeVector.allocateNewSafe(); if (safe) { if (typeVector != null) { typeVector.zeroVector(); @@ -196,22 +190,27 @@ public void close() { @Override public void clear() { + typeVector.clear(); internalMap.clear(); } @Override - public MaterializedField getField() { - return field; + public Field getField() { + List childFields = new ArrayList<>(); + for (ValueVector v : internalMap.getChildren()) { + childFields.add(v.getField()); + } + return new Field(name, true, new ArrowType.Union(), childFields); } @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return new TransferImpl(field, allocator); + return new TransferImpl(name, allocator); } @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new TransferImpl(field.withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -219,10 +218,9 @@ public TransferPair makeTransferPair(ValueVector target) { return new TransferImpl((UnionVector) target); } - public void transferTo(UnionVector target) { + public void transferTo(org.apache.arrow.vector.complex.UnionVector target) { internalMap.makeTransferPair(target.internalMap).transfer(); target.valueCount = valueCount; - target.majorType = majorType; } public void copyFrom(int inIndex, int outIndex, UnionVector from) { @@ -236,13 +234,14 @@ public void copyFromSafe(int inIndex, int outIndex, UnionVector from) { } public ValueVector addVector(ValueVector v) { - String name = v.getField().getType().getMinorType().name().toLowerCase(); - MajorType type = v.getField().getType(); + String name = v.getMinorType().name().toLowerCase(); Preconditions.checkState(internalMap.getChild(name) == null, String.format("%s vector already exists", name)); - final ValueVector newVector = internalMap.addOrGet(name, type, (Class) BasicTypeHelper.getValueVectorClass(type.getMinorType(), type.getMode())); + final ValueVector newVector = internalMap.addOrGet(name, v.getMinorType(), v.getClass()); v.makeTransferPair(newVector).transfer(); internalMap.putChild(name, newVector); - addSubType(v.getField().getType().getMinorType()); + if (callBack != null) { + callBack.doWork(); + } return newVector; } @@ -250,8 +249,8 @@ private class TransferImpl implements TransferPair { UnionVector to; - public TransferImpl(MaterializedField field, BufferAllocator allocator) { - to = new UnionVector(field, allocator, null); + public TransferImpl(String name, BufferAllocator allocator) { + to = new UnionVector(name, allocator, null); } public TransferImpl(UnionVector to) { @@ -357,7 +356,7 @@ public class Accessor extends BaseValueVector.BaseAccessor { public Object getObject(int index) { int type = typeVector.getAccessor().get(index); switch (MinorType.values()[type]) { - case LATE: + case NULL: return 
null; <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> @@ -421,7 +420,7 @@ public void setSafe(int index, UnionHolder holder) { writer = new UnionWriter(UnionVector.this); } writer.setPosition(index); - MinorType type = reader.getType().getMinorType(); + MinorType type = reader.getMinorType(); switch (type) { <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> @@ -460,7 +459,7 @@ public void setSafe(int index, Nullable${name}Holder holder) { public void setType(int index, MinorType type) { - typeVector.getMutator().setSafe(index, type.ordinal()); + typeVector.getMutator().setSafe(index, (byte) type.ordinal()); } @Override diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java index c9c29e0dd5f..1137e2cb020 100644 --- a/java/vector/src/main/codegen/templates/UnionWriter.java +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -37,17 +37,7 @@ public class UnionWriter extends AbstractFieldWriter implements FieldWriter { private UnionListWriter listWriter; private List writers = Lists.newArrayList(); - public UnionWriter(BufferAllocator allocator) { - super(null); - } - public UnionWriter(UnionVector vector) { - super(null); - data = vector; - } - - public UnionWriter(UnionVector vector, FieldWriter parent) { - super(null); data = vector; } @@ -84,7 +74,7 @@ public void endList() { private MapWriter getMapWriter() { if (mapWriter == null) { - mapWriter = new SingleMapWriter(data.getMap(), null, true); + mapWriter = new SingleMapWriter(data.getMap()); mapWriter.setPosition(idx()); writers.add(mapWriter); } @@ -120,7 +110,7 @@ public ListWriter asList() { private ${name}Writer get${name}Writer() { if (${uncappedName}Writer == null) { - ${uncappedName}Writer = new Nullable${name}WriterImpl(data.get${name}Vector(), null); + ${uncappedName}Writer = new ${name}WriterImpl(data.get${name}Vector()); ${uncappedName}Writer.setPosition(idx()); writers.add(${uncappedName}Writer); } @@ -217,7 +207,7 @@ public void close() throws Exception { } @Override - public MaterializedField getField() { + public Field getField() { return data.getField(); } diff --git a/java/vector/src/main/codegen/templates/ValueHolders.java b/java/vector/src/main/codegen/templates/ValueHolders.java index 2b14194574a..d744c523265 100644 --- a/java/vector/src/main/codegen/templates/ValueHolders.java +++ b/java/vector/src/main/codegen/templates/ValueHolders.java @@ -31,10 +31,6 @@ public final class ${className} implements ValueHolder{ - public static final MajorType TYPE = new MajorType(MinorType.${minor.class?upper_case}, DataMode.${mode.name?upper_case}); - - public MajorType getType() {return TYPE;} - <#if mode.name == "Repeated"> /** The first index (inclusive) into the Vector. 
**/ @@ -49,48 +45,13 @@ public final class ${className} implements ValueHolder{ <#else> public static final int WIDTH = ${type.width}; - <#if mode.name == "Optional">public int isSet; + <#if mode.name == "Optional">public int isSet; + <#else>public final int isSet = 1; <#assign fields = minor.fields!type.fields /> <#list fields as field> public ${field.type} ${field.name}; - <#if minor.class.startsWith("Decimal")> - public static final int maxPrecision = ${minor.maxPrecisionDigits}; - <#if minor.class.startsWith("Decimal28") || minor.class.startsWith("Decimal38")> - public static final int nDecimalDigits = ${minor.nDecimalDigits}; - - public static int getInteger(int index, int start, ArrowBuf buffer) { - int value = buffer.getInt(start + (index * 4)); - - if (index == 0) { - /* the first byte contains sign bit, return value without it */ - <#if minor.class.endsWith("Sparse")> - value = (value & 0x7FFFFFFF); - <#elseif minor.class.endsWith("Dense")> - value = (value & 0x0000007F); - - } - return value; - } - - public static void setInteger(int index, int value, int start, ArrowBuf buffer) { - buffer.setInt(start + (index * 4), value); - } - - public static void setSign(boolean sign, int start, ArrowBuf buffer) { - // Set MSB to 1 if sign is negative - if (sign == true) { - int value = getInteger(0, start, buffer); - setInteger(0, (value | 0x80000000), start, buffer); - } - } - - public static boolean getSign(int start, ArrowBuf buffer) { - return ((buffer.getInt(start) & 0x80000000) != 0); - } - - @Deprecated public int hashCode(){ throw new UnsupportedOperationException(); diff --git a/java/vector/src/main/codegen/templates/VariableLengthVectors.java b/java/vector/src/main/codegen/templates/VariableLengthVectors.java index 84fb3eb5567..bcd639ab8c3 100644 --- a/java/vector/src/main/codegen/templates/VariableLengthVectors.java +++ b/java/vector/src/main/codegen/templates/VariableLengthVectors.java @@ -56,9 +56,7 @@ public final class ${minor.class}Vector extends BaseDataValueVector implements V private static final int MIN_BYTE_COUNT = 4096; public final static String OFFSETS_VECTOR_NAME = "$offsets$"; - private final MaterializedField offsetsField = MaterializedField.create(OFFSETS_VECTOR_NAME, new MajorType(MinorType.UINT4, DataMode.REQUIRED)); - final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(offsetsField, allocator); - private final FieldReader reader = new ${minor.class}ReaderImpl(${minor.class}Vector.this); + final UInt${type.width}Vector offsetVector = new UInt${type.width}Vector(OFFSETS_VECTOR_NAME, allocator); private final Accessor accessor; private final Mutator mutator; @@ -68,16 +66,42 @@ public final class ${minor.class}Vector extends BaseDataValueVector implements V private int allocationSizeInBytes = INITIAL_BYTE_COUNT; private int allocationMonitor = 0; - public ${minor.class}Vector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); + <#if minor.class == "Decimal"> + + private final int precision; + private final int scale; + + public ${minor.class}Vector(String name, BufferAllocator allocator, int precision, int scale) { + super(name, allocator); + this.oAccessor = offsetVector.getAccessor(); + this.accessor = new Accessor(); + this.mutator = new Mutator(); + this.precision = precision; + this.scale = scale; + } + <#else> + + public ${minor.class}Vector(String name, BufferAllocator allocator) { + super(name, allocator); this.oAccessor = offsetVector.getAccessor(); this.accessor = new Accessor(); this.mutator = new 
Mutator(); } + + + @Override + public Field getField() { + throw new UnsupportedOperationException("internal vector"); + } + + @Override + public MinorType getMinorType() { + return MinorType.${minor.class?upper_case}; + } @Override public FieldReader getReader(){ - return reader; + throw new UnsupportedOperationException("internal vector"); } @Override @@ -125,27 +149,6 @@ public int getVarByteLength(){ return offsetVector.getAccessor().get(valueCount); } -// @Override -// public SerializedField getMetadata() { -// return getMetadataBuilder() // -// .addChild(offsetVector.getMetadata()) -// .setValueCount(getAccessor().getValueCount()) // -// .setBufferLength(getBufferSize()) // -// .build(); -// } -// -// @Override -// public void load(SerializedField metadata, ArrowBuf buffer) { -// the bits vector is the first child (the order in which the children are added in getMetadataBuilder is significant) -// final SerializedField offsetField = metadata.getChild(0); -// offsetVector.load(offsetField, buffer); -// -// final int capacity = buffer.capacity(); -// final int offsetsLength = offsetField.getBufferLength(); -// data = buffer.slice(offsetsLength, capacity - offsetsLength); -// data.retain(); -// } - @Override public void clear() { super.clear(); @@ -175,12 +178,12 @@ public long getOffsetAddr(){ @Override public TransferPair getTransferPair(BufferAllocator allocator){ - return new TransferImpl(getField(), allocator); + return new TransferImpl(name, allocator); } @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator){ - return new TransferImpl(getField().withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -241,8 +244,12 @@ public boolean copyFromSafe(int fromIndex, int thisIndex, ${minor.class}Vector f private class TransferImpl implements TransferPair{ ${minor.class}Vector to; - public TransferImpl(MaterializedField field, BufferAllocator allocator){ - to = new ${minor.class}Vector(field, allocator); + public TransferImpl(String name, BufferAllocator allocator){ + <#if minor.class == "Decimal"> + to = new ${minor.class}Vector(name, allocator, precision, scale); + <#else> + to = new ${minor.class}Vector(name, allocator); + } public TransferImpl(${minor.class}Vector to){ @@ -426,10 +433,10 @@ public void get(int index, Nullable${minor.class}Holder holder){ return text; } <#break> - <#case "Var16Char"> + <#case "Decimal"> @Override public ${friendlyType} getObject(int index) { - return new String(get(index), Charsets.UTF_16); + return new BigDecimal(new BigInteger(get(index)), scale); } <#break> <#default> diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index b129ea9bcb9..05b7cf10067 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -20,7 +20,6 @@ import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.types.MaterializedField; public abstract class BaseDataValueVector extends BaseValueVector { @@ -29,8 +28,8 @@ public abstract class BaseDataValueVector extends BaseValueVector { protected ArrowBuf data; - public BaseDataValueVector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); + public BaseDataValueVector(String name, BufferAllocator allocator) { + super(name, allocator); data = allocator.getEmpty(); } 
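The Decimal branch above is new: precision and scale become constructor parameters of the vector, and getObject rebuilds the value as new BigDecimal(new BigInteger(get(index)), scale) from the stored unscaled bytes. A minimal sketch of that decode, assuming only the JDK (the byte[] here stands in for what get(index) would return):

import java.math.BigDecimal;
import java.math.BigInteger;

public class DecimalDecodeSketch {
  public static void main(String[] args) {
    int scale = 2;  // fixed per vector at construction in the patched DecimalVector
    // the vector stores only the unscaled two's-complement bytes per value;
    // here we fabricate them from a known unscaled integer
    byte[] unscaled = new BigInteger("12345").toByteArray();
    BigDecimal value = new BigDecimal(new BigInteger(unscaled), scale);
    System.out.println(value);  // prints 123.45
  }
}

Keeping one scale on the vector rather than on each value means only the unscaled integer varies per row.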
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java index 932e6f13caf..884cdf0910b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseValueVector.java @@ -19,8 +19,8 @@ import java.util.Iterator; +import com.google.flatbuffers.FlatBufferBuilder; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.types.MaterializedField; import org.apache.arrow.vector.util.TransferPair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -38,16 +38,16 @@ public abstract class BaseValueVector implements ValueVector { public static final int INITIAL_VALUE_ALLOCATION = 4096; protected final BufferAllocator allocator; - protected final MaterializedField field; + protected final String name; - protected BaseValueVector(MaterializedField field, BufferAllocator allocator) { - this.field = Preconditions.checkNotNull(field, "field cannot be null"); + protected BaseValueVector(String name, BufferAllocator allocator) { this.allocator = Preconditions.checkNotNull(allocator, "allocator cannot be null"); + this.name = name; } @Override public String toString() { - return super.toString() + "[field = " + field + ", ...]"; + return super.toString() + "[name = " + name + ", ...]"; } @Override @@ -60,30 +60,11 @@ public void close() { clear(); } - @Override - public MaterializedField getField() { - return field; - } - - public MaterializedField getField(String ref){ - return getField().withPath(ref); - } - @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return getTransferPair(getField().getPath(), allocator); + return getTransferPair(name, allocator); } -// public static SerializedField getMetadata(BaseValueVector vector) { -// return getMetadataBuilder(vector).build(); -// } -// -// protected static SerializedField.Builder getMetadataBuilder(BaseValueVector vector) { -// return SerializedFieldHelper.getAsBuilder(vector.getField()) -// .setValueCount(vector.getAccessor().getValueCount()) -// .setBufferLength(vector.getBufferSize()); -// } - public abstract static class BaseAccessor implements ValueVector.Accessor { protected BaseAccessor() { } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index c5bcb2decc4..fee6e9cdef7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -21,11 +21,11 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.vector.complex.impl.BitReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.BitHolder; import org.apache.arrow.vector.holders.NullableBitHolder; -import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; @@ -37,7 +37,6 @@ public final class BitVector extends BaseDataValueVector implements FixedWidthVector { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BitVector.class); - private final FieldReader reader = new BitReaderImpl(BitVector.this); private final Accessor 
accessor = new Accessor(); private final Mutator mutator = new Mutator(); @@ -45,13 +44,23 @@ public final class BitVector extends BaseDataValueVector implements FixedWidthVe private int allocationSizeInBytes = INITIAL_VALUE_ALLOCATION; private int allocationMonitor = 0; - public BitVector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); + public BitVector(String name, BufferAllocator allocator) { + super(name, allocator); + } + + @Override + public Field getField() { + throw new UnsupportedOperationException("internal vector"); + } + + @Override + public MinorType getMinorType() { + return MinorType.BIT; } @Override public FieldReader getReader() { - return reader; + throw new UnsupportedOperationException("internal vector"); } @Override @@ -180,20 +189,6 @@ public boolean copyFromSafe(int inIndex, int outIndex, BitVector from) { return true; } -// @Override -// public void load(SerializedField metadata, DrillBuf buffer) { -// Preconditions.checkArgument(this.field.getPath().equals(metadata.getNamePart().getName()), "The field %s doesn't match the provided metadata %s.", this.field, metadata); -// final int valueCount = metadata.getValueCount(); -// final int expectedLength = getSizeFromCount(valueCount); -// final int actualLength = metadata.getBufferLength(); -// assert expectedLength == actualLength: "expected and actual buffer sizes do not match"; -// -// clear(); -// data = buffer.slice(0, actualLength); -// data.retain(); -// this.valueCount = valueCount; -// } - @Override public Mutator getMutator() { return new Mutator(); @@ -206,12 +201,12 @@ public Accessor getAccessor() { @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return new TransferImpl(getField(), allocator); + return new TransferImpl(name, allocator); } @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new TransferImpl(getField().withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -270,8 +265,8 @@ public void splitAndTransferTo(int startIndex, int length, BitVector target) { private class TransferImpl implements TransferPair { BitVector to; - public TransferImpl(MaterializedField field, BufferAllocator allocator) { - this.to = new BitVector(field, allocator); + public TransferImpl(String name, BufferAllocator allocator) { + this.to = new BitVector(name, allocator); } public TransferImpl(BitVector to) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java deleted file mode 100644 index b806b180e70..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/ObjectVector.java +++ /dev/null @@ -1,220 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.vector; - -import io.netty.buffer.ArrowBuf; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.holders.ObjectHolder; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.util.TransferPair; - -public class ObjectVector extends BaseValueVector { - private final Accessor accessor = new Accessor(); - private final Mutator mutator = new Mutator(); - private int maxCount = 0; - private int count = 0; - private int allocationSize = 4096; - - private List objectArrayList = new ArrayList<>(); - - public ObjectVector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); - } - - public void addNewArray() { - objectArrayList.add(new Object[allocationSize]); - maxCount += allocationSize; - } - - @Override - public FieldReader getReader() { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - public final class Mutator implements ValueVector.Mutator { - - public void set(int index, Object obj) { - int listOffset = index / allocationSize; - if (listOffset >= objectArrayList.size()) { - addNewArray(); - } - objectArrayList.get(listOffset)[index % allocationSize] = obj; - } - - public boolean setSafe(int index, long value) { - set(index, value); - return true; - } - - protected void set(int index, ObjectHolder holder) { - set(index, holder.obj); - } - - public boolean setSafe(int index, ObjectHolder holder){ - set(index, holder); - return true; - } - - @Override - public void setValueCount(int valueCount) { - count = valueCount; - } - - @Override - public void reset() { - count = 0; - maxCount = 0; - objectArrayList = new ArrayList<>(); - addNewArray(); - } - - @Override - public void generateTestData(int values) { - } - } - - @Override - public void setInitialCapacity(int numRecords) { - // NoOp - } - - @Override - public void allocateNew() throws OutOfMemoryException { - addNewArray(); - } - - public void allocateNew(int valueCount) throws OutOfMemoryException { - while (maxCount < valueCount) { - addNewArray(); - } - } - - @Override - public boolean allocateNewSafe() { - allocateNew(); - return true; - } - - @Override - public int getBufferSize() { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - @Override - public int getBufferSizeFor(final int valueCount) { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - @Override - public void close() { - clear(); - } - - @Override - public void clear() { - objectArrayList.clear(); - maxCount = 0; - count = 0; - } - - @Override - public MaterializedField getField() { - return field; - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - @Override - public int getValueCapacity() { - 
return maxCount; - } - - @Override - public Accessor getAccessor() { - return accessor; - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - -// @Override -// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { -// throw new UnsupportedOperationException("ObjectVector does not support this"); -// } -// -// @Override -// public UserBitShared.SerializedField getMetadata() { -// throw new UnsupportedOperationException("ObjectVector does not support this"); -// } - - @Override - public Mutator getMutator() { - return mutator; - } - - @Override - public Iterator iterator() { - throw new UnsupportedOperationException("ObjectVector does not support this"); - } - - public final class Accessor extends BaseAccessor { - @Override - public Object getObject(int index) { - int listOffset = index / allocationSize; - if (listOffset >= objectArrayList.size()) { - addNewArray(); - } - return objectArrayList.get(listOffset)[index % allocationSize]; - } - - @Override - public int getValueCount() { - return count; - } - - public Object get(int index) { - return getObject(index); - } - - public void get(int index, ObjectHolder holder){ - holder.obj = getObject(index); - } - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java deleted file mode 100644 index 61ce285d61b..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueHolderHelper.java +++ /dev/null @@ -1,203 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.arrow.vector; - -import io.netty.buffer.ArrowBuf; - -import java.math.BigDecimal; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.holders.BigIntHolder; -import org.apache.arrow.vector.holders.BitHolder; -import org.apache.arrow.vector.holders.DateHolder; -import org.apache.arrow.vector.holders.Decimal18Holder; -import org.apache.arrow.vector.holders.Decimal28SparseHolder; -import org.apache.arrow.vector.holders.Decimal38SparseHolder; -import org.apache.arrow.vector.holders.Decimal9Holder; -import org.apache.arrow.vector.holders.Float4Holder; -import org.apache.arrow.vector.holders.Float8Holder; -import org.apache.arrow.vector.holders.IntHolder; -import org.apache.arrow.vector.holders.IntervalDayHolder; -import org.apache.arrow.vector.holders.IntervalYearHolder; -import org.apache.arrow.vector.holders.NullableBitHolder; -import org.apache.arrow.vector.holders.TimeHolder; -import org.apache.arrow.vector.holders.TimeStampHolder; -import org.apache.arrow.vector.holders.VarCharHolder; -import org.apache.arrow.vector.util.DecimalUtility; - -import com.google.common.base.Charsets; - - -public class ValueHolderHelper { - static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ValueHolderHelper.class); - - public static IntHolder getIntHolder(int value) { - IntHolder holder = new IntHolder(); - holder.value = value; - - return holder; - } - - public static BigIntHolder getBigIntHolder(long value) { - BigIntHolder holder = new BigIntHolder(); - holder.value = value; - - return holder; - } - - public static Float4Holder getFloat4Holder(float value) { - Float4Holder holder = new Float4Holder(); - holder.value = value; - - return holder; - } - - public static Float8Holder getFloat8Holder(double value) { - Float8Holder holder = new Float8Holder(); - holder.value = value; - - return holder; - } - - public static DateHolder getDateHolder(long value) { - DateHolder holder = new DateHolder(); - holder.value = value; - return holder; - } - - public static TimeHolder getTimeHolder(int value) { - TimeHolder holder = new TimeHolder(); - holder.value = value; - return holder; - } - - public static TimeStampHolder getTimeStampHolder(long value) { - TimeStampHolder holder = new TimeStampHolder(); - holder.value = value; - return holder; - } - - public static BitHolder getBitHolder(int value) { - BitHolder holder = new BitHolder(); - holder.value = value; - - return holder; - } - - public static NullableBitHolder getNullableBitHolder(boolean isNull, int value) { - NullableBitHolder holder = new NullableBitHolder(); - holder.isSet = isNull? 0 : 1; - if (! 
isNull) { - holder.value = value; - } - - return holder; - } - - public static VarCharHolder getVarCharHolder(ArrowBuf buf, String s){ - VarCharHolder vch = new VarCharHolder(); - - byte[] b = s.getBytes(Charsets.UTF_8); - vch.start = 0; - vch.end = b.length; - vch.buffer = buf.reallocIfNeeded(b.length); - vch.buffer.setBytes(0, b); - return vch; - } - - public static VarCharHolder getVarCharHolder(BufferAllocator a, String s){ - VarCharHolder vch = new VarCharHolder(); - - byte[] b = s.getBytes(Charsets.UTF_8); - vch.start = 0; - vch.end = b.length; - vch.buffer = a.buffer(b.length); // - vch.buffer.setBytes(0, b); - return vch; - } - - - public static IntervalYearHolder getIntervalYearHolder(int intervalYear) { - IntervalYearHolder holder = new IntervalYearHolder(); - - holder.value = intervalYear; - return holder; - } - - public static IntervalDayHolder getIntervalDayHolder(int days, int millis) { - IntervalDayHolder dch = new IntervalDayHolder(); - - dch.days = days; - dch.milliseconds = millis; - return dch; - } - - public static Decimal9Holder getDecimal9Holder(int decimal, int scale, int precision) { - Decimal9Holder dch = new Decimal9Holder(); - - dch.scale = scale; - dch.precision = precision; - dch.value = decimal; - - return dch; - } - - public static Decimal18Holder getDecimal18Holder(long decimal, int scale, int precision) { - Decimal18Holder dch = new Decimal18Holder(); - - dch.scale = scale; - dch.precision = precision; - dch.value = decimal; - - return dch; - } - - public static Decimal28SparseHolder getDecimal28Holder(ArrowBuf buf, String decimal) { - - Decimal28SparseHolder dch = new Decimal28SparseHolder(); - - BigDecimal bigDecimal = new BigDecimal(decimal); - - dch.scale = bigDecimal.scale(); - dch.precision = bigDecimal.precision(); - Decimal28SparseHolder.setSign(bigDecimal.signum() == -1, dch.start, dch.buffer); - dch.start = 0; - dch.buffer = buf.reallocIfNeeded(5 * DecimalUtility.INTEGER_SIZE); - DecimalUtility - .getSparseFromBigDecimal(bigDecimal, dch.buffer, dch.start, dch.scale, dch.precision, dch.nDecimalDigits); - - return dch; - } - - public static Decimal38SparseHolder getDecimal38Holder(ArrowBuf buf, String decimal) { - - Decimal38SparseHolder dch = new Decimal38SparseHolder(); - - BigDecimal bigDecimal = new BigDecimal(decimal); - - dch.scale = bigDecimal.scale(); - dch.precision = bigDecimal.precision(); - Decimal38SparseHolder.setSign(bigDecimal.signum() == -1, dch.start, dch.buffer); - dch.start = 0; - dch.buffer = buf.reallocIfNeeded(dch.maxPrecision * DecimalUtility.INTEGER_SIZE); - DecimalUtility - .getSparseFromBigDecimal(bigDecimal, dch.buffer, dch.start, dch.scale, dch.precision, dch.nDecimalDigits); - - return dch; - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index a170c59abd7..35321c947db 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -24,8 +24,9 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.types.pojo.Field; /** * An abstraction that is used to store a sequence of values in an individual column. 
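The ValueVector hunks that follow retarget the interface's metadata surface: getField() now returns the flatbuf-backed pojo Field, and a new getMinorType() accessor takes over the role of the old MajorType/MaterializedField pair. A minimal sketch of the resulting shape, using assumed stand-in types rather than the real Arrow classes:

enum SketchMinorType { INT, VARCHAR, UNION, NULL }   // stand-in for Types.MinorType

final class SketchField {                            // stand-in for types.pojo.Field
  final String name;
  final boolean nullable;
  SketchField(String name, boolean nullable) { this.name = name; this.nullable = nullable; }
}

interface SketchValueVector {
  SketchField getField();          // pojo Field in place of MaterializedField
  SketchMinorType getMinorType();  // the new accessor this patch introduces
}

class MetadataSketch implements SketchValueVector {
  public SketchField getField() { return new SketchField("amount", true); }
  public SketchMinorType getMinorType() { return SketchMinorType.INT; }

  public static void main(String[] args) {
    SketchValueVector v = new MetadataSketch();
    System.out.println(v.getField().name + ": " + v.getMinorType());
  }
}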
@@ -33,8 +34,7 @@ * A {@link ValueVector value vector} stores underlying data in-memory in a columnar fashion that is compact and * efficient. The column whose data is stored, is referred by {@link #getField()}. * - * A vector when instantiated, relies on a {@link org.apache.drill.exec.record.DeadBuf dead buffer}. It is important - * that vector is allocated before attempting to read or write. + * It is important that a vector is allocated before attempting to read or write. * * There are a few "rules" around vectors: * @@ -94,7 +94,9 @@ public interface ValueVector extends Closeable, Iterable { /** * Get information about how this field is materialized. */ - MaterializedField getField(); + Field getField(); + + MinorType getMinorType(); /** * Returns a {@link org.apache.arrow.vector.util.TransferPair transfer pair}, creating a new target vector of diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java deleted file mode 100644 index fdad99a3332..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorDescriptor.java +++ /dev/null @@ -1,83 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- */ -package org.apache.arrow.vector; - -import java.util.Collection; - -import com.google.common.base.Preconditions; - -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.MajorType; - -public class VectorDescriptor { - private static final String DEFAULT_NAME = "NONE"; - - private final MaterializedField field; - - public VectorDescriptor(final MajorType type) { - this(DEFAULT_NAME, type); - } - - public VectorDescriptor(final String name, final MajorType type) { - this(MaterializedField.create(name, type)); - } - - public VectorDescriptor(final MaterializedField field) { - this.field = Preconditions.checkNotNull(field, "field cannot be null"); - } - - public MaterializedField getField() { - return field; - } - - public MajorType getType() { - return field.getType(); - } - - public String getName() { - return field.getLastName(); - } - - public Collection getChildren() { - return field.getChildren(); - } - - public boolean hasName() { - return getName() != DEFAULT_NAME; - } - - public VectorDescriptor withName(final String name) { - return new VectorDescriptor(field.withPath(name)); - } - - public VectorDescriptor withType(final MajorType type) { - return new VectorDescriptor(field.withType(type)); - } - - public static VectorDescriptor create(final String name, final MajorType type) { - return new VectorDescriptor(name, type); - } - - public static VectorDescriptor create(final MajorType type) { - return new VectorDescriptor(type); - } - - public static VectorDescriptor create(final MaterializedField field) { - return new VectorDescriptor(field); - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index c94e8d1db09..705a24b02fe 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -17,18 +17,20 @@ */ package org.apache.arrow.vector; +import com.google.flatbuffers.FlatBufferBuilder; import io.netty.buffer.ArrowBuf; import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.impl.NullReader; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.TransferPair; import com.google.common.collect.Iterators; @@ -36,7 +38,7 @@ public class ZeroVector implements ValueVector { public final static ZeroVector INSTANCE = new ZeroVector(); - private final MaterializedField field = MaterializedField.create("[DEFAULT]", Types.required(MinorType.LATE)); + private final String name = "[DEFAULT]"; private final TransferPair defaultPair = new TransferPair() { @Override @@ -91,24 +93,21 @@ public void close() { } public void clear() { } @Override - public MaterializedField getField() { - return field; + public Field getField() { + return new Field(name, true, new Null(), null); } + @Override + public MinorType getMinorType() { + return MinorType.NULL; + } + + @Override public TransferPair getTransferPair(BufferAllocator allocator) { return defaultPair; } -// @Override -// public 
UserBitShared.SerializedField getMetadata() { -// return getField() -// .getAsBuilder() -// .setBufferLength(getBufferSize()) -// .setValueCount(getAccessor().getValueCount()) -// .build(); -// } - @Override public Iterator iterator() { return Collections.emptyIterator(); @@ -176,7 +175,4 @@ public Mutator getMutator() { public FieldReader getReader() { return NullReader.INSTANCE; } - -// @Override -// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index 9fae2382ecb..ed7797576d6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -21,12 +21,10 @@ import javax.annotation.Nullable; +import org.apache.arrow.flatbuf.Field; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.CallBack; @@ -43,12 +41,12 @@ public abstract class AbstractContainerVector implements ValueVector { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); - protected MaterializedField field; + protected final String name; protected final BufferAllocator allocator; protected final CallBack callBack; - protected AbstractContainerVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { - this.field = Preconditions.checkNotNull(field); + protected AbstractContainerVector(String name, BufferAllocator allocator, CallBack callBack) { + this.name = name; this.allocator = allocator; this.callBack = callBack; } @@ -64,14 +62,6 @@ public BufferAllocator getAllocator() { return allocator; } - /** - * Returns the field definition of this instance. - */ - @Override - public MaterializedField getField() { - return field; - } - /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. */ @@ -79,19 +69,6 @@ public ValueVector getChild(String name) { return getChild(name, ValueVector.class); } - /** - * Returns a sequence of field names in the order that they show up in the schema. - */ - protected Collection getChildFieldNames() { - return Sets.newLinkedHashSet(Iterables.transform(field.getChildren(), new Function() { - @Nullable - @Override - public String apply(MaterializedField field) { - return Preconditions.checkNotNull(field).getLastName(); - } - })); - } - /** * Clears out all underlying child vectors. */ @@ -109,22 +86,6 @@ protected T typeify(ValueVector v, Class clazz) { throw new IllegalStateException(String.format("Vector requested [%s] was different than type stored [%s]. Arrow doesn't yet support hetergenous types.", clazz.getSimpleName(), v.getClass().getSimpleName())); } - MajorType getLastPathType() { - if((this.getField().getType().getMinorType() == MinorType.LIST && - this.getField().getType().getMode() == DataMode.REPEATED)) { // Use Repeated scalar type instead of Required List. - VectorWithOrdinal vord = getChildVectorWithOrdinal(null); - ValueVector v = vord.vector; - if (! 
(v instanceof AbstractContainerVector)) { - return v.getField().getType(); - } - } else if (this.getField().getType().getMinorType() == MinorType.MAP && - this.getField().getType().getMode() == DataMode.REPEATED) { // Use Required Map - return new MajorType(MinorType.MAP, DataMode.REQUIRED); - } - - return this.getField().getType(); - } - protected boolean supportsDirectRead() { return false; } @@ -133,7 +94,7 @@ protected boolean supportsDirectRead() { public abstract int size(); // add a new vector with the input MajorType or return the existing vector if we already added one with the same type - public abstract T addOrGet(String name, MajorType type, Class clazz); + public abstract T addOrGet(String name, MinorType minorType, Class clazz, int... precisionScale); // return the child vector with the input name public abstract T getChild(String name, Class clazz); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index de6ae829b47..5964f800791 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -17,17 +17,17 @@ */ package org.apache.arrow.vector.complex; +import com.google.common.collect.ImmutableList; import io.netty.buffer.ArrowBuf; -import java.util.Collection; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import org.apache.arrow.flatbuf.Field; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.MapWithOrdinal; @@ -43,17 +43,8 @@ public abstract class AbstractMapVector extends AbstractContainerVector { // Maintains a map with key as field name and value is the vector itself private final MapWithOrdinal vectors = new MapWithOrdinal<>(); - protected AbstractMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { - super(field.clone(), allocator, callBack); - MaterializedField clonedField = field.clone(); - // create the hierarchy of the child vectors based on the materialized field - for (MaterializedField child : clonedField.getChildren()) { - if (!child.equals(BaseRepeatedValueVector.OFFSETS_FIELD)) { - final String fieldName = child.getLastName(); - final ValueVector v = BasicTypeHelper.getNewVector(child, allocator, callBack); - putVector(fieldName, v); - } - } + protected AbstractMapVector(String name, BufferAllocator allocator, CallBack callBack) { + super(name, allocator, callBack); } @Override @@ -109,8 +100,8 @@ public boolean allocateNewSafe() { * * * - * @param name name of the field - * @param type type of the field + * @param name the name of the field + * @param minorType the minorType for the vector * @param clazz class of expected vector type * @param class type of expected vector type * @throws java.lang.IllegalStateException raised if there is a hard schema change @@ -118,7 +109,7 @@ public boolean allocateNewSafe() { * @return resultant {@link org.apache.arrow.vector.ValueVector} */ @Override - public T addOrGet(String name, MajorType type, Class clazz) { + public T addOrGet(String name, MinorType minorType, Class 
clazz, int... precisionScale) { final ValueVector existing = getChild(name); boolean create = false; if (existing == null) { @@ -130,7 +121,7 @@ public T addOrGet(String name, MajorType type, Class create = true; } if (create) { - final T vector = (T) BasicTypeHelper.getNewVector(name, allocator, type, callBack); + final T vector = (T) minorType.getNewVector(name, allocator, callBack, precisionScale); putChild(name, vector); if (callBack!=null) { callBack.doWork(); @@ -177,7 +168,6 @@ public T getChild(String name, Class clazz) { */ protected void putChild(String name, ValueVector vector) { putVector(name, vector); - field.addChild(vector.getField()); } /** @@ -199,8 +189,21 @@ protected void putVector(String name, ValueVector vector) { /** * Returns a sequence of underlying child vectors. */ - protected Collection getChildren() { - return vectors.values(); + protected List getChildren() { + int size = vectors.size(); + List children = new ArrayList<>(); + for (int i = 0; i < size; i++) { + children.add(vectors.getByOrdinal(i)); + } + return children; + } + + protected List getChildFieldNames() { + ImmutableList.Builder builder = ImmutableList.builder(); + for (ValueVector child : getChildren()) { + builder.add(child.getField().getName()); + } + return builder.build(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 6518897fb78..42262741df9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -22,22 +22,18 @@ import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.AddOrGetResult; import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; import org.apache.arrow.vector.ZeroVector; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.BasicTypeHelper; -import org.apache.arrow.vector.util.SchemaChangeRuntimeException; import com.google.common.base.Preconditions; import com.google.common.collect.ObjectArrays; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; public abstract class BaseRepeatedValueVector extends BaseValueVector implements RepeatedValueVector { @@ -45,19 +41,16 @@ public abstract class BaseRepeatedValueVector extends BaseValueVector implements public final static String OFFSETS_VECTOR_NAME = "$offsets$"; public final static String DATA_VECTOR_NAME = "$data$"; - public final static MaterializedField OFFSETS_FIELD = - MaterializedField.create(OFFSETS_VECTOR_NAME, new MajorType(MinorType.UINT4, DataMode.REQUIRED)); - protected final UInt4Vector offsets; protected ValueVector vector; - protected BaseRepeatedValueVector(MaterializedField field, BufferAllocator allocator) { - this(field, allocator, DEFAULT_DATA_VECTOR); + protected BaseRepeatedValueVector(String name, BufferAllocator allocator) { + this(name, allocator, DEFAULT_DATA_VECTOR); } - protected 
BaseRepeatedValueVector(MaterializedField field, BufferAllocator allocator, ValueVector vector) { - super(field, allocator); - this.offsets = new UInt4Vector(OFFSETS_FIELD, allocator); + protected BaseRepeatedValueVector(String name, BufferAllocator allocator, ValueVector vector) { + super(name, allocator); + this.offsets = new UInt4Vector(OFFSETS_VECTOR_NAME, allocator); this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); } @@ -109,13 +102,6 @@ public int getValueCapacity() { return Math.min(vector.getValueCapacity(), offsetValueCapacity); } -// @Override -// protected UserBitShared.SerializedField.Builder getMetadataBuilder() { -// return super.getMetadataBuilder() -// .addChild(offsets.getMetadata()) -// .addChild(vector.getMetadata()); -// } - @Override public int getBufferSize() { if (getAccessor().getValueCount() == 0) { @@ -157,47 +143,24 @@ public ArrowBuf[] getBuffers(boolean clear) { return buffers; } -// @Override -// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { -// final UserBitShared.SerializedField offsetMetadata = metadata.getChild(0); -// offsets.load(offsetMetadata, buffer); -// -// final UserBitShared.SerializedField vectorMetadata = metadata.getChild(1); -// if (getDataVector() == DEFAULT_DATA_VECTOR) { -// addOrGetVector(VectorDescriptor.create(vectorMetadata.getMajorType())); -// } -// -// final int offsetLength = offsetMetadata.getBufferLength(); -// final int vectorLength = vectorMetadata.getBufferLength(); -// vector.load(vectorMetadata, buffer.slice(offsetLength, vectorLength)); -// } - /** * Returns 1 if inner vector is explicitly set via #addOrGetVector else 0 - * - * @see {@link ContainerVectorLike#size} */ - @Override public int size() { return vector == DEFAULT_DATA_VECTOR ? 0:1; } - @Override - public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { + public AddOrGetResult addOrGetVector(MinorType minorType) { boolean created = false; - if (vector == DEFAULT_DATA_VECTOR && descriptor.getType().getMinorType() != MinorType.LATE) { - final MaterializedField field = descriptor.withName(DATA_VECTOR_NAME).getField(); - vector = BasicTypeHelper.getNewVector(field, allocator); + if (vector instanceof ZeroVector) { + vector = minorType.getNewVector(DATA_VECTOR_NAME, allocator, null); // returned vector must have the same field - assert field.equals(vector.getField()); - getField().addChild(field); created = true; } - final MajorType actual = vector.getField().getType(); - if (!actual.equals(descriptor.getType())) { + if (vector.getField().getType().getTypeType() != minorType.getType().getTypeType()) { final String msg = String.format("Inner vector type mismatch. Requested type: [%s], actual type: [%s]", - descriptor.getType(), actual); + Type.name(minorType.getType().getTypeType()), Type.name(vector.getField().getType().getTypeType())); throw new SchemaChangeRuntimeException(msg); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java deleted file mode 100644 index 655b55a6aa2..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ContainerVectorLike.java +++ /dev/null @@ -1,43 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.vector.complex; - -import org.apache.arrow.vector.AddOrGetResult; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; - -/** - * A mix-in used for introducing container vector-like behaviour. - */ -public interface ContainerVectorLike { - - /** - * Creates and adds a child vector if none with the same name exists, else returns the vector instance. - * - * @param descriptor vector descriptor - * @return result of operation wrapping vector corresponding to the given descriptor and whether it's newly created - * @throws org.apache.arrow.vector.util.SchemaChangeRuntimeException - * if schema change is not permissible between the given and existing data vector types. - */ - AddOrGetResult addOrGetVector(VectorDescriptor descriptor); - - /** - * Returns the number of child vectors in this container vector-like instance. - */ - int size(); -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 3e60c768023..c6c6b090db6 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -18,6 +18,8 @@ ******************************************************************************/ package org.apache.arrow.vector.complex; +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; import io.netty.buffer.ArrowBuf; import java.util.List; @@ -28,17 +30,14 @@ import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringArrayList; import org.apache.arrow.vector.util.TransferPair; @@ -55,11 +54,10 @@ public class ListVector extends BaseRepeatedValueVector { private UnionListReader reader; private CallBack callBack; - public ListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { - super(field, allocator); - this.bits = new UInt1Vector(MaterializedField.create("$bits$", new MajorType(MinorType.UINT1, DataMode.REQUIRED)), allocator); + public ListVector(String name, 
BufferAllocator allocator, CallBack callBack) { + super(name, allocator); + this.bits = new UInt1Vector("$bits$", allocator); offsets = getOffsetVector(); - this.field.addChild(getDataVector().getField()); this.writer = new UnionListWriter(this); this.reader = new UnionListReader(this); this.callBack = callBack; @@ -75,15 +73,6 @@ public void allocateNew() throws OutOfMemoryException { bits.allocateNewSafe(); } - public void transferTo(ListVector target) { - offsets.makeTransferPair(target.offsets).transfer(); - bits.makeTransferPair(target.bits).transfer(); - if (target.getDataVector() instanceof ZeroVector) { - target.addOrGetVector(new VectorDescriptor(vector.getField().getType())); - } - getDataVector().makeTransferPair(target.getDataVector()).transfer(); - } - public void copyFromSafe(int inIndex, int outIndex, ListVector from) { copyFrom(inIndex, outIndex, from); } @@ -103,7 +92,7 @@ public ValueVector getDataVector() { @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new TransferImpl(field.withPath(ref), allocator); + return new TransferImpl(ref, allocator); } @Override @@ -114,20 +103,28 @@ public TransferPair makeTransferPair(ValueVector target) { private class TransferImpl implements TransferPair { ListVector to; + TransferPair pairs[] = new TransferPair[3]; - public TransferImpl(MaterializedField field, BufferAllocator allocator) { - to = new ListVector(field, allocator, null); - to.addOrGetVector(new VectorDescriptor(vector.getField().getType())); + public TransferImpl(String name, BufferAllocator allocator) { + this(new ListVector(name, allocator, null)); } public TransferImpl(ListVector to) { this.to = to; - to.addOrGetVector(new VectorDescriptor(vector.getField().getType())); + to.addOrGetVector(vector.getMinorType()); + pairs[0] = offsets.makeTransferPair(to.offsets); + pairs[1] = bits.makeTransferPair(to.bits); + if (to.getDataVector() instanceof ZeroVector) { + to.addOrGetVector(vector.getMinorType()); + } + pairs[2] = getDataVector().makeTransferPair(to.getDataVector()); } @Override public void transfer() { - transferTo(to); + for (TransferPair pair : pairs) { + pair.transfer(); + } } @Override @@ -190,17 +187,8 @@ public boolean allocateNewSafe() { return success; } -// @Override -// protected UserBitShared.SerializedField.Builder getMetadataBuilder() { -// return getField().getAsBuilder() -// .setValueCount(getAccessor().getValueCount()) -// .setBufferLength(getBufferSize()) -// .addChild(offsets.getMetadata()) -// .addChild(bits.getMetadata()) -// .addChild(vector.getMetadata()); -// } - public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { - AddOrGetResult result = super.addOrGetVector(descriptor); + public AddOrGetResult addOrGetVector(MinorType minorType) { + AddOrGetResult result = super.addOrGetVector(minorType); reader = new UnionListReader(this); return result; } @@ -213,6 +201,17 @@ public int getBufferSize() { return offsets.getBufferSize() + bits.getBufferSize() + vector.getBufferSize(); } + @Override + public Field getField() { + return new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.List(), + ImmutableList.of(getDataVector().getField())); + } + + @Override + public MinorType getMinorType() { + return MinorType.LIST; + } + @Override public void clear() { offsets.clear(); @@ -235,28 +234,8 @@ public ArrowBuf[] getBuffers(boolean clear) { return buffers; } -// @Override -// public void load(UserBitShared.SerializedField metadata, DrillBuf buffer) { -// final 
UserBitShared.SerializedField offsetMetadata = metadata.getChild(0); -// offsets.load(offsetMetadata, buffer); -// -// final int offsetLength = offsetMetadata.getBufferLength(); -// final UserBitShared.SerializedField bitMetadata = metadata.getChild(1); -// final int bitLength = bitMetadata.getBufferLength(); -// bits.load(bitMetadata, buffer.slice(offsetLength, bitLength)); -// -// final UserBitShared.SerializedField vectorMetadata = metadata.getChild(2); -// if (getDataVector() == DEFAULT_DATA_VECTOR) { -// addOrGetVector(VectorDescriptor.create(vectorMetadata.getMajorType())); -// } -// -// final int vectorLength = vectorMetadata.getBufferLength(); -// vector.load(vectorMetadata, buffer.slice(offsetLength + bitLength, vectorLength)); -// } - public UnionVector promoteToUnion() { - MaterializedField newField = MaterializedField.create(getField().getPath(), new MajorType(MinorType.UNION, DataMode.OPTIONAL)); - UnionVector vector = new UnionVector(newField, allocator, null); + UnionVector vector = new UnionVector(name, allocator, null); replaceDataVector(vector); reader = new UnionListReader(this); return vector; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index cc0953a1af8..0cb613e2f7a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -19,8 +19,10 @@ import io.netty.buffer.ArrowBuf; +import java.util.ArrayList; import java.util.Collection; import java.util.Iterator; +import java.util.List; import java.util.Map; import javax.annotation.Nullable; @@ -28,14 +30,13 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.complex.RepeatedMapVector.MapSingleCopier; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.ComplexHolder; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringHashMap; import org.apache.arrow.vector.util.TransferPair; @@ -47,19 +48,13 @@ public class MapVector extends AbstractMapVector { //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MapVector.class); - public final static MajorType TYPE = new MajorType(MinorType.MAP, DataMode.OPTIONAL); - private final SingleMapReaderImpl reader = new SingleMapReaderImpl(MapVector.this); private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); int valueCount; - public MapVector(String path, BufferAllocator allocator, CallBack callBack){ - this(MaterializedField.create(path, TYPE), allocator, callBack); - } - - public MapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ - super(field, allocator, callBack); + public MapVector(String name, BufferAllocator allocator, CallBack callBack){ + super(name, allocator, callBack); } @Override @@ -69,7 +64,6 @@ public 
FieldReader getReader() { } transient private MapTransferPair ephPair; - transient private MapSingleCopier ephPair2; public void copyFromSafe(int fromIndex, int thisIndex, MapVector from) { if(ephPair == null || ephPair.from != from) { @@ -78,13 +72,6 @@ public void copyFromSafe(int fromIndex, int thisIndex, MapVector from) { ephPair.copyValueSafe(fromIndex, thisIndex); } - public void copyFromSafe(int fromSubIndex, int thisIndex, RepeatedMapVector from) { - if(ephPair2 == null || ephPair2.from != from) { - ephPair2 = from.makeSingularCopier(this); - } - ephPair2.copySafe(fromSubIndex, thisIndex); - } - @Override protected boolean supportsDirectRead() { return true; @@ -139,7 +126,7 @@ public ArrowBuf[] getBuffers(boolean clear) { @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return new MapTransferPair(this, getField().getPath(), allocator); + return new MapTransferPair(this, name, allocator); } @Override @@ -157,8 +144,8 @@ protected static class MapTransferPair implements TransferPair{ private final MapVector from; private final MapVector to; - public MapTransferPair(MapVector from, String path, BufferAllocator allocator) { - this(from, new MapVector(MaterializedField.create(path, TYPE), allocator, from.callBack), false); + public MapTransferPair(MapVector from, String name, BufferAllocator allocator) { + this(from, new MapVector(name, allocator, from.callBack), false); } public MapTransferPair(MapVector from, MapVector to) { @@ -170,7 +157,6 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { this.to = to; this.pairs = new TransferPair[from.size()]; this.to.ephPair = null; - this.to.ephPair2 = null; int i = 0; ValueVector vector; @@ -189,7 +175,7 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { // (This is similar to what happens in ScanBatch where the children cannot be added till they are // read). To take care of this, we ensure that the hashCode of the MaterializedField does not // include the hashCode of the children but is based only on MaterializedField$key. - final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); + final ValueVector newVector = to.addOrGet(child, vector.getMinorType(), vector.getClass()); if (allocate && to.size() != preSize) { newVector.allocateNew(); } @@ -251,46 +237,6 @@ public Accessor getAccessor() { return accessor; } -// @Override -// public void load(SerializedField metadata, DrillBuf buf) { -// final List fields = metadata.getChildList(); -// valueCount = metadata.getValueCount(); -// -// int bufOffset = 0; -// for (final SerializedField child : fields) { -// final MaterializedField fieldDef = SerializedFieldHelper.create(child); -// -// ValueVector vector = getChild(fieldDef.getLastName()); -// if (vector == null) { -// if we arrive here, we didn't have a matching vector. 
-// vector = BasicTypeHelper.getNewVector(fieldDef, allocator); -// putChild(fieldDef.getLastName(), vector); -// } -// if (child.getValueCount() == 0) { -// vector.clear(); -// } else { -// vector.load(child, buf.slice(bufOffset, child.getBufferLength())); -// } -// bufOffset += child.getBufferLength(); -// } -// -// assert bufOffset == buf.capacity(); -// } -// -// @Override -// public SerializedField getMetadata() { -// SerializedField.Builder b = getField() // -// .getAsBuilder() // -// .setBufferLength(getBufferSize()) // -// .setValueCount(valueCount); -// -// -// for(ValueVector v : getChildren()) { -// b.addChild(v.getMetadata()); -// } -// return b.build(); -// } - @Override public Mutator getMutator() { return mutator; @@ -303,13 +249,6 @@ public Object getObject(int index) { Map vv = new JsonStringHashMap<>(); for (String child:getChildFieldNames()) { ValueVector v = getChild(child); - // TODO(DRILL-4001): Resolve this hack: - // The index/value count check in the following if statement is a hack - // to work around the current fact that RecordBatchLoader.load and - // MapVector.load leave child vectors with a length of zero (as opposed - // to matching the lengths of siblings and the parent map vector) - // because they don't remove (or set the lengths of) vectors from - // previous batches that aren't in the current batch. if (v != null && index < v.getAccessor().getValueCount()) { Object value = v.getAccessor().getObject(index); if (value != null) { @@ -360,6 +299,20 @@ public void clear() { valueCount = 0; } + @Override + public Field getField() { + List children = new ArrayList<>(); + for (ValueVector child : getChildren()) { + children.add(child.getField()); + } + return new Field(name, false, Tuple.INSTANCE, children); + } + + @Override + public MinorType getMinorType() { + return MinorType.MAP; + } + @Override public void close() { final Collection vectors = getChildren(); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java deleted file mode 100644 index f337f9c4a60..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedListVector.java +++ /dev/null @@ -1,427 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
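The MapVector changes above make the schema a derived property: getField() rebuilds a pojo Field from the live children on every call, instead of mutating a stored MaterializedField as the old code did. A minimal caller-side sketch follows; RootAllocator and IntVector are assumed from outside this patch, as is BufferAllocator being auto-closeable:

    // Sketch only. Assumed (not introduced by this patch): RootAllocator, IntVector.
    void sketch() {
      try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE)) {
        MapVector map = new MapVector("m", allocator, null); // name-based ctor from this patch
        map.addOrGet("a", MinorType.INT, IntVector.class);   // child registered by name
        Field field = map.getField();                        // rebuilt from live children
        // field.getChildren() mirrors the current child vectors exactly
        map.close();
      }
    }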
- */ -package org.apache.arrow.vector.complex; - -import io.netty.buffer.ArrowBuf; - -import java.util.Iterator; -import java.util.List; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.vector.AddOrGetResult; -import org.apache.arrow.vector.UInt4Vector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; -import org.apache.arrow.vector.complex.impl.NullReader; -import org.apache.arrow.vector.complex.impl.RepeatedListReaderImpl; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.holders.ComplexHolder; -import org.apache.arrow.vector.holders.RepeatedListHolder; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.util.JsonStringArrayList; -import org.apache.arrow.vector.util.TransferPair; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -public class RepeatedListVector extends AbstractContainerVector - implements RepeatedValueVector, RepeatedFixedWidthVectorLike { - - public final static MajorType TYPE = new MajorType(MinorType.LIST, DataMode.REPEATED); - private final RepeatedListReaderImpl reader = new RepeatedListReaderImpl(null, this); - final DelegateRepeatedVector delegate; - - protected static class DelegateRepeatedVector extends BaseRepeatedValueVector { - - private final RepeatedListAccessor accessor = new RepeatedListAccessor(); - private final RepeatedListMutator mutator = new RepeatedListMutator(); - private final EmptyValuePopulator emptyPopulator; - private transient DelegateTransferPair ephPair; - - public class RepeatedListAccessor extends BaseRepeatedValueVector.BaseRepeatedAccessor { - - @Override - public Object getObject(int index) { - final List list = new JsonStringArrayList<>(); - final int start = offsets.getAccessor().get(index); - final int until = offsets.getAccessor().get(index+1); - for (int i = start; i < until; i++) { - list.add(vector.getAccessor().getObject(i)); - } - return list; - } - - public void get(int index, RepeatedListHolder holder) { - assert index <= getValueCapacity(); - holder.start = getOffsetVector().getAccessor().get(index); - holder.end = getOffsetVector().getAccessor().get(index+1); - } - - public void get(int index, ComplexHolder holder) { - final FieldReader reader = getReader(); - reader.setPosition(index); - holder.reader = reader; - } - - public void get(int index, int arrayIndex, ComplexHolder holder) { - final RepeatedListHolder listHolder = new RepeatedListHolder(); - get(index, listHolder); - int offset = listHolder.start + arrayIndex; - if (offset >= listHolder.end) { - holder.reader = NullReader.INSTANCE; - } else { - FieldReader r = getDataVector().getReader(); - r.setPosition(offset); - holder.reader = r; - } - } - } - - public class RepeatedListMutator extends BaseRepeatedValueVector.BaseRepeatedMutator { - - public int add(int index) { - final int curEnd = getOffsetVector().getAccessor().get(index+1); - getOffsetVector().getMutator().setSafe(index + 1, curEnd + 1); - return curEnd; - } - - @Override - public void startNewValue(int index) { - emptyPopulator.populate(index+1); - super.startNewValue(index); - } - - @Override - public void setValueCount(int valueCount) { - 
emptyPopulator.populate(valueCount); - super.setValueCount(valueCount); - } - } - - - public class DelegateTransferPair implements TransferPair { - private final DelegateRepeatedVector target; - private final TransferPair[] children; - - public DelegateTransferPair(DelegateRepeatedVector target) { - this.target = Preconditions.checkNotNull(target); - if (target.getDataVector() == DEFAULT_DATA_VECTOR) { - target.addOrGetVector(VectorDescriptor.create(getDataVector().getField())); - target.getDataVector().allocateNew(); - } - this.children = new TransferPair[] { - getOffsetVector().makeTransferPair(target.getOffsetVector()), - getDataVector().makeTransferPair(target.getDataVector()) - }; - } - - @Override - public void transfer() { - for (TransferPair child:children) { - child.transfer(); - } - } - - @Override - public ValueVector getTo() { - return target; - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - target.allocateNew(); - for (int i = 0; i < length; i++) { - copyValueSafe(startIndex + i, i); - } - } - - @Override - public void copyValueSafe(int srcIndex, int destIndex) { - final RepeatedListHolder holder = new RepeatedListHolder(); - getAccessor().get(srcIndex, holder); - target.emptyPopulator.populate(destIndex+1); - final TransferPair vectorTransfer = children[1]; - int newIndex = target.getOffsetVector().getAccessor().get(destIndex); - //todo: make this a bulk copy. - for (int i = holder.start; i < holder.end; i++, newIndex++) { - vectorTransfer.copyValueSafe(i, newIndex); - } - target.getOffsetVector().getMutator().setSafe(destIndex + 1, newIndex); - } - } - - public DelegateRepeatedVector(String path, BufferAllocator allocator) { - this(MaterializedField.create(path, TYPE), allocator); - } - - public DelegateRepeatedVector(MaterializedField field, BufferAllocator allocator) { - super(field, allocator); - emptyPopulator = new EmptyValuePopulator(getOffsetVector()); - } - - @Override - public void allocateNew() throws OutOfMemoryException { - if (!allocateNewSafe()) { - throw new OutOfMemoryException(); - } - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return makeTransferPair(new DelegateRepeatedVector(ref, allocator)); - } - - @Override - public TransferPair makeTransferPair(ValueVector target) { - return new DelegateTransferPair(DelegateRepeatedVector.class.cast(target)); - } - - @Override - public RepeatedListAccessor getAccessor() { - return accessor; - } - - @Override - public RepeatedListMutator getMutator() { - return mutator; - } - - @Override - public FieldReader getReader() { - throw new UnsupportedOperationException(); - } - - public void copyFromSafe(int fromIndex, int thisIndex, DelegateRepeatedVector from) { - if(ephPair == null || ephPair.target != from) { - ephPair = DelegateTransferPair.class.cast(from.makeTransferPair(this)); - } - ephPair.copyValueSafe(fromIndex, thisIndex); - } - - } - - protected class RepeatedListTransferPair implements TransferPair { - private final TransferPair delegate; - - public RepeatedListTransferPair(TransferPair delegate) { - this.delegate = delegate; - } - - public void transfer() { - delegate.transfer(); - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - delegate.splitAndTransfer(startIndex, length); - } - - @Override - public ValueVector getTo() { - final DelegateRepeatedVector delegateVector = DelegateRepeatedVector.class.cast(delegate.getTo()); - return new RepeatedListVector(getField(), allocator, callBack, 
delegateVector); - } - - @Override - public void copyValueSafe(int from, int to) { - delegate.copyValueSafe(from, to); - } - } - - public RepeatedListVector(String path, BufferAllocator allocator, CallBack callBack) { - this(MaterializedField.create(path, TYPE), allocator, callBack); - } - - public RepeatedListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack) { - this(field, allocator, callBack, new DelegateRepeatedVector(field, allocator)); - } - - protected RepeatedListVector(MaterializedField field, BufferAllocator allocator, CallBack callBack, DelegateRepeatedVector delegate) { - super(field, allocator, callBack); - this.delegate = Preconditions.checkNotNull(delegate); - - final List children = Lists.newArrayList(field.getChildren()); - final int childSize = children.size(); - assert childSize < 3; - final boolean hasChild = childSize > 0; - if (hasChild) { - // the last field is data field - final MaterializedField child = children.get(childSize-1); - addOrGetVector(VectorDescriptor.create(child)); - } - } - - - @Override - public RepeatedListReaderImpl getReader() { - return reader; - } - - @Override - public DelegateRepeatedVector.RepeatedListAccessor getAccessor() { - return delegate.getAccessor(); - } - - @Override - public DelegateRepeatedVector.RepeatedListMutator getMutator() { - return delegate.getMutator(); - } - - @Override - public UInt4Vector getOffsetVector() { - return delegate.getOffsetVector(); - } - - @Override - public ValueVector getDataVector() { - return delegate.getDataVector(); - } - - @Override - public void allocateNew() throws OutOfMemoryException { - delegate.allocateNew(); - } - - @Override - public boolean allocateNewSafe() { - return delegate.allocateNewSafe(); - } - - @Override - public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { - final AddOrGetResult result = delegate.addOrGetVector(descriptor); - if (result.isCreated() && callBack != null) { - callBack.doWork(); - } - return result; - } - - @Override - public int size() { - return delegate.size(); - } - - @Override - public int getBufferSize() { - return delegate.getBufferSize(); - } - - @Override - public int getBufferSizeFor(final int valueCount) { - return delegate.getBufferSizeFor(valueCount); - } - - @Override - public void close() { - delegate.close(); - } - - @Override - public void clear() { - delegate.clear(); - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - return new RepeatedListTransferPair(delegate.getTransferPair(allocator)); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new RepeatedListTransferPair(delegate.getTransferPair(ref, allocator)); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - final RepeatedListVector target = RepeatedListVector.class.cast(to); - return new RepeatedListTransferPair(delegate.makeTransferPair(target.delegate)); - } - - @Override - public int getValueCapacity() { - return delegate.getValueCapacity(); - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - return delegate.getBuffers(clear); - } - - -// @Override -// public void load(SerializedField metadata, DrillBuf buf) { -// delegate.load(metadata, buf); -// } - -// @Override -// public SerializedField getMetadata() { -// return delegate.getMetadata(); -// } - - @Override - public Iterator iterator() { - return delegate.iterator(); - } - - @Override - public void setInitialCapacity(int numRecords) { - 
delegate.setInitialCapacity(numRecords); - } - - /** - * @deprecated - * prefer using {@link #addOrGetVector(org.apache.arrow.vector.VectorDescriptor)} instead. - */ - @Override - public T addOrGet(String name, MajorType type, Class clazz) { - final AddOrGetResult result = addOrGetVector(VectorDescriptor.create(type)); - return result.getVector(); - } - - @Override - public T getChild(String name, Class clazz) { - if (name != null) { - return null; - } - return typeify(delegate.getDataVector(), clazz); - } - - @Override - public void allocateNew(int valueCount, int innerValueCount) { - clear(); - getOffsetVector().allocateNew(valueCount + 1); - getMutator().reset(); - } - - @Override - public VectorWithOrdinal getChildVectorWithOrdinal(String name) { - if (name != null) { - return null; - } - return new VectorWithOrdinal(delegate.getDataVector(), 0); - } - - public void copyFromSafe(int fromIndex, int thisIndex, RepeatedListVector from) { - delegate.copyFromSafe(fromIndex, thisIndex, from.delegate); - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java deleted file mode 100644 index 686414e71ca..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedMapVector.java +++ /dev/null @@ -1,584 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
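Every vector touched by this patch, including the two repeated classes deleted here, moves its buffers through the same three-step TransferPair protocol; the rewritten ListVector above simply drives three such pairs (offsets, bits, data) from one array. The caller side, sketched with a source vector and allocator assumed in scope:

    // Sketch: the generic transfer protocol used throughout this patch.
    TransferPair tp = source.getTransferPair("dest", allocator); // also creates the target
    tp.transfer();                    // hands the buffers over, typically leaving source empty
    ValueVector dest = tp.getTo();    // the populated target vector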
- */ -package org.apache.arrow.vector.complex; - -import io.netty.buffer.ArrowBuf; - -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.vector.AddOrGetResult; -import org.apache.arrow.vector.AllocationHelper; -import org.apache.arrow.vector.UInt4Vector; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; -import org.apache.arrow.vector.complex.impl.NullReader; -import org.apache.arrow.vector.complex.impl.RepeatedMapReaderImpl; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.holders.ComplexHolder; -import org.apache.arrow.vector.holders.RepeatedMapHolder; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.CallBack; -import org.apache.arrow.vector.util.JsonStringArrayList; -import org.apache.arrow.vector.util.TransferPair; -import org.apache.commons.lang3.ArrayUtils; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -public class RepeatedMapVector extends AbstractMapVector - implements RepeatedValueVector, RepeatedFixedWidthVectorLike { - //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RepeatedMapVector.class); - - public final static MajorType TYPE = new MajorType(MinorType.MAP, DataMode.REPEATED); - - final UInt4Vector offsets; // offsets to start of each record (considering record indices are 0-indexed) - private final RepeatedMapReaderImpl reader = new RepeatedMapReaderImpl(RepeatedMapVector.this); - private final RepeatedMapAccessor accessor = new RepeatedMapAccessor(); - private final Mutator mutator = new Mutator(); - private final EmptyValuePopulator emptyPopulator; - - public RepeatedMapVector(MaterializedField field, BufferAllocator allocator, CallBack callBack){ - super(field, allocator, callBack); - this.offsets = new UInt4Vector(BaseRepeatedValueVector.OFFSETS_FIELD, allocator); - this.emptyPopulator = new EmptyValuePopulator(offsets); - } - - @Override - public UInt4Vector getOffsetVector() { - return offsets; - } - - @Override - public ValueVector getDataVector() { - throw new UnsupportedOperationException(); - } - - @Override - public AddOrGetResult addOrGetVector(VectorDescriptor descriptor) { - throw new UnsupportedOperationException(); - } - - @Override - public void setInitialCapacity(int numRecords) { - offsets.setInitialCapacity(numRecords + 1); - for(final ValueVector v : (Iterable) this) { - v.setInitialCapacity(numRecords * RepeatedValueVector.DEFAULT_REPEAT_PER_RECORD); - } - } - - @Override - public RepeatedMapReaderImpl getReader() { - return reader; - } - - @Override - public void allocateNew(int groupCount, int innerValueCount) { - clear(); - try { - offsets.allocateNew(groupCount + 1); - for (ValueVector v : getChildren()) { - AllocationHelper.allocatePrecomputedChildCount(v, groupCount, 50, innerValueCount); - } - } catch (OutOfMemoryException e){ - clear(); - throw e; - } - offsets.zeroVector(); - mutator.reset(); - } - - public Iterator fieldNameIterator() { - return getChildFieldNames().iterator(); - } - - @Override - public List getPrimitiveVectors() { - final List primitiveVectors = super.getPrimitiveVectors(); - 
primitiveVectors.add(offsets); - return primitiveVectors; - } - - @Override - public int getBufferSize() { - if (getAccessor().getValueCount() == 0) { - return 0; - } - long bufferSize = offsets.getBufferSize(); - for (final ValueVector v : (Iterable) this) { - bufferSize += v.getBufferSize(); - } - return (int) bufferSize; - } - - @Override - public int getBufferSizeFor(final int valueCount) { - if (valueCount == 0) { - return 0; - } - - long bufferSize = 0; - for (final ValueVector v : (Iterable) this) { - bufferSize += v.getBufferSizeFor(valueCount); - } - - return (int) bufferSize; - } - - @Override - public void close() { - offsets.close(); - super.close(); - } - - @Override - public TransferPair getTransferPair(BufferAllocator allocator) { - return new RepeatedMapTransferPair(this, getField().getPath(), allocator); - } - - @Override - public TransferPair makeTransferPair(ValueVector to) { - return new RepeatedMapTransferPair(this, (RepeatedMapVector)to); - } - - MapSingleCopier makeSingularCopier(MapVector to) { - return new MapSingleCopier(this, to); - } - - protected static class MapSingleCopier { - private final TransferPair[] pairs; - public final RepeatedMapVector from; - - public MapSingleCopier(RepeatedMapVector from, MapVector to) { - this.from = from; - this.pairs = new TransferPair[from.size()]; - - int i = 0; - ValueVector vector; - for (final String child:from.getChildFieldNames()) { - int preSize = to.size(); - vector = from.getChild(child); - if (vector == null) { - continue; - } - final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); - if (to.size() != preSize) { - newVector.allocateNew(); - } - pairs[i++] = vector.makeTransferPair(newVector); - } - } - - public void copySafe(int fromSubIndex, int toIndex) { - for (TransferPair p : pairs) { - p.copyValueSafe(fromSubIndex, toIndex); - } - } - } - - public TransferPair getTransferPairToSingleMap(String reference, BufferAllocator allocator) { - return new SingleMapTransferPair(this, reference, allocator); - } - - @Override - public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new RepeatedMapTransferPair(this, ref, allocator); - } - - @Override - public boolean allocateNewSafe() { - /* boolean to keep track if all the memory allocation were successful - * Used in the case of composite vectors when we need to allocate multiple - * buffers for multiple vectors. 
If one of the allocations failed we need to - * clear all the memory that we allocated - */ - boolean success = false; - try { - if (!offsets.allocateNewSafe()) { - return false; - } - success = super.allocateNewSafe(); - } finally { - if (!success) { - clear(); - } - } - offsets.zeroVector(); - return success; - } - - protected static class SingleMapTransferPair implements TransferPair { - private final TransferPair[] pairs; - private final RepeatedMapVector from; - private final MapVector to; - private static final MajorType MAP_TYPE = new MajorType(MinorType.MAP, DataMode.REQUIRED); - - public SingleMapTransferPair(RepeatedMapVector from, String path, BufferAllocator allocator) { - this(from, new MapVector(MaterializedField.create(path, MAP_TYPE), allocator, from.callBack), false); - } - - public SingleMapTransferPair(RepeatedMapVector from, MapVector to) { - this(from, to, true); - } - - public SingleMapTransferPair(RepeatedMapVector from, MapVector to, boolean allocate) { - this.from = from; - this.to = to; - this.pairs = new TransferPair[from.size()]; - int i = 0; - ValueVector vector; - for (final String child : from.getChildFieldNames()) { - int preSize = to.size(); - vector = from.getChild(child); - if (vector == null) { - continue; - } - final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); - if (allocate && to.size() != preSize) { - newVector.allocateNew(); - } - pairs[i++] = vector.makeTransferPair(newVector); - } - } - - - @Override - public void transfer() { - for (TransferPair p : pairs) { - p.transfer(); - } - to.getMutator().setValueCount(from.getAccessor().getValueCount()); - from.clear(); - } - - @Override - public ValueVector getTo() { - return to; - } - - @Override - public void copyValueSafe(int from, int to) { - for (TransferPair p : pairs) { - p.copyValueSafe(from, to); - } - } - - @Override - public void splitAndTransfer(int startIndex, int length) { - for (TransferPair p : pairs) { - p.splitAndTransfer(startIndex, length); - } - to.getMutator().setValueCount(length); - } - } - - private static class RepeatedMapTransferPair implements TransferPair{ - - private final TransferPair[] pairs; - private final RepeatedMapVector to; - private final RepeatedMapVector from; - - public RepeatedMapTransferPair(RepeatedMapVector from, String path, BufferAllocator allocator) { - this(from, new RepeatedMapVector(MaterializedField.create(path, TYPE), allocator, from.callBack), false); - } - - public RepeatedMapTransferPair(RepeatedMapVector from, RepeatedMapVector to) { - this(from, to, true); - } - - public RepeatedMapTransferPair(RepeatedMapVector from, RepeatedMapVector to, boolean allocate) { - this.from = from; - this.to = to; - this.pairs = new TransferPair[from.size()]; - this.to.ephPair = null; - - int i = 0; - ValueVector vector; - for (final String child : from.getChildFieldNames()) { - final int preSize = to.size(); - vector = from.getChild(child); - if (vector == null) { - continue; - } - - final ValueVector newVector = to.addOrGet(child, vector.getField().getType(), vector.getClass()); - if (to.size() != preSize) { - newVector.allocateNew(); - } - - pairs[i++] = vector.makeTransferPair(newVector); - } - } - - @Override - public void transfer() { - from.offsets.transferTo(to.offsets); - for (TransferPair p : pairs) { - p.transfer(); - } - from.clear(); - } - - @Override - public ValueVector getTo() { - return to; - } - - @Override - public void copyValueSafe(int srcIndex, int destIndex) { - RepeatedMapHolder holder = new 
RepeatedMapHolder(); - from.getAccessor().get(srcIndex, holder); - to.emptyPopulator.populate(destIndex + 1); - int newIndex = to.offsets.getAccessor().get(destIndex); - //todo: make these bulk copies - for (int i = holder.start; i < holder.end; i++, newIndex++) { - for (TransferPair p : pairs) { - p.copyValueSafe(i, newIndex); - } - } - to.offsets.getMutator().setSafe(destIndex + 1, newIndex); - } - - @Override - public void splitAndTransfer(final int groupStart, final int groups) { - final UInt4Vector.Accessor a = from.offsets.getAccessor(); - final UInt4Vector.Mutator m = to.offsets.getMutator(); - - final int startPos = a.get(groupStart); - final int endPos = a.get(groupStart + groups); - final int valuesToCopy = endPos - startPos; - - to.offsets.clear(); - to.offsets.allocateNew(groups + 1); - - int normalizedPos; - for (int i = 0; i < groups + 1; i++) { - normalizedPos = a.get(groupStart + i) - startPos; - m.set(i, normalizedPos); - } - - m.setValueCount(groups + 1); - to.emptyPopulator.populate(groups); - - for (final TransferPair p : pairs) { - p.splitAndTransfer(startPos, valuesToCopy); - } - } - } - - - transient private RepeatedMapTransferPair ephPair; - - public void copyFromSafe(int fromIndex, int thisIndex, RepeatedMapVector from) { - if (ephPair == null || ephPair.from != from) { - ephPair = (RepeatedMapTransferPair) from.makeTransferPair(this); - } - ephPair.copyValueSafe(fromIndex, thisIndex); - } - - @Override - public int getValueCapacity() { - return Math.max(offsets.getValueCapacity() - 1, 0); - } - - @Override - public RepeatedMapAccessor getAccessor() { - return accessor; - } - - @Override - public ArrowBuf[] getBuffers(boolean clear) { - final int expectedBufferSize = getBufferSize(); - final int actualBufferSize = super.getBufferSize(); - - Preconditions.checkArgument(expectedBufferSize == actualBufferSize + offsets.getBufferSize()); - return ArrayUtils.addAll(offsets.getBuffers(clear), super.getBuffers(clear)); - } - - -// @Override -// public void load(SerializedField metadata, DrillBuf buffer) { -// final List children = metadata.getChildList(); -// -// final SerializedField offsetField = children.get(0); -// offsets.load(offsetField, buffer); -// int bufOffset = offsetField.getBufferLength(); -// -// for (int i = 1; i < children.size(); i++) { -// final SerializedField child = children.get(i); -// final MaterializedField fieldDef = SerializedFieldHelper.create(child); -// ValueVector vector = getChild(fieldDef.getLastName()); -// if (vector == null) { - // if we arrive here, we didn't have a matching vector. 
-// vector = BasicTypeHelper.getNewVector(fieldDef, allocator); -// putChild(fieldDef.getLastName(), vector); -// } -// final int vectorLength = child.getBufferLength(); -// vector.load(child, buffer.slice(bufOffset, vectorLength)); -// bufOffset += vectorLength; -// } -// -// assert bufOffset == buffer.capacity(); -// } -// -// -// @Override -// public SerializedField getMetadata() { -// SerializedField.Builder builder = getField() // -// .getAsBuilder() // -// .setBufferLength(getBufferSize()) // - // while we don't need to actually read this on load, we need it to make sure we don't skip deserialization of this vector -// .setValueCount(accessor.getValueCount()); -// builder.addChild(offsets.getMetadata()); -// for (final ValueVector child : getChildren()) { -// builder.addChild(child.getMetadata()); -// } -// return builder.build(); -// } - - @Override - public Mutator getMutator() { - return mutator; - } - - public class RepeatedMapAccessor implements RepeatedAccessor { - @Override - public Object getObject(int index) { - final List list = new JsonStringArrayList<>(); - final int end = offsets.getAccessor().get(index+1); - String fieldName; - for (int i = offsets.getAccessor().get(index); i < end; i++) { - final Map vv = Maps.newLinkedHashMap(); - for (final MaterializedField field : getField().getChildren()) { - if (!field.equals(BaseRepeatedValueVector.OFFSETS_FIELD)) { - fieldName = field.getLastName(); - final Object value = getChild(fieldName).getAccessor().getObject(i); - if (value != null) { - vv.put(fieldName, value); - } - } - } - list.add(vv); - } - return list; - } - - @Override - public int getValueCount() { - return Math.max(offsets.getAccessor().getValueCount() - 1, 0); - } - - @Override - public int getInnerValueCount() { - final int valueCount = getValueCount(); - if (valueCount == 0) { - return 0; - } - return offsets.getAccessor().get(valueCount); - } - - @Override - public int getInnerValueCountAt(int index) { - return offsets.getAccessor().get(index+1) - offsets.getAccessor().get(index); - } - - @Override - public boolean isEmpty(int index) { - return false; - } - - @Override - public boolean isNull(int index) { - return false; - } - - public void get(int index, RepeatedMapHolder holder) { - assert index < getValueCapacity() : - String.format("Attempted to access index %d when value capacity is %d", - index, getValueCapacity()); - final UInt4Vector.Accessor offsetsAccessor = offsets.getAccessor(); - holder.start = offsetsAccessor.get(index); - holder.end = offsetsAccessor.get(index + 1); - } - - public void get(int index, ComplexHolder holder) { - final FieldReader reader = getReader(); - reader.setPosition(index); - holder.reader = reader; - } - - public void get(int index, int arrayIndex, ComplexHolder holder) { - final RepeatedMapHolder h = new RepeatedMapHolder(); - get(index, h); - final int offset = h.start + arrayIndex; - - if (offset >= h.end) { - holder.reader = NullReader.INSTANCE; - } else { - reader.setSinglePosition(index, arrayIndex); - holder.reader = reader; - } - } - } - - public class Mutator implements RepeatedMutator { - @Override - public void startNewValue(int index) { - emptyPopulator.populate(index + 1); - offsets.getMutator().setSafe(index + 1, offsets.getAccessor().get(index)); - } - - @Override - public void setValueCount(int topLevelValueCount) { - emptyPopulator.populate(topLevelValueCount); - offsets.getMutator().setValueCount(topLevelValueCount == 0 ? 
0 : topLevelValueCount + 1); - int childValueCount = offsets.getAccessor().get(topLevelValueCount); - for (final ValueVector v : getChildren()) { - v.getMutator().setValueCount(childValueCount); - } - } - - @Override - public void reset() {} - - @Override - public void generateTestData(int values) {} - - public int add(int index) { - final int prevEnd = offsets.getAccessor().get(index + 1); - offsets.getMutator().setSafe(index + 1, prevEnd + 1); - return prevEnd; - } - } - - @Override - public void clear() { - getMutator().reset(); - - offsets.clear(); - for(final ValueVector vector : getChildren()) { - vector.clear(); - } - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java index 99c0a0aeb1e..54db393e831 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/RepeatedValueVector.java @@ -28,7 +28,7 @@ * uses the offset vector to determine the sequence of cells pertaining to an individual value. * */ -public interface RepeatedValueVector extends ValueVector, ContainerVectorLike { +public interface RepeatedValueVector extends ValueVector { final static int DEFAULT_REPEAT_PER_RECORD = 5; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java index 264e241e739..259a954233c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java @@ -19,20 +19,20 @@ import java.util.Iterator; +import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.flatbuf.Union; +import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.holders.UnionHolder; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; abstract class AbstractBaseReader implements FieldReader{ static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseReader.class); - private static final MajorType LATE_BIND_TYPE = new MajorType(MinorType.LATE, DataMode.OPTIONAL); private int index; @@ -58,15 +58,6 @@ public Iterator iterator() { throw new IllegalStateException("The current reader doesn't support reading as a map."); } - public MajorType getType(){ - throw new IllegalStateException("The current reader doesn't support getting type information."); - } - - @Override - public MaterializedField getField() { - return MaterializedField.create("unknown", LATE_BIND_TYPE); - } - @Override public boolean next() { throw new IllegalStateException("The current reader doesn't support getting next information."); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java index 4e1e103a12e..e6cf098f16f 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseWriter.java @@ -23,25 +23,11 @@ abstract class AbstractBaseWriter implements FieldWriter { //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractBaseWriter.class); - final FieldWriter parent; private int index; - public AbstractBaseWriter(FieldWriter parent) { - this.parent = parent; - } - @Override public String toString() { - return super.toString() + "[index = " + index + ", parent = " + parent + "]"; - } - - @Override - public FieldWriter getParent() { - return parent; - } - - public boolean isRoot() { - return parent == null; + return super.toString() + "[index = " + index + "]"; } int idx() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 4e2051fd4ef..4d2adfb3256 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -17,20 +17,20 @@ */ package org.apache.arrow.vector.complex.impl; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.StateTool; import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import com.google.common.base.Preconditions; +import org.apache.arrow.vector.types.pojo.Field; public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWriter { // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ComplexWriterImpl.class); private SingleMapWriter mapRoot; - private SingleListWriter listRoot; + private UnionListWriter listRoot; private final MapVector container; Mode mode = Mode.INIT; @@ -40,7 +40,6 @@ public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWri private enum Mode { INIT, MAP, LIST }; public ComplexWriterImpl(String name, MapVector container, boolean unionEnabled){ - super(null); this.name = name; this.container = container; this.unionEnabled = unionEnabled; @@ -51,7 +50,7 @@ public ComplexWriterImpl(String name, MapVector container){ } @Override - public MaterializedField getField() { + public Field getField() { return container.getField(); } @@ -123,7 +122,7 @@ public MapWriter directMap(){ case INIT: MapVector map = (MapVector) container; - mapRoot = new SingleMapWriter(map, this, unionEnabled); + mapRoot = new SingleMapWriter(map); mapRoot.setPosition(idx()); mode = Mode.MAP; break; @@ -143,8 +142,8 @@ public MapWriter rootAsMap() { switch(mode){ case INIT: - MapVector map = container.addOrGet(name, Types.required(MinorType.MAP), MapVector.class); - mapRoot = new SingleMapWriter(map, this, unionEnabled); + MapVector map = container.addOrGet(name, MinorType.MAP, MapVector.class); + mapRoot = new SingleMapWriter(map); mapRoot.setPosition(idx()); mode = Mode.MAP; break; @@ -174,7 +173,12 @@ public ListWriter rootAsList() { switch(mode){ case INIT: - listRoot = new SingleListWriter(name, container, this); + int vectorCount = container.size(); + ListVector listVector = container.addOrGet(name, MinorType.LIST, ListVector.class); + if (container.size() 
> vectorCount) { + listVector.allocateNew(); + } + listRoot = new UnionListWriter(listVector); listRoot.setPosition(idx()); mode = Mode.LIST; break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 462ec9dd86a..586b1283fe8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -17,20 +17,14 @@ */ package org.apache.arrow.vector.complex.impl; -import java.lang.reflect.Constructor; - import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.VectorDescriptor; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.AbstractMapVector; import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.writer.FieldWriter; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.BasicTypeHelper; +import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.TransferPair; /** @@ -56,14 +50,12 @@ private enum State { private FieldWriter writer; public PromotableWriter(ValueVector v, AbstractMapVector parentContainer) { - super(null); this.parentContainer = parentContainer; this.listVector = null; init(v); } public PromotableWriter(ValueVector v, ListVector listVector) { - super(null); this.listVector = listVector; this.parentContainer = null; init(v); @@ -84,30 +76,8 @@ private void init(ValueVector v) { private void setWriter(ValueVector v) { state = State.SINGLE; vector = v; - type = v.getField().getType().getMinorType(); - Class writerClass = BasicTypeHelper - .getWriterImpl(v.getField().getType().getMinorType(), v.getField().getDataMode()); - if (writerClass.equals(SingleListWriter.class)) { - writerClass = UnionListWriter.class; - } - Class vectorClass = BasicTypeHelper.getValueVectorClass(v.getField().getType().getMinorType(), v.getField() - .getDataMode()); - try { - Constructor constructor = null; - for (Constructor c : writerClass.getConstructors()) { - if (c.getParameterTypes().length == 3) { - constructor = c; - } - } - if (constructor == null) { - constructor = writerClass.getConstructor(vectorClass, AbstractFieldWriter.class); - writer = (FieldWriter) constructor.newInstance(vector, null); - } else { - writer = (FieldWriter) constructor.newInstance(vector, null, true); - } - } catch (ReflectiveOperationException e) { - throw new RuntimeException(e); - } + type = v.getMinorType(); + writer = type.getNewFieldWriter(vector); } @Override @@ -129,7 +99,7 @@ protected FieldWriter getWriter(MinorType type) { if (type == null) { return null; } - ValueVector v = listVector.addOrGetVector(new VectorDescriptor(new MajorType(type, DataMode.OPTIONAL))).getVector(); + ValueVector v = listVector.addOrGetVector(type).getVector(); v.allocateNew(); setWriter(v); writer.setPosition(position); @@ -150,11 +120,11 @@ protected FieldWriter getWriter() { } private FieldWriter promoteToUnion() { - String name = vector.getField().getLastName(); - TransferPair tp = vector.getTransferPair(vector.getField().getType().getMinorType().name().toLowerCase(), vector.getAllocator()); + String name = 
vector.getField().getName(); + TransferPair tp = vector.getTransferPair(vector.getMinorType().name().toLowerCase(), vector.getAllocator()); tp.transfer(); if (parentContainer != null) { - unionVector = parentContainer.addOrGet(name, new MajorType(MinorType.UNION, DataMode.OPTIONAL), UnionVector.class); + unionVector = parentContainer.addOrGet(name, MinorType.UNION, UnionVector.class); unionVector.allocateNew(); } else if (listVector != null) { unionVector = listVector.promoteToUnion(); @@ -163,7 +133,7 @@ private FieldWriter promoteToUnion() { writer = new UnionWriter(unionVector); writer.setPosition(idx()); for (int i = 0; i < idx(); i++) { - unionVector.getMutator().setType(i, vector.getField().getType().getMinorType()); + unionVector.getMutator().setType(i, vector.getMinorType()); } vector = null; state = State.UNION; @@ -181,7 +151,7 @@ public void clear() { } @Override - public MaterializedField getField() { + public Field getField() { return getWriter().getField(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java deleted file mode 100644 index dd1a152e2f6..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedListReaderImpl.java +++ /dev/null @@ -1,145 +0,0 @@ -/******************************************************************************* - - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
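The PromotableWriter changes above compress the old reflection-based writer lookup into type.getNewFieldWriter(vector), and the promoteToUnion() sequence is now short enough to read straight through. Restated as a linear sketch, every call below appears in the patch, with idx standing in for the writer's current index:

    // Sketch of the promotion sequence in PromotableWriter.promoteToUnion().
    String name = vector.getField().getName();        // pojo Field replaces MaterializedField
    TransferPair tp = vector.getTransferPair(
        vector.getMinorType().name().toLowerCase(), vector.getAllocator());
    tp.transfer();                                    // park the existing data
    UnionVector union = listVector.promoteToUnion();  // swap in a union data vector
    for (int i = 0; i < idx; i++) {
      union.getMutator().setType(i, vector.getMinorType()); // back-fill type ids
    }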
- ******************************************************************************/ -package org.apache.arrow.vector.complex.impl; - - -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.complex.RepeatedListVector; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; -import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.holders.RepeatedListHolder; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; - -public class RepeatedListReaderImpl extends AbstractFieldReader{ - private static final int NO_VALUES = Integer.MAX_VALUE - 1; - private static final MajorType TYPE = new MajorType(MinorType.LIST, DataMode.REPEATED); - private final String name; - private final RepeatedListVector container; - private FieldReader reader; - - public RepeatedListReaderImpl(String name, RepeatedListVector container) { - super(); - this.name = name; - this.container = container; - } - - @Override - public MajorType getType() { - return TYPE; - } - - @Override - public void copyAsValue(ListWriter writer) { - if (currentOffset == NO_VALUES) { - return; - } - RepeatedListWriter impl = (RepeatedListWriter) writer; - impl.container.copyFromSafe(idx(), impl.idx(), container); - } - - @Override - public void copyAsField(String name, MapWriter writer) { - if (currentOffset == NO_VALUES) { - return; - } - RepeatedListWriter impl = (RepeatedListWriter) writer.list(name); - impl.container.copyFromSafe(idx(), impl.idx(), container); - } - - private int currentOffset; - private int maxOffset; - - @Override - public void reset() { - super.reset(); - currentOffset = 0; - maxOffset = 0; - if (reader != null) { - reader.reset(); - } - reader = null; - } - - @Override - public int size() { - return maxOffset - currentOffset; - } - - @Override - public void setPosition(int index) { - if (index < 0 || index == NO_VALUES) { - currentOffset = NO_VALUES; - return; - } - - super.setPosition(index); - RepeatedListHolder h = new RepeatedListHolder(); - container.getAccessor().get(index, h); - if (h.start == h.end) { - currentOffset = NO_VALUES; - } else { - currentOffset = h.start-1; - maxOffset = h.end; - if(reader != null) { - reader.setPosition(currentOffset); - } - } - } - - @Override - public boolean next() { - if (currentOffset +1 < maxOffset) { - currentOffset++; - if (reader != null) { - reader.setPosition(currentOffset); - } - return true; - } else { - currentOffset = NO_VALUES; - return false; - } - } - - @Override - public Object readObject() { - return container.getAccessor().getObject(idx()); - } - - @Override - public FieldReader reader() { - if (reader == null) { - ValueVector child = container.getChild(name); - if (child == null) { - reader = NullReader.INSTANCE; - } else { - reader = child.getReader(); - } - reader.setPosition(currentOffset); - } - return reader; - } - - public boolean isSet() { - return true; - } - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java deleted file mode 100644 index 09a831d8329..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/RepeatedMapReaderImpl.java +++ /dev/null @@ -1,192 +0,0 @@ 
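RepeatedListReaderImpl, deleted above, and RepeatedMapReaderImpl, deleted next, share one iteration idiom: setPosition() resolves a row's offset window into currentOffset/maxOffset (with NO_VALUES as the empty sentinel) and next() steps through it. The caller-side loop, with reader and row assumed in scope:

    // Sketch: iterating one repeated value with the deleted reader API.
    reader.setPosition(row);          // resolves offsets; empty rows park on NO_VALUES
    while (reader.next()) {           // steps currentOffset through [start, end)
      Object element = reader.reader().readObject(); // child reader tracks currentOffset
    }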
-/******************************************************************************* - - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - ******************************************************************************/ -package org.apache.arrow.vector.complex.impl; - -import java.util.Map; - -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.holders.RepeatedMapHolder; -import org.apache.arrow.vector.types.Types.MajorType; - -import com.google.common.collect.Maps; - -@SuppressWarnings("unused") -public class RepeatedMapReaderImpl extends AbstractFieldReader{ - private static final int NO_VALUES = Integer.MAX_VALUE - 1; - - private final RepeatedMapVector vector; - private final Map fields = Maps.newHashMap(); - - public RepeatedMapReaderImpl(RepeatedMapVector vector) { - this.vector = vector; - } - - private void setChildrenPosition(int index) { - for (FieldReader r : fields.values()) { - r.setPosition(index); - } - } - - @Override - public FieldReader reader(String name) { - FieldReader reader = fields.get(name); - if (reader == null) { - ValueVector child = vector.getChild(name); - if (child == null) { - reader = NullReader.INSTANCE; - } else { - reader = child.getReader(); - } - fields.put(name, reader); - reader.setPosition(currentOffset); - } - return reader; - } - - @Override - public FieldReader reader() { - if (currentOffset == NO_VALUES) { - return NullReader.INSTANCE; - } - - setChildrenPosition(currentOffset); - return new SingleLikeRepeatedMapReaderImpl(vector, this); - } - - private int currentOffset; - private int maxOffset; - - @Override - public void reset() { - super.reset(); - currentOffset = 0; - maxOffset = 0; - for (FieldReader reader:fields.values()) { - reader.reset(); - } - fields.clear(); - } - - @Override - public int size() { - if (isNull()) { - return 0; - } - return maxOffset - (currentOffset < 0 ? 
0 : currentOffset); - } - - @Override - public void setPosition(int index) { - if (index < 0 || index == NO_VALUES) { - currentOffset = NO_VALUES; - return; - } - - super.setPosition(index); - RepeatedMapHolder h = new RepeatedMapHolder(); - vector.getAccessor().get(index, h); - if (h.start == h.end) { - currentOffset = NO_VALUES; - } else { - currentOffset = h.start-1; - maxOffset = h.end; - setChildrenPosition(currentOffset); - } - } - - public void setSinglePosition(int index, int childIndex) { - super.setPosition(index); - RepeatedMapHolder h = new RepeatedMapHolder(); - vector.getAccessor().get(index, h); - if (h.start == h.end) { - currentOffset = NO_VALUES; - } else { - int singleOffset = h.start + childIndex; - assert singleOffset < h.end; - currentOffset = singleOffset; - maxOffset = singleOffset + 1; - setChildrenPosition(singleOffset); - } - } - - @Override - public boolean next() { - if (currentOffset +1 < maxOffset) { - setChildrenPosition(++currentOffset); - return true; - } else { - currentOffset = NO_VALUES; - return false; - } - } - - public boolean isNull() { - return currentOffset == NO_VALUES; - } - - @Override - public Object readObject() { - return vector.getAccessor().getObject(idx()); - } - - @Override - public MajorType getType() { - return vector.getField().getType(); - } - - @Override - public java.util.Iterator iterator() { - return vector.fieldNameIterator(); - } - - @Override - public boolean isSet() { - return true; - } - - @Override - public void copyAsValue(MapWriter writer) { - if (currentOffset == NO_VALUES) { - return; - } - RepeatedMapWriter impl = (RepeatedMapWriter) writer; - impl.container.copyFromSafe(idx(), impl.idx(), vector); - } - - public void copyAsValueSingle(MapWriter writer) { - if (currentOffset == NO_VALUES) { - return; - } - SingleMapWriter impl = (SingleMapWriter) writer; - impl.container.copyFromSafe(currentOffset, impl.idx(), vector); - } - - @Override - public void copyAsField(String name, MapWriter writer) { - if (currentOffset == NO_VALUES) { - return; - } - RepeatedMapWriter impl = (RepeatedMapWriter) writer.map(name); - impl.container.copyFromSafe(idx(), impl.idx(), vector); - } - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java deleted file mode 100644 index 086d26e1194..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleLikeRepeatedMapReaderImpl.java +++ /dev/null @@ -1,89 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.arrow.vector.complex.impl; - -import java.util.Iterator; - -import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.types.Types; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.types.Types.MinorType; - -public class SingleLikeRepeatedMapReaderImpl extends AbstractFieldReader{ - - private RepeatedMapReaderImpl delegate; - - public SingleLikeRepeatedMapReaderImpl(RepeatedMapVector vector, FieldReader delegate) { - this.delegate = (RepeatedMapReaderImpl) delegate; - } - - @Override - public int size() { - throw new UnsupportedOperationException("You can't call size on a single map reader."); - } - - @Override - public boolean next() { - throw new UnsupportedOperationException("You can't call next on a single map reader."); - } - - @Override - public MajorType getType() { - return Types.required(MinorType.MAP); - } - - - @Override - public void copyAsValue(MapWriter writer) { - delegate.copyAsValueSingle(writer); - } - - public void copyAsValueSingle(MapWriter writer){ - delegate.copyAsValueSingle(writer); - } - - @Override - public FieldReader reader(String name) { - return delegate.reader(name); - } - - @Override - public void setPosition(int index) { - delegate.setPosition(index); - } - - @Override - public Object readObject() { - return delegate.readObject(); - } - - @Override - public Iterator iterator() { - return delegate.iterator(); - } - - @Override - public boolean isSet() { - return ! delegate.isNull(); - } - - -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java index f16f628603d..b8f58658eae 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleListReaderImpl.java @@ -24,14 +24,11 @@ import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.types.Types; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; @SuppressWarnings("unused") public class SingleListReaderImpl extends AbstractFieldReader{ - private static final MajorType TYPE = Types.optional(MinorType.LIST); private final String name; private final AbstractContainerVector container; private FieldReader reader; @@ -42,12 +39,6 @@ public SingleListReaderImpl(String name, AbstractContainerVector container) { this.container = container; } - @Override - public MajorType getType() { - return TYPE; - } - - @Override public void setPosition(int index) { super.setPosition(index); @@ -70,6 +61,11 @@ public FieldReader reader() { return reader; } + @Override + public MinorType getMinorType() { + return MinorType.LIST; + } + @Override public boolean isSet() { return false; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java index 84b99801419..1c43240901c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java +++ 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -27,9 +27,9 @@ import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.types.Types.MajorType; import com.google.common.collect.Maps; +import org.apache.arrow.vector.types.Types.MinorType; @SuppressWarnings("unused") public class SingleMapReaderImpl extends AbstractFieldReader{ @@ -77,13 +77,13 @@ public Object readObject() { } @Override - public boolean isSet() { - return true; + public MinorType getMinorType() { + return MinorType.MAP; } @Override - public MajorType getType(){ - return vector.getField().getType(); + public boolean isSet() { + return true; } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index 9b54d02e571..39cf0042115 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -25,8 +25,6 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.holders.UnionHolder; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; public class UnionListReader extends AbstractFieldReader { @@ -46,12 +44,6 @@ public boolean isSet() { return true; } - MajorType type = new MajorType(MinorType.LIST, DataMode.OPTIONAL); - - public MajorType getType() { - return type; - } - private int currentOffset; private int maxOffset; @@ -72,6 +64,11 @@ public Object readObject() { return vector.getAccessor().getObject(idx()); } + @Override + public MinorType getMinorType() { + return MinorType.LIST; + } + @Override public void read(int index, UnionHolder holder) { setPosition(idx()); @@ -82,6 +79,12 @@ public void read(int index, UnionHolder holder) { holder.isSet = data.getReader().isSet() ? 1 : 0; } + @Override + public int size() { + int size = maxOffset - currentOffset - 1; + return size < 0 ? 0 : size; + } + @Override public boolean next() { if (currentOffset + 1 < maxOffset) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java deleted file mode 100644 index 5a5fe0305d8..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/ObjectHolder.java +++ /dev/null @@ -1,38 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.arrow.vector.holders; - -import org.apache.arrow.vector.types.Types; - -/* - * Holder class for the vector ObjectVector. This holder internally stores a - * reference to an object. The ObjectVector maintains an array of these objects. - * This holder can be used only as workspace variables in aggregate functions. - * Using this holder should be avoided and we should stick to native holder types. - */ -@Deprecated -public class ObjectHolder implements ValueHolder { - public static final Types.MajorType TYPE = Types.required(Types.MinorType.GENERIC_OBJECT); - - public Types.MajorType getType() { - return TYPE; - } - - public Object obj; -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java index b868a620f98..b1b695e58a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/holders/UnionHolder.java @@ -18,17 +18,14 @@ package org.apache.arrow.vector.holders; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; import org.apache.arrow.vector.types.Types.MinorType; public class UnionHolder implements ValueHolder { - public static final MajorType TYPE = new MajorType(MinorType.UNION, DataMode.OPTIONAL); public FieldReader reader; public int isSet; - public MajorType getType() { - return reader.getType(); + public MinorType getMinorType() { + return reader.getMinorType(); } public boolean isSet() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java b/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java deleted file mode 100644 index c73098b2a85..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/MaterializedField.java +++ /dev/null @@ -1,217 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.arrow.vector.types; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.LinkedHashSet; -import java.util.Objects; - -import org.apache.arrow.vector.types.Types.DataMode; -import org.apache.arrow.vector.types.Types.MajorType; -import org.apache.arrow.vector.util.BasicTypeHelper; - - -public class MaterializedField { - private final String name; - private final MajorType type; - // use an ordered set as existing code relies on order (e,g. 
parquet writer) - private final LinkedHashSet children; - - MaterializedField(String name, MajorType type, LinkedHashSet children) { - this.name = name; - this.type = type; - this.children = children; - } - - public Collection getChildren() { - return new ArrayList<>(children); - } - - public MaterializedField newWithChild(MaterializedField child) { - MaterializedField newField = clone(); - newField.addChild(child); - return newField; - } - - public void addChild(MaterializedField field){ - children.add(field); - } - - public MaterializedField clone() { - return withPathAndType(name, getType()); - } - - public MaterializedField withType(MajorType type) { - return withPathAndType(name, type); - } - - public MaterializedField withPath(String name) { - return withPathAndType(name, getType()); - } - - public MaterializedField withPathAndType(String name, final MajorType type) { - final LinkedHashSet newChildren = new LinkedHashSet<>(children.size()); - for (final MaterializedField child:children) { - newChildren.add(child.clone()); - } - return new MaterializedField(name, type, newChildren); - } - -// public String getLastName(){ -// PathSegment seg = key.path.getRootSegment(); -// while (seg.getChild() != null) { -// seg = seg.getChild(); -// } -// return seg.getNameSegment().getPath(); -// } - - - // TODO: rewrite without as direct match rather than conversion then match. -// public boolean matches(SerializedField booleanfield){ -// MaterializedField f = create(field); -// return f.equals(this); -// } - - public static MaterializedField create(String name, MajorType type){ - return new MaterializedField(name, type, new LinkedHashSet()); - } - -// public String getName(){ -// StringBuilder sb = new StringBuilder(); -// boolean first = true; -// for(NamePart np : def.getNameList()){ -// if(np.getType() == Type.ARRAY){ -// sb.append("[]"); -// }else{ -// if(first){ -// first = false; -// }else{ -// sb.append("."); -// } -// sb.append('`'); -// sb.append(np.getName()); -// sb.append('`'); -// -// } -// } -// return sb.toString(); -// } - - public String getPath() { - return getName(); - } - - public String getLastName() { - return getName(); - } - - public String getName() { - return name; - } - -// public int getWidth() { -// return type.getWidth(); -// } - - public MajorType getType() { - return type; - } - - public int getScale() { - return type.getScale(); - } - public int getPrecision() { - return type.getPrecision(); - } - public boolean isNullable() { - return type.getMode() == DataMode.OPTIONAL; - } - - public DataMode getDataMode() { - return type.getMode(); - } - - public MaterializedField getOtherNullableVersion(){ - MajorType mt = type; - DataMode newDataMode = null; - switch (mt.getMode()){ - case OPTIONAL: - newDataMode = DataMode.REQUIRED; - break; - case REQUIRED: - newDataMode = DataMode.OPTIONAL; - break; - default: - throw new UnsupportedOperationException(); - } - return new MaterializedField(name, new MajorType(mt.getMinorType(), newDataMode, mt.getPrecision(), mt.getScale(), mt.getTimezone(), mt.getSubTypes()), children); - } - - public Class getValueClass() { - return BasicTypeHelper.getValueVectorClass(getType().getMinorType(), getDataMode()); - } - - @Override - public int hashCode() { - return Objects.hash(this.name, this.type, this.children); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; - } - if (obj == null) { - return false; - } - if (getClass() != obj.getClass()) { - return false; - } - MaterializedField other = 
(MaterializedField) obj; - // DRILL-1872: Compute equals only on key. See also the comment - // in MapVector$MapTransferPair - - return this.name.equalsIgnoreCase(other.name) && - Objects.equals(this.type, other.type); - } - - - @Override - public String toString() { - final int maxLen = 10; - String childStr = children != null && !children.isEmpty() ? toString(children, maxLen) : ""; - return name + "(" + type.getMinorType().name() + ":" + type.getMode().name() + ")" + childStr; - } - - - private String toString(Collection collection, int maxLen) { - StringBuilder builder = new StringBuilder(); - builder.append("["); - int i = 0; - for (Iterator iterator = collection.iterator(); iterator.hasNext() && i < maxLen; i++) { - if (i > 0){ - builder.append(", "); - } - builder.append(iterator.next()); - } - builder.append("]"); - return builder.toString(); - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 88999cb8f5a..5ea1456a051 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -17,150 +17,508 @@ */ package org.apache.arrow.vector.types; -import java.util.ArrayList; -import java.util.List; +import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.NullableBigIntVector; +import org.apache.arrow.vector.NullableBitVector; +import org.apache.arrow.vector.NullableDateVector; +import org.apache.arrow.vector.NullableDecimalVector; +import org.apache.arrow.vector.NullableFloat4Vector; +import org.apache.arrow.vector.NullableFloat8Vector; +import org.apache.arrow.vector.NullableIntVector; +import org.apache.arrow.vector.NullableIntervalDayVector; +import org.apache.arrow.vector.NullableIntervalYearVector; +import org.apache.arrow.vector.NullableSmallIntVector; +import org.apache.arrow.vector.NullableTimeStampVector; +import org.apache.arrow.vector.NullableTimeVector; +import org.apache.arrow.vector.NullableTinyIntVector; +import org.apache.arrow.vector.NullableUInt1Vector; +import org.apache.arrow.vector.NullableUInt2Vector; +import org.apache.arrow.vector.NullableUInt4Vector; +import org.apache.arrow.vector.NullableUInt8Vector; +import org.apache.arrow.vector.NullableVarBinaryVector; +import org.apache.arrow.vector.NullableVarCharVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.BigIntWriterImpl; +import org.apache.arrow.vector.complex.impl.BitWriterImpl; +import org.apache.arrow.vector.complex.impl.DateWriterImpl; +import org.apache.arrow.vector.complex.impl.Float4WriterImpl; +import org.apache.arrow.vector.complex.impl.Float8WriterImpl; +import org.apache.arrow.vector.complex.impl.IntWriterImpl; +import org.apache.arrow.vector.complex.impl.IntervalDayWriterImpl; +import org.apache.arrow.vector.complex.impl.IntervalYearWriterImpl; +import org.apache.arrow.vector.complex.impl.SingleMapWriter; +import org.apache.arrow.vector.complex.impl.SmallIntWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeStampWriterImpl; +import org.apache.arrow.vector.complex.impl.TimeWriterImpl; +import 
org.apache.arrow.vector.complex.impl.TinyIntWriterImpl; +import org.apache.arrow.vector.complex.impl.UInt1WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt2WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt4WriterImpl; +import org.apache.arrow.vector.complex.impl.UInt8WriterImpl; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionWriter; +import org.apache.arrow.vector.complex.impl.VarBinaryWriterImpl; +import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; +import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.Bool; +import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.IntervalDay; +import org.apache.arrow.vector.types.pojo.ArrowType.IntervalYear; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Time; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.CallBack; + +import java.util.HashMap; import java.util.Map; -import java.util.Objects; public class Types { + + public static final Field NULL_FIELD = new Field("", true, Null.INSTANCE, null); + public static final Field TINYINT_FIELD = new Field("", true, new Int(8, true), null); + public static final Field SMALLINT_FIELD = new Field("", true, new Int(16, true), null); + public static final Field INT_FIELD = new Field("", true, new Int(32, true), null); + public static final Field BIGINT_FIELD = new Field("", true, new Int(64, true), null); + public static final Field UINT1_FIELD = new Field("", true, new Int(8, false), null); + public static final Field UINT2_FIELD = new Field("", true, new Int(16, false), null); + public static final Field UINT4_FIELD = new Field("", true, new Int(32, false), null); + public static final Field UINT8_FIELD = new Field("", true, new Int(64, false), null); + public static final Field DATE_FIELD = new Field("", true, Date.INSTANCE, null); + public static final Field TIME_FIELD = new Field("", true, Time.INSTANCE, null); + public static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); + public static final Field INTERVALDAY_FIELD = new Field("", true, IntervalDay.INSTANCE, null); + public static final Field INTERVALYEAR_FIELD = new Field("", true, IntervalYear.INSTANCE, null); + public static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(0), null); + public static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(1), null); + public static final Field LIST_FIELD = new Field("", true, List.INSTANCE, null); + public static final Field VARCHAR_FIELD = new Field("", true, Utf8.INSTANCE, null); + public static final Field VARBINARY_FIELD = new Field("", true, Binary.INSTANCE, null); + public static final Field BIT_FIELD = new Field("", true, Bool.INSTANCE, null); + + public enum MinorType { - LATE, // late binding type - 
MAP, // an empty map column. Useful for conceptual setup. Children listed within here - - TINYINT, // single byte signed integer - SMALLINT, // two byte signed integer - INT, // four byte signed integer - BIGINT, // eight byte signed integer - DECIMAL9, // a decimal supporting precision between 1 and 9 - DECIMAL18, // a decimal supporting precision between 10 and 18 - DECIMAL28SPARSE, // a decimal supporting precision between 19 and 28 - DECIMAL38SPARSE, // a decimal supporting precision between 29 and 38 - MONEY, // signed decimal with two digit precision - DATE, // days since 4713bc - TIME, // time in micros before or after 2000/1/1 - TIMETZ, // time in micros before or after 2000/1/1 with timezone - TIMESTAMPTZ, // unix epoch time in millis - TIMESTAMP, // TBD - INTERVAL, // TBD - FLOAT4, // 4 byte ieee 754 - FLOAT8, // 8 byte ieee 754 - BIT, // single bit value (boolean) - FIXEDCHAR, // utf8 fixed length string, padded with spaces - FIXED16CHAR, - FIXEDBINARY, // fixed length binary, padded with 0 bytes - VARCHAR, // utf8 variable length string - VAR16CHAR, // utf16 variable length string - VARBINARY, // variable length binary - UINT1, // unsigned 1 byte integer - UINT2, // unsigned 2 byte integer - UINT4, // unsigned 4 byte integer - UINT8, // unsigned 8 byte integer - DECIMAL28DENSE, // dense decimal representation, supporting precision between 19 and 28 - DECIMAL38DENSE, // dense decimal representation, supporting precision between 28 and 38 - NULL, // a value of unknown type (e.g. a missing reference). - INTERVALYEAR, // Interval type specifying YEAR to MONTH - INTERVALDAY, // Interval type specifying DAY to SECONDS - LIST, - GENERIC_OBJECT, - UNION - } + NULL(Null.INSTANCE) { + @Override + public Field getField() { + return NULL_FIELD; + } - public enum DataMode { - REQUIRED, - OPTIONAL, - REPEATED - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return ZeroVector.INSTANCE; + } - public static class MajorType { - private MinorType minorType; - private DataMode mode; - private int precision; - private int scale; - private int timezone; - private int width; - private List subTypes; - - public MajorType(MinorType minorType, DataMode mode) { - this(minorType, mode, 0, 0, 0, 0, null); - } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return null; + } + }, + MAP(Tuple.INSTANCE) { + @Override + public Field getField() { + throw new UnsupportedOperationException("Cannot get simple field for Map type"); + } - public MajorType(MinorType minorType, DataMode mode, int precision, int scale) { - this(minorType, mode, precision, scale, 0, 0, null); - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new MapVector(name, allocator, callBack); + } - public MajorType(MinorType minorType, DataMode mode, int precision, int scale, int timezone, List subTypes) { - this(minorType, mode, precision, scale, timezone, 0, subTypes); - } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new SingleMapWriter((MapVector) vector); + } + }, // an empty map column. Useful for conceptual setup. 
Children listed within here - public MajorType(MinorType minorType, DataMode mode, int precision, int scale, int timezone, int width, List subTypes) { - this.minorType = minorType; - this.mode = mode; - this.precision = precision; - this.scale = scale; - this.timezone = timezone; - this.width = width; - this.subTypes = subTypes; - if (subTypes == null) { - this.subTypes = new ArrayList<>(); + TINYINT(new Int(8, true)) { + @Override + public Field getField() { + return TINYINT_FIELD; } - } - public MinorType getMinorType() { - return minorType; - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableTinyIntVector(name, allocator); + } - public DataMode getMode() { - return mode; - } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TinyIntWriterImpl((NullableTinyIntVector) vector); + } + }, // single byte signed integer + SMALLINT(new Int(16, true)) { + @Override + public Field getField() { + return SMALLINT_FIELD; + } - public int getPrecision() { - return precision; - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new SmallIntVector(name, allocator); + } - public int getScale() { - return scale; - } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new SmallIntWriterImpl((NullableSmallIntVector) vector); + } + }, // two byte signed integer + INT(new Int(32, true)) { + @Override + public Field getField() { + return INT_FIELD; + } - public int getTimezone() { - return timezone; - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableIntVector(name, allocator); + } - public List getSubTypes() { - return subTypes; - } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntWriterImpl((NullableIntVector) vector); + } + }, // four byte signed integer + BIGINT(new Int(64, true)) { + @Override + public Field getField() { + return BIGINT_FIELD; + } - public int getWidth() { - return width; - } + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableBigIntVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new BigIntWriterImpl((NullableBigIntVector) vector); + } + }, // eight byte signed integer + DATE(Date.INSTANCE) { + @Override + public Field getField() { + return DATE_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableDateVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new DateWriterImpl((NullableDateVector) vector); + } + }, // days since 4713bc + TIME(Time.INSTANCE) { + @Override + public Field getField() { + return TIME_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... 
precisionScale) { + return new NullableTimeVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeWriterImpl((NullableTimeVector) vector); + } + }, // time in micros before or after 2000/1/1 + TIMESTAMP(new Timestamp("")) { + @Override + public Field getField() { + return TIMESTAMP_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableTimeStampVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new TimeStampWriterImpl((NullableTimeStampVector) vector); + } + }, + INTERVALDAY(IntervalDay.INSTANCE) { + @Override + public Field getField() { + return INTERVALDAY_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableIntervalDayVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntervalDayWriterImpl((NullableIntervalDayVector) vector); + } + }, + INTERVALYEAR(IntervalYear.INSTANCE) { + @Override + public Field getField() { + return INTERVALYEAR_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableIntervalYearVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new IntervalYearWriterImpl((NullableIntervalYearVector) vector); + } + }, + FLOAT4(new FloatingPoint(0)) { + @Override + public Field getField() { + return FLOAT4_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableFloat4Vector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float4WriterImpl((NullableFloat4Vector) vector); + } + }, // 4 byte ieee 754 + FLOAT8(new FloatingPoint(1)) { + @Override + public Field getField() { + return FLOAT8_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableFloat8Vector(name, allocator); + } + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new Float8WriterImpl((NullableFloat8Vector) vector); + } + }, // 8 byte ieee 754 + BIT(Bool.INSTANCE) { + @Override + public Field getField() { + return BIT_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableBitVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new BitWriterImpl((NullableBitVector) vector); + } + }, // single bit value (boolean) + VARCHAR(Utf8.INSTANCE) { + @Override + public Field getField() { + return VARCHAR_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int...
precisionScale) { + return new NullableVarCharVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new VarCharWriterImpl((NullableVarCharVector) vector); + } + }, // utf8 variable length string + VARBINARY(Binary.INSTANCE) { + @Override + public Field getField() { + return VARBINARY_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableVarBinaryVector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); + } + }, // variable length binary + DECIMAL(null) { + @Override + public ArrowType getType() { + throw new UnsupportedOperationException("Cannot get simple type for Decimal type"); + } + @Override + public Field getField() { + throw new UnsupportedOperationException("Cannot get simple field for Decimal type"); + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableDecimalVector(name, allocator, precisionScale[0], precisionScale[1]); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); + } + }, // precision/scale decimal + UINT1(new Int(8, false)) { + @Override + public Field getField() { + return UINT1_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableUInt1Vector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt1WriterImpl((NullableUInt1Vector) vector); + } + }, // unsigned 1 byte integer + UINT2(new Int(16, false)) { + @Override + public Field getField() { + return UINT2_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableUInt2Vector(name, allocator); + } - @Override - public boolean equals(Object other) { - if (other == null) { - return false; + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt2WriterImpl((NullableUInt2Vector) vector); } - if (!(other instanceof MajorType)) { - return false; + }, // unsigned 2 byte integer + UINT4(new Int(32, false)) { + @Override + public Field getField() { + return UINT4_FIELD; } - MajorType that = (MajorType) other; - return this.minorType == that.minorType && - this.mode == that.mode && - this.precision == that.precision && - this.scale == that.scale && - this.timezone == that.timezone && - this.width == that.width && - Objects.equals(this.subTypes, that.subTypes); + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableUInt4Vector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt4WriterImpl((NullableUInt4Vector) vector); + } + }, // unsigned 4 byte integer + UINT8(new Int(64, false)) { + @Override + public Field getField() { + return UINT8_FIELD; + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int...
precisionScale) { + return new NullableUInt8Vector(name, allocator); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UInt8WriterImpl((NullableUInt8Vector) vector); + } + }, // unsigned 8 byte integer + LIST(List.INSTANCE) { + @Override + public Field getField() { + throw new UnsupportedOperationException("Cannot get simple field for List type"); + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new ListVector(name, allocator, callBack); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionListWriter((ListVector) vector); + } + }, + UNION(Union.INSTANCE) { + @Override + public Field getField() { + throw new UnsupportedOperationException("Cannot get simple field for Union type"); + } + + @Override + public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new UnionVector(name, allocator, callBack); + } + + @Override + public FieldWriter getNewFieldWriter(ValueVector vector) { + return new UnionWriter((UnionVector) vector); + } + }; + + private final ArrowType type; + + MinorType(ArrowType type) { + this.type = type; } - } + public ArrowType getType() { + return type; + } + + public abstract Field getField(); - public static MajorType required(MinorType minorType) { - return new MajorType(minorType, DataMode.REQUIRED); + public abstract ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale); + + public abstract FieldWriter getNewFieldWriter(ValueVector vector); } - public static MajorType optional(MinorType minorType) { - return new MajorType(minorType, DataMode.OPTIONAL); + + private static final Map ARROW_TYPE_MINOR_TYPE_MAP; + + public static MinorType getMinorTypeForArrowType(ArrowType arrowType) { + if (arrowType.getTypeType() == Type.Decimal) { + return MinorType.DECIMAL; + } + return ARROW_TYPE_MINOR_TYPE_MAP.get(arrowType); } - public static MajorType repeated(MinorType minorType) { - return new MajorType(minorType, DataMode.REPEATED); + + static { + ARROW_TYPE_MINOR_TYPE_MAP = new HashMap<>(); + for (MinorType minorType : MinorType.values()) { + if (minorType != MinorType.DECIMAL) { + ARROW_TYPE_MINOR_TYPE_MAP.put(minorType.getType(), minorType); + } + } } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java new file mode 100644 index 00000000000..49d0503e470 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -0,0 +1,105 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
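Taken together, the Types.java hunk above replaces the MajorType/DataMode pair with a MinorType enum that carries its ArrowType and acts as a factory for vectors and writers. A sketch of the intended call pattern (the RootAllocator construction is an assumption about the memory module, not part of this patch):

    // Sketch only: create a vector and its writer through the new factories.
    BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); // assumed allocator impl
    ValueVector vector = MinorType.INT.getNewVector("c", allocator, null);
    FieldWriter writer = MinorType.INT.getNewFieldWriter(vector);
    // Reverse mapping, relying on ArrowType equality (DECIMAL is special-cased):
    MinorType t = Types.getMinorTypeForArrowType(new Int(32, true)); // MinorType.INT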
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.types.pojo; + + +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; + +import java.util.List; +import java.util.Objects; + +import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; + +public class Field { + private final String name; + private final boolean nullable; + private final ArrowType type; + private final List children; + + public Field(String name, boolean nullable, ArrowType type, List children) { + this.name = name; + this.nullable = nullable; + this.type = type; + if (children == null) { + this.children = ImmutableList.of(); + } else { + this.children = children; + } + } + + public static Field convertField(org.apache.arrow.flatbuf.Field field) { + String name = field.name(); + boolean nullable = field.nullable(); + ArrowType type = getTypeForField(field); + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + for (int i = 0; i < field.childrenLength(); i++) { + childrenBuilder.add(convertField(field.children(i))); + } + List children = childrenBuilder.build(); + return new Field(name, nullable, type, children); + } + + public int getField(FlatBufferBuilder builder) { + int nameOffset = builder.createString(name); + int typeOffset = type.getType(builder); + int[] childrenData = new int[children.size()]; + for (int i = 0; i < children.size(); i++) { + childrenData[i] = children.get(i).getField(builder); + } + int childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, childrenData); + org.apache.arrow.flatbuf.Field.startField(builder); + org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); + org.apache.arrow.flatbuf.Field.addNullable(builder, nullable); + org.apache.arrow.flatbuf.Field.addTypeType(builder, type.getTypeType()); + org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); + org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); + return org.apache.arrow.flatbuf.Field.endField(builder); + } + + public String getName() { + return name; + } + + public boolean isNullable() { + return nullable; + } + + public ArrowType getType() { + return type; + } + + public List getChildren() { + return children; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Field)) { + return false; + } + Field that = (Field) obj; + return Objects.equals(this.name, that.name) && + Objects.equals(this.nullable, that.nullable) && + Objects.equals(this.type, that.type) && + (Objects.equals(this.children, that.children) || + (this.children == null && that.children.size() == 0) || + (this.children.size() == 0 && that.children == null)); + + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java new file mode 100644 index 00000000000..9e2894170b2 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
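Field.getField(FlatBufferBuilder) above writes the POJO into the generated org.apache.arrow.flatbuf representation, and convertField is its inverse; the Schema class added below wraps a list of fields the same way. A rough round trip, assuming the generated flatbuffer bindings follow the usual getRootAs* convention:

    // Sketch only: serialize a Schema to flatbuffers and read it back.
    Schema schema = new Schema(java.util.Arrays.asList(
        new Field("a", true, new ArrowType.Int(32, true), null)));
    FlatBufferBuilder builder = new FlatBufferBuilder();
    builder.finish(schema.getSchema(builder));
    org.apache.arrow.flatbuf.Schema flat =
        org.apache.arrow.flatbuf.Schema.getRootAsSchema(builder.dataBuffer());
    Schema roundTripped = Schema.convertSchema(flat); // equals(schema) per the POJO equals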
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0 + *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.types.pojo; + + +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.Objects; + +import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; +import static org.apache.arrow.vector.types.pojo.Field.convertField; + +public class Schema { + private List fields; + + public Schema(List fields) { + this.fields = ImmutableList.copyOf(fields); + } + + public int getSchema(FlatBufferBuilder builder) { + int[] fieldOffsets = new int[fields.size()]; + for (int i = 0; i < fields.size(); i++) { + fieldOffsets[i] = fields.get(i).getField(builder); + } + int fieldsOffset = org.apache.arrow.flatbuf.Schema.createFieldsVector(builder, fieldOffsets); + org.apache.arrow.flatbuf.Schema.startSchema(builder); + org.apache.arrow.flatbuf.Schema.addFields(builder, fieldsOffset); + return org.apache.arrow.flatbuf.Schema.endSchema(builder); + } + + public List getFields() { + return fields; + } + + @Override + public int hashCode() { + return Objects.hashCode(fields); + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof Schema)) { + return false; + } + return Objects.equals(this.fields, ((Schema) obj).fields); + } + + public static Schema convertSchema(org.apache.arrow.flatbuf.Schema schema) { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + for (int i = 0; i < schema.fieldsLength(); i++) { + childrenBuilder.add(convertField(schema.fields(i))); + } + List fields = childrenBuilder.build(); + return new Schema(fields); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java index b6dd13a06a8..68b9fb25f21 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/ByteFunctionHelpers.java @@ -180,54 +180,4 @@ private static final int memcmp(final long laddr, int lStart, int lEnd, final by return lLen > rLen ? 1 : -1; } - /* - * Following are helper functions to interact with sparse decimal represented in a byte array. 
- */ - - // Get the integer ignore the sign - public static int getInteger(byte[] b, int index) { - return getInteger(b, index, true); - } - // Get the integer, ignore the sign - public static int getInteger(byte[] b, int index, boolean ignoreSign) { - int startIndex = index * DecimalUtility.INTEGER_SIZE; - - if (index == 0 && ignoreSign == true) { - return (b[startIndex + 3] & 0xFF) | - (b[startIndex + 2] & 0xFF) << 8 | - (b[startIndex + 1] & 0xFF) << 16 | - (b[startIndex] & 0x7F) << 24; - } - - return ((b[startIndex + 3] & 0xFF) | - (b[startIndex + 2] & 0xFF) << 8 | - (b[startIndex + 1] & 0xFF) << 16 | - (b[startIndex] & 0xFF) << 24); - - } - - // Set integer in the byte array - public static void setInteger(byte[] b, int index, int value) { - int startIndex = index * DecimalUtility.INTEGER_SIZE; - b[startIndex] = (byte) ((value >> 24) & 0xFF); - b[startIndex + 1] = (byte) ((value >> 16) & 0xFF); - b[startIndex + 2] = (byte) ((value >> 8) & 0xFF); - b[startIndex + 3] = (byte) ((value) & 0xFF); - } - - // Set the sign in a sparse decimal representation - public static void setSign(byte[] b, boolean sign) { - int value = getInteger(b, 0); - if (sign == true) { - setInteger(b, 0, value | 0x80000000); - } else { - setInteger(b, 0, value & 0x7FFFFFFF); - } - } - - // Get the sign - public static boolean getSign(byte[] b) { - return ((getInteger(b, 0, false) & 0x80000000) != 0); - } - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java deleted file mode 100644 index 1eb2c13cd4c..00000000000 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/CoreDecimalUtility.java +++ /dev/null @@ -1,91 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
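The getInteger/setInteger/setSign helpers removed above encode the sparse layout that DecimalUtility below still decodes: base-1-billion digits packed one per 4-byte big-endian word, with the sign carried in the top bit of word 0. A worked example of that layout (values illustrative):

    // Sketch only: -12,345,678,901 as two sparse words.
    byte[] b = new byte[8];                             // two 4-byte digits
    ByteFunctionHelpers.setInteger(b, 0, 12);           // high base-1e9 digit
    ByteFunctionHelpers.setInteger(b, 1, 345678901);    // low digit
    ByteFunctionHelpers.setSign(b, true);               // sets bit 31 of word 0
    boolean negative = ByteFunctionHelpers.getSign(b);  // true
    int high = ByteFunctionHelpers.getInteger(b, 0);    // 12, sign masked off on read
    // value = -(12 * 1_000_000_000 + 345_678_901) = -12_345_678_901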
- */ -package org.apache.arrow.vector.util; - -import java.math.BigDecimal; - -import org.apache.arrow.vector.types.Types; - -public class CoreDecimalUtility { - static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(CoreDecimalUtility.class); - - public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { - // Truncate or pad to set the input to the correct scale - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - - return (input.unscaledValue().longValue()); - } - - public static int getMaxPrecision(Types.MinorType decimalType) { - if (decimalType == Types.MinorType.DECIMAL9) { - return 9; - } else if (decimalType == Types.MinorType.DECIMAL18) { - return 18; - } else if (decimalType == Types.MinorType.DECIMAL28SPARSE) { - return 28; - } else if (decimalType == Types.MinorType.DECIMAL38SPARSE) { - return 38; - } - return 0; - } - - /* - * Function returns the Minor decimal type given the precision - */ - public static Types.MinorType getDecimalDataType(int precision) { - if (precision <= 9) { - return Types.MinorType.DECIMAL9; - } else if (precision <= 18) { - return Types.MinorType.DECIMAL18; - } else if (precision <= 28) { - return Types.MinorType.DECIMAL28SPARSE; - } else { - return Types.MinorType.DECIMAL38SPARSE; - } - } - - /* - * Given a precision it provides the max precision of that decimal data type; - * For eg: given the precision 12, we would use DECIMAL18 to store the data - * which has a max precision range of 18 digits - */ - public static int getPrecisionRange(int precision) { - return getMaxPrecision(getDecimalDataType(precision)); - } - public static int getDecimal9FromBigDecimal(BigDecimal input, int scale, int precision) { - // Truncate/ or pad to set the input to the correct scale - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - - return (input.unscaledValue().intValue()); - } - - /* - * Helper function to detect if the given data type is Decimal - */ - public static boolean isDecimalType(Types.MajorType type) { - return isDecimalType(type.getMinorType()); - } - - public static boolean isDecimalType(Types.MinorType minorType) { - if (minorType == Types.MinorType.DECIMAL9 || minorType == Types.MinorType.DECIMAL18 || - minorType == Types.MinorType.DECIMAL28SPARSE || minorType == Types.MinorType.DECIMAL38SPARSE) { - return true; - } - return false; - } -} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index a3763cd34f1..4eb0d9f2216 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -26,140 +26,139 @@ import java.nio.ByteBuffer; import java.util.Arrays; -import org.apache.arrow.vector.holders.Decimal38SparseHolder; - -public class DecimalUtility extends CoreDecimalUtility{ - - public final static int MAX_DIGITS = 9; - public final static int DIGITS_BASE = 1000000000; - public final static int DIGITS_MAX = 999999999; - public final static int INTEGER_SIZE = (Integer.SIZE/8); - - public final static String[] decimalToString = {"", - "0", - "00", - "000", - "0000", - "00000", - "000000", - "0000000", - "00000000", - "000000000"}; - - public final static long[] scale_long_constants = { - 1, - 10, - 100, - 1000, - 10000, - 100000, - 1000000, - 10000000, - 100000000, - 1000000000, - 10000000000l, - 100000000000l, - 1000000000000l, - 10000000000000l, - 100000000000000l, - 
1000000000000000l, - 10000000000000000l, - 100000000000000000l, - 1000000000000000000l}; - - /* - * Simple function that returns the static precomputed - * power of ten, instead of using Math.pow - */ - public static long getPowerOfTen(int power) { - assert power >= 0 && power < scale_long_constants.length; - return scale_long_constants[(power)]; - } - - /* - * Math.pow returns a double and while multiplying with large digits - * in the decimal data type we encounter noise. So instead of multiplying - * with Math.pow we use the static constants to perform the multiplication - */ - public static long adjustScaleMultiply(long input, int factor) { - int index = Math.abs(factor); - assert index >= 0 && index < scale_long_constants.length; - if (factor >= 0) { - return input * scale_long_constants[index]; - } else { - return input / scale_long_constants[index]; - } - } - public static long adjustScaleDivide(long input, int factor) { - int index = Math.abs(factor); - assert index >= 0 && index < scale_long_constants.length; - if (factor >= 0) { - return input / scale_long_constants[index]; - } else { - return input * scale_long_constants[index]; - } +public class DecimalUtility { + + public final static int MAX_DIGITS = 9; + public final static int DIGITS_BASE = 1000000000; + public final static int DIGITS_MAX = 999999999; + public final static int INTEGER_SIZE = (Integer.SIZE/8); + + public final static String[] decimalToString = {"", + "0", + "00", + "000", + "0000", + "00000", + "000000", + "0000000", + "00000000", + "000000000"}; + + public final static long[] scale_long_constants = { + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000, + 10000000000l, + 100000000000l, + 1000000000000l, + 10000000000000l, + 100000000000000l, + 1000000000000000l, + 10000000000000000l, + 100000000000000000l, + 1000000000000000000l}; + + /* + * Simple function that returns the static precomputed + * power of ten, instead of using Math.pow + */ + public static long getPowerOfTen(int power) { + assert power >= 0 && power < scale_long_constants.length; + return scale_long_constants[(power)]; + } + + /* + * Math.pow returns a double and while multiplying with large digits + * in the decimal data type we encounter noise. 
So instead of multiplying + * with Math.pow we use the static constants to perform the multiplication + */ + public static long adjustScaleMultiply(long input, int factor) { + int index = Math.abs(factor); + assert index >= 0 && index < scale_long_constants.length; + if (factor >= 0) { + return input * scale_long_constants[index]; + } else { + return input / scale_long_constants[index]; } + } - /* Given the number of actual digits this function returns the - * number of indexes it will occupy in the array of integers - * which are stored in base 1 billion - */ - public static int roundUp(int ndigits) { - return (ndigits + MAX_DIGITS - 1)/MAX_DIGITS; + public static long adjustScaleDivide(long input, int factor) { + int index = Math.abs(factor); + assert index >= 0 && index < scale_long_constants.length; + if (factor >= 0) { + return input / scale_long_constants[index]; + } else { + return input * scale_long_constants[index]; } + } - /* Returns a string representation of the given integer - * If the length of the given integer is less than the - * passed length, this function will prepend zeroes to the string - */ - public static StringBuilder toStringWithZeroes(int number, int desiredLength) { - String value = ((Integer) number).toString(); - int length = value.length(); + /* Given the number of actual digits this function returns the + * number of indexes it will occupy in the array of integers + * which are stored in base 1 billion + */ + public static int roundUp(int ndigits) { + return (ndigits + MAX_DIGITS - 1)/MAX_DIGITS; + } - StringBuilder str = new StringBuilder(); - str.append(decimalToString[desiredLength - length]); - str.append(value); + /* Returns a string representation of the given integer + * If the length of the given integer is less than the + * passed length, this function will prepend zeroes to the string + */ + public static StringBuilder toStringWithZeroes(int number, int desiredLength) { + String value = ((Integer) number).toString(); + int length = value.length(); - return str; - } + StringBuilder str = new StringBuilder(); + str.append(decimalToString[desiredLength - length]); + str.append(value); - public static StringBuilder toStringWithZeroes(long number, int desiredLength) { - String value = ((Long) number).toString(); - int length = value.length(); + return str; + } - StringBuilder str = new StringBuilder(); + public static StringBuilder toStringWithZeroes(long number, int desiredLength) { + String value = ((Long) number).toString(); + int length = value.length(); - // Desired length can be > MAX_DIGITS - int zeroesLength = desiredLength - length; - while (zeroesLength > MAX_DIGITS) { - str.append(decimalToString[MAX_DIGITS]); - zeroesLength -= MAX_DIGITS; - } - str.append(decimalToString[zeroesLength]); - str.append(value); + StringBuilder str = new StringBuilder(); - return str; + // Desired length can be > MAX_DIGITS + int zeroesLength = desiredLength - length; + while (zeroesLength > MAX_DIGITS) { + str.append(decimalToString[MAX_DIGITS]); + zeroesLength -= MAX_DIGITS; } + str.append(decimalToString[zeroesLength]); + str.append(value); + + return str; + } public static BigDecimal getBigDecimalFromIntermediate(ByteBuf data, int startIndex, int nDecimalDigits, int scale) { - // In the intermediate representation we don't pad the scale with zeroes, so set truncate = false - return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, false); - } + // In the intermediate representation we don't pad the scale with zeroes, so set truncate = false + 
return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, false); + } - public static BigDecimal getBigDecimalFromSparse(ArrowBuf data, int startIndex, int nDecimalDigits, int scale) { + public static BigDecimal getBigDecimalFromSparse(ArrowBuf data, int startIndex, int nDecimalDigits, int scale) { - // In the sparse representation we pad the scale with zeroes for ease of arithmetic, need to truncate - return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, true); - } + // In the sparse representation we pad the scale with zeroes for ease of arithmetic, need to truncate + return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, true); + } - public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int start, int length, int scale) { - byte[] value = new byte[length]; - bytebuf.getBytes(start, value, 0, length); - BigInteger unscaledValue = new BigInteger(value); - return new BigDecimal(unscaledValue, scale); - } + public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int start, int length, int scale) { + byte[] value = new byte[length]; + bytebuf.getBytes(start, value, 0, length); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int start, int length, int scale) { byte[] value = new byte[length]; @@ -168,115 +167,123 @@ public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int sta return new BigDecimal(unscaledValue, scale); } - /* Create a BigDecimal object using the data in the ArrowBuf. - * This function assumes that data is provided in a non-dense format - * It works on both sparse and intermediate representations. - */ + public static void writeBigDecimalToArrowBuf(ArrowBuf bytebuf, int startIndex, BigDecimal value) { + byte[] bytes = value.unscaledValue().toByteArray(); + if (bytes.length > 16) { + throw new UnsupportedOperationException("Decimal size greater than 16 bytes"); + } + bytebuf.setBytes(startIndex + 16 - bytes.length, bytes, 0, bytes.length); + } + + /* Create a BigDecimal object using the data in the ArrowBuf. + * This function assumes that data is provided in a non-dense format + * It works on both sparse and intermediate representations. + */ public static BigDecimal getBigDecimalFromArrowBuf(ByteBuf data, int startIndex, int nDecimalDigits, int scale, - boolean truncateScale) { + boolean truncateScale) { - // For sparse decimal type we have padded zeroes at the end, strip them while converting to BigDecimal. - int actualDigits; + // For sparse decimal type we have padded zeroes at the end, strip them while converting to BigDecimal. 
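// A sketch of the 16-byte round trip performed by writeBigDecimalToArrowBuf()
// and getBigDecimalFromArrowBuf(ArrowBuf, start, length, scale) above, with a
// plain byte[] standing in for the ArrowBuf: the unscaled value is written
// big-endian, right-aligned in its slot. Note the zero fill is only correct
// for non-negative values; sign-extending negative values is what the later
// "ARROW-265: Pad negative decimal values" patch in this series addresses.
import java.math.BigDecimal;
import java.math.BigInteger;

class DecimalSlotSketch {
  public static void main(String[] args) {
    BigDecimal in = new BigDecimal("123456.789");
    byte[] slot = new byte[16];
    byte[] bytes = in.unscaledValue().toByteArray(); // big-endian two's complement
    System.arraycopy(bytes, 0, slot, 16 - bytes.length, bytes.length);
    BigDecimal out = new BigDecimal(new BigInteger(slot), in.scale());
    System.out.println(out); // 123456.789
  }
}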
+ int actualDigits; - // Initialize the BigDecimal, first digit in the ArrowBuf has the sign so mask it out - BigInteger decimalDigits = BigInteger.valueOf((data.getInt(startIndex)) & 0x7FFFFFFF); + // Initialize the BigDecimal, first digit in the ArrowBuf has the sign so mask it out + BigInteger decimalDigits = BigInteger.valueOf((data.getInt(startIndex)) & 0x7FFFFFFF); - BigInteger base = BigInteger.valueOf(DIGITS_BASE); + BigInteger base = BigInteger.valueOf(DIGITS_BASE); - for (int i = 1; i < nDecimalDigits; i++) { + for (int i = 1; i < nDecimalDigits; i++) { - BigInteger temp = BigInteger.valueOf(data.getInt(startIndex + (i * INTEGER_SIZE))); - decimalDigits = decimalDigits.multiply(base); - decimalDigits = decimalDigits.add(temp); - } + BigInteger temp = BigInteger.valueOf(data.getInt(startIndex + (i * INTEGER_SIZE))); + decimalDigits = decimalDigits.multiply(base); + decimalDigits = decimalDigits.add(temp); + } - // Truncate any additional padding we might have added - if (truncateScale == true && scale > 0 && (actualDigits = scale % MAX_DIGITS) != 0) { - BigInteger truncate = BigInteger.valueOf((int)Math.pow(10, (MAX_DIGITS - actualDigits))); - decimalDigits = decimalDigits.divide(truncate); - } + // Truncate any additional padding we might have added + if (truncateScale == true && scale > 0 && (actualDigits = scale % MAX_DIGITS) != 0) { + BigInteger truncate = BigInteger.valueOf((int)Math.pow(10, (MAX_DIGITS - actualDigits))); + decimalDigits = decimalDigits.divide(truncate); + } - // set the sign - if ((data.getInt(startIndex) & 0x80000000) != 0) { - decimalDigits = decimalDigits.negate(); - } + // set the sign + if ((data.getInt(startIndex) & 0x80000000) != 0) { + decimalDigits = decimalDigits.negate(); + } - BigDecimal decimal = new BigDecimal(decimalDigits, scale); + BigDecimal decimal = new BigDecimal(decimalDigits, scale); - return decimal; - } + return decimal; + } - /* This function returns a BigDecimal object from the dense decimal representation. - * First step is to convert the dense representation into an intermediate representation - * and then invoke getBigDecimalFromArrowBuf() to get the BigDecimal object - */ - public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, int nDecimalDigits, int scale, int maxPrecision, int width) { + /* This function returns a BigDecimal object from the dense decimal representation. + * First step is to convert the dense representation into an intermediate representation + * and then invoke getBigDecimalFromArrowBuf() to get the BigDecimal object + */ + public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, int nDecimalDigits, int scale, int maxPrecision, int width) { /* This method converts the dense representation to * an intermediate representation. The intermediate * representation has one more integer than the dense * representation. 
*/ - byte[] intermediateBytes = new byte[((nDecimalDigits + 1) * INTEGER_SIZE)]; - - // Start storing from the least significant byte of the first integer - int intermediateIndex = 3; - - int[] mask = {0x03, 0x0F, 0x3F, 0xFF}; - int[] reverseMask = {0xFC, 0xF0, 0xC0, 0x00}; - - int maskIndex; - int shiftOrder; - byte shiftBits; - - // TODO: Some of the logic here is common with casting from Dense to Sparse types, factor out common code - if (maxPrecision == 38) { - maskIndex = 0; - shiftOrder = 6; - shiftBits = 0x00; - intermediateBytes[intermediateIndex++] = (byte) (data.getByte(startIndex) & 0x7F); - } else if (maxPrecision == 28) { - maskIndex = 1; - shiftOrder = 4; - shiftBits = (byte) ((data.getByte(startIndex) & 0x03) << shiftOrder); - intermediateBytes[intermediateIndex++] = (byte) (((data.getByte(startIndex) & 0x3C) & 0xFF) >>> 2); - } else { - throw new UnsupportedOperationException("Dense types with max precision 38 and 28 are only supported"); - } + byte[] intermediateBytes = new byte[((nDecimalDigits + 1) * INTEGER_SIZE)]; + + // Start storing from the least significant byte of the first integer + int intermediateIndex = 3; + + int[] mask = {0x03, 0x0F, 0x3F, 0xFF}; + int[] reverseMask = {0xFC, 0xF0, 0xC0, 0x00}; + + int maskIndex; + int shiftOrder; + byte shiftBits; + + // TODO: Some of the logic here is common with casting from Dense to Sparse types, factor out common code + if (maxPrecision == 38) { + maskIndex = 0; + shiftOrder = 6; + shiftBits = 0x00; + intermediateBytes[intermediateIndex++] = (byte) (data.getByte(startIndex) & 0x7F); + } else if (maxPrecision == 28) { + maskIndex = 1; + shiftOrder = 4; + shiftBits = (byte) ((data.getByte(startIndex) & 0x03) << shiftOrder); + intermediateBytes[intermediateIndex++] = (byte) (((data.getByte(startIndex) & 0x3C) & 0xFF) >>> 2); + } else { + throw new UnsupportedOperationException("Dense types with max precision 38 and 28 are only supported"); + } - int inputIndex = 1; - boolean sign = false; + int inputIndex = 1; + boolean sign = false; - if ((data.getByte(startIndex) & 0x80) != 0) { - sign = true; - } + if ((data.getByte(startIndex) & 0x80) != 0) { + sign = true; + } - while (inputIndex < width) { + while (inputIndex < width) { - intermediateBytes[intermediateIndex] = (byte) ((shiftBits) | (((data.getByte(startIndex + inputIndex) & reverseMask[maskIndex]) & 0xFF) >>> (8 - shiftOrder))); + intermediateBytes[intermediateIndex] = (byte) ((shiftBits) | (((data.getByte(startIndex + inputIndex) & reverseMask[maskIndex]) & 0xFF) >>> (8 - shiftOrder))); - shiftBits = (byte) ((data.getByte(startIndex + inputIndex) & mask[maskIndex]) << shiftOrder); + shiftBits = (byte) ((data.getByte(startIndex + inputIndex) & mask[maskIndex]) << shiftOrder); - inputIndex++; - intermediateIndex++; + inputIndex++; + intermediateIndex++; - if (((inputIndex - 1) % INTEGER_SIZE) == 0) { - shiftBits = (byte) ((shiftBits & 0xFF) >>> 2); - maskIndex++; - shiftOrder -= 2; - } + if (((inputIndex - 1) % INTEGER_SIZE) == 0) { + shiftBits = (byte) ((shiftBits & 0xFF) >>> 2); + maskIndex++; + shiftOrder -= 2; + } - } + } /* copy the last byte */ - intermediateBytes[intermediateIndex] = shiftBits; + intermediateBytes[intermediateIndex] = shiftBits; - if (sign == true) { - intermediateBytes[0] = (byte) (intermediateBytes[0] | 0x80); - } + if (sign == true) { + intermediateBytes[0] = (byte) (intermediateBytes[0] | 0x80); + } final ByteBuf intermediate = UnpooledByteBufAllocator.DEFAULT.buffer(intermediateBytes.length); try { - intermediate.setBytes(0, 
intermediateBytes); + intermediate.setBytes(0, intermediateBytes); BigDecimal ret = getBigDecimalFromIntermediate(intermediate, 0, nDecimalDigits + 1, scale); return ret; @@ -284,299 +291,296 @@ public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, i intermediate.release(); } - } + } - /* - * Function converts the BigDecimal and stores it in out internal sparse representation - */ - public static void getSparseFromBigDecimal(BigDecimal input, ByteBuf data, int startIndex, int scale, int precision, - int nDecimalDigits) { + public static void getSparseFromBigDecimal(BigDecimal input, ByteBuf data, int startIndex, int scale, int precision, + int nDecimalDigits) { - // Initialize the buffer - for (int i = 0; i < nDecimalDigits; i++) { - data.setInt(startIndex + (i * INTEGER_SIZE), 0); - } + // Initialize the buffer + for (int i = 0; i < nDecimalDigits; i++) { + data.setInt(startIndex + (i * INTEGER_SIZE), 0); + } - boolean sign = false; + boolean sign = false; - if (input.signum() == -1) { - // negative input - sign = true; - input = input.abs(); - } + if (input.signum() == -1) { + // negative input + sign = true; + input = input.abs(); + } - // Truncate the input as per the scale provided - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); + // Truncate the input as per the scale provided + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - // Separate out the integer part - BigDecimal integerPart = input.setScale(0, BigDecimal.ROUND_DOWN); + // Separate out the integer part + BigDecimal integerPart = input.setScale(0, BigDecimal.ROUND_DOWN); - int destIndex = nDecimalDigits - roundUp(scale) - 1; + int destIndex = nDecimalDigits - roundUp(scale) - 1; - // we use base 1 billion integer digits for out integernal representation - BigDecimal base = new BigDecimal(DIGITS_BASE); + // we use base 1 billion integer digits for out integernal representation + BigDecimal base = new BigDecimal(DIGITS_BASE); - while (integerPart.compareTo(BigDecimal.ZERO) == 1) { - // store the modulo as the integer value - data.setInt(startIndex + (destIndex * INTEGER_SIZE), (integerPart.remainder(base)).intValue()); - destIndex--; - // Divide by base 1 billion - integerPart = (integerPart.divide(base)).setScale(0, BigDecimal.ROUND_DOWN); - } + while (integerPart.compareTo(BigDecimal.ZERO) == 1) { + // store the modulo as the integer value + data.setInt(startIndex + (destIndex * INTEGER_SIZE), (integerPart.remainder(base)).intValue()); + destIndex--; + // Divide by base 1 billion + integerPart = (integerPart.divide(base)).setScale(0, BigDecimal.ROUND_DOWN); + } /* Sparse representation contains padding of additional zeroes * so each digit contains MAX_DIGITS for ease of arithmetic */ - int actualDigits; - if ((actualDigits = (scale % MAX_DIGITS)) != 0) { - // Pad additional zeroes - scale = scale + (MAX_DIGITS - actualDigits); - input = input.setScale(scale, BigDecimal.ROUND_DOWN); - } - - //separate out the fractional part - BigDecimal fractionalPart = input.remainder(BigDecimal.ONE).movePointRight(scale); + int actualDigits; + if ((actualDigits = (scale % MAX_DIGITS)) != 0) { + // Pad additional zeroes + scale = scale + (MAX_DIGITS - actualDigits); + input = input.setScale(scale, BigDecimal.ROUND_DOWN); + } - destIndex = nDecimalDigits - 1; + //separate out the fractional part + BigDecimal fractionalPart = input.remainder(BigDecimal.ONE).movePointRight(scale); - while (scale > 0) { - // Get next set of MAX_DIGITS (9) store it in the ArrowBuf - fractionalPart = 
fractionalPart.movePointLeft(MAX_DIGITS); - BigDecimal temp = fractionalPart.remainder(BigDecimal.ONE); + destIndex = nDecimalDigits - 1; - data.setInt(startIndex + (destIndex * INTEGER_SIZE), (temp.unscaledValue().intValue())); - destIndex--; + while (scale > 0) { + // Get next set of MAX_DIGITS (9) store it in the ArrowBuf + fractionalPart = fractionalPart.movePointLeft(MAX_DIGITS); + BigDecimal temp = fractionalPart.remainder(BigDecimal.ONE); - fractionalPart = fractionalPart.setScale(0, BigDecimal.ROUND_DOWN); - scale -= MAX_DIGITS; - } + data.setInt(startIndex + (destIndex * INTEGER_SIZE), (temp.unscaledValue().intValue())); + destIndex--; - // Set the negative sign - if (sign == true) { - data.setInt(startIndex, data.getInt(startIndex) | 0x80000000); - } + fractionalPart = fractionalPart.setScale(0, BigDecimal.ROUND_DOWN); + scale -= MAX_DIGITS; + } + // Set the negative sign + if (sign == true) { + data.setInt(startIndex, data.getInt(startIndex) | 0x80000000); } + } - public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { - // Truncate or pad to set the input to the correct scale - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - return (input.unscaledValue().longValue()); - } + public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { + // Truncate or pad to set the input to the correct scale + input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - public static BigDecimal getBigDecimalFromPrimitiveTypes(int input, int scale, int precision) { - return BigDecimal.valueOf(input, scale); - } + return (input.unscaledValue().longValue()); + } - public static BigDecimal getBigDecimalFromPrimitiveTypes(long input, int scale, int precision) { - return BigDecimal.valueOf(input, scale); - } + public static BigDecimal getBigDecimalFromPrimitiveTypes(int input, int scale, int precision) { + return BigDecimal.valueOf(input, scale); + } + + public static BigDecimal getBigDecimalFromPrimitiveTypes(long input, int scale, int precision) { + return BigDecimal.valueOf(input, scale); + } - public static int compareDenseBytes(ArrowBuf left, int leftStart, boolean leftSign, ArrowBuf right, int rightStart, boolean rightSign, int width) { + public static int compareDenseBytes(ArrowBuf left, int leftStart, boolean leftSign, ArrowBuf right, int rightStart, boolean rightSign, int width) { - int invert = 1; + int invert = 1; /* If signs are different then simply look at the * sign of the two inputs and determine which is greater */ - if (leftSign != rightSign) { + if (leftSign != rightSign) { - return((leftSign == true) ? -1 : 1); - } else if(leftSign == true) { + return((leftSign == true) ? 
-1 : 1); + } else if(leftSign == true) { /* Both inputs are negative, at the end we will * have to invert the comparison */ - invert = -1; - } - - int cmp = 0; - - for (int i = 0; i < width; i++) { - byte leftByte = left.getByte(leftStart + i); - byte rightByte = right.getByte(rightStart + i); - // Unsigned byte comparison - if ((leftByte & 0xFF) > (rightByte & 0xFF)) { - cmp = 1; - break; - } else if ((leftByte & 0xFF) < (rightByte & 0xFF)) { - cmp = -1; - break; - } - } - cmp *= invert; // invert the comparison if both were negative values - - return cmp; + invert = -1; } - public static int getIntegerFromSparseBuffer(ArrowBuf buffer, int start, int index) { - int value = buffer.getInt(start + (index * 4)); + int cmp = 0; - if (index == 0) { - /* the first byte contains sign bit, return value without it */ - value = (value & 0x7FFFFFFF); + for (int i = 0; i < width; i++) { + byte leftByte = left.getByte(leftStart + i); + byte rightByte = right.getByte(rightStart + i); + // Unsigned byte comparison + if ((leftByte & 0xFF) > (rightByte & 0xFF)) { + cmp = 1; + break; + } else if ((leftByte & 0xFF) < (rightByte & 0xFF)) { + cmp = -1; + break; } - return value; } + cmp *= invert; // invert the comparison if both were negative values - public static void setInteger(ArrowBuf buffer, int start, int index, int value) { - buffer.setInt(start + (index * 4), value); + return cmp; + } + + public static int getIntegerFromSparseBuffer(ArrowBuf buffer, int start, int index) { + int value = buffer.getInt(start + (index * 4)); + + if (index == 0) { + /* the first byte contains sign bit, return value without it */ + value = (value & 0x7FFFFFFF); } + return value; + } - public static int compareSparseBytes(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits, boolean absCompare) { + public static void setInteger(ArrowBuf buffer, int start, int index, int value) { + buffer.setInt(start + (index * 4), value); + } - int invert = 1; + public static int compareSparseBytes(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits, boolean absCompare) { - if (absCompare == false) { - if (leftSign != rightSign) { - return (leftSign == true) ? -1 : 1; - } + int invert = 1; - // Both values are negative invert the outcome of the comparison - if (leftSign == true) { - invert = -1; - } + if (absCompare == false) { + if (leftSign != rightSign) { + return (leftSign == true) ? 
-1 : 1; } - int cmp = compareSparseBytesInner(left, leftStart, leftSign, leftScale, leftPrecision, right, rightStart, rightSign, rightPrecision, rightScale, width, nDecimalDigits); - return cmp * invert; + // Both values are negative invert the outcome of the comparison + if (leftSign == true) { + invert = -1; + } } - public static int compareSparseBytesInner(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits) { + + int cmp = compareSparseBytesInner(left, leftStart, leftSign, leftScale, leftPrecision, right, rightStart, rightSign, rightPrecision, rightScale, width, nDecimalDigits); + return cmp * invert; + } + public static int compareSparseBytesInner(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits) { /* compute the number of integer digits in each decimal */ - int leftInt = leftPrecision - leftScale; - int rightInt = rightPrecision - rightScale; + int leftInt = leftPrecision - leftScale; + int rightInt = rightPrecision - rightScale; /* compute the number of indexes required for storing integer digits */ - int leftIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftInt); - int rightIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightInt); + int leftIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftInt); + int rightIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightInt); /* compute number of indexes required for storing scale */ - int leftScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftScale); - int rightScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightScale); + int leftScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftScale); + int rightScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightScale); /* compute index of the most significant integer digits */ - int leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; - int rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; + int leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; + int rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; - int leftStopIndex = nDecimalDigits - leftScaleRoundedUp; - int rightStopIndex = nDecimalDigits - rightScaleRoundedUp; + int leftStopIndex = nDecimalDigits - leftScaleRoundedUp; + int rightStopIndex = nDecimalDigits - rightScaleRoundedUp; /* Discard the zeroes in the integer part */ - while (leftIndex1 < leftStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { - break; - } + while (leftIndex1 < leftStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { + break; + } /* Digit in this location is zero, decrement the actual number * of integer digits */ - leftIntRoundedUp--; - leftIndex1++; - } + leftIntRoundedUp--; + leftIndex1++; + } /* If we reached the stop index then the number of integers is zero */ - if (leftIndex1 == leftStopIndex) { - leftIntRoundedUp = 0; - } + if (leftIndex1 == leftStopIndex) { + leftIntRoundedUp = 0; + } - while (rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { - break; - } + while (rightIndex1 < rightStopIndex) { + if 
(getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { + break; + } /* Digit in this location is zero, decrement the actual number * of integer digits */ - rightIntRoundedUp--; - rightIndex1++; - } + rightIntRoundedUp--; + rightIndex1++; + } - if (rightIndex1 == rightStopIndex) { - rightIntRoundedUp = 0; - } + if (rightIndex1 == rightStopIndex) { + rightIntRoundedUp = 0; + } /* We have the accurate number of non-zero integer digits, * if the number of integer digits are different then we can determine * which decimal is larger and needn't go down to comparing individual values */ - if (leftIntRoundedUp > rightIntRoundedUp) { - return 1; - } - else if (rightIntRoundedUp > leftIntRoundedUp) { - return -1; - } + if (leftIntRoundedUp > rightIntRoundedUp) { + return 1; + } + else if (rightIntRoundedUp > leftIntRoundedUp) { + return -1; + } /* The number of integer digits are the same, set the each index * to the first non-zero integer and compare each digit */ - leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; - rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; + leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; + rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; - while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { - return 1; - } - else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { - return -1; - } - - leftIndex1++; - rightIndex1++; + while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { + return 1; + } + else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { + return -1; } + leftIndex1++; + rightIndex1++; + } + /* The integer part of both the decimal's are equal, now compare * each individual fractional part. 
Set the index to be at the * beginning of the fractional part */ - leftIndex1 = leftStopIndex; - rightIndex1 = rightStopIndex; + leftIndex1 = leftStopIndex; + rightIndex1 = rightStopIndex; /* Stop indexes will be the end of the array */ - leftStopIndex = nDecimalDigits; - rightStopIndex = nDecimalDigits; + leftStopIndex = nDecimalDigits; + rightStopIndex = nDecimalDigits; /* compare the two fractional parts of the decimal */ - while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { - return 1; - } - else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { - return -1; - } - - leftIndex1++; - rightIndex1++; + while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { + return 1; + } + else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { + return -1; } + leftIndex1++; + rightIndex1++; + } + /* Till now the fractional part of the decimals are equal, check * if one of the decimal has fractional part that is remaining * and is non-zero */ - while (leftIndex1 < leftStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { - return 1; - } - leftIndex1++; + while (leftIndex1 < leftStopIndex) { + if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { + return 1; } + leftIndex1++; + } - while(rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { - return -1; - } - rightIndex1++; + while(rightIndex1 < rightStopIndex) { + if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { + return -1; } + rightIndex1++; + } /* Both decimal values are equal */ - return 0; - } + return 0; + } - public static BigDecimal getBigDecimalFromByteArray(byte[] bytes, int start, int length, int scale) { - byte[] value = Arrays.copyOfRange(bytes, start, start + length); - BigInteger unscaledValue = new BigInteger(value); - return new BigDecimal(unscaledValue, scale); - } + public static BigDecimal getBigDecimalFromByteArray(byte[] bytes, int start, int length, int scale) { + byte[] value = Arrays.copyOfRange(bytes, start, start + length); + BigInteger unscaledValue = new BigInteger(value); + return new BigDecimal(unscaledValue, scale); + } public static void roundDecimal(ArrowBuf result, int start, int nDecimalDigits, int desiredScale, int currentScale) { int newScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(desiredScale); @@ -704,34 +708,6 @@ public static int getFirstFractionalDigit(ArrowBuf data, int scale, int start, i int index = nDecimalDigits - roundUp(scale); return (int) (adjustScaleDivide(data.getInt(start + (index * INTEGER_SIZE)), MAX_DIGITS - 1)); } - - public static int compareSparseSamePrecScale(ArrowBuf left, int lStart, byte[] right, int length) { - // check the sign first - boolean lSign = (left.getInt(lStart) & 0x80000000) != 0; - boolean rSign = ByteFunctionHelpers.getSign(right); - int cmp = 0; - - if (lSign != rSign) { - return (lSign == false) ? 1 : -1; - } - - // invert the comparison if we are comparing negative numbers - int invert = (lSign == true) ? 
-1 : 1; - - // compare byte by byte - int n = 0; - int lPos = lStart; - int rPos = 0; - while (n < length/4) { - int leftInt = Decimal38SparseHolder.getInteger(n, lStart, left); - int rightInt = ByteFunctionHelpers.getInteger(right, n); - if (leftInt != rightInt) { - cmp = (leftInt - rightInt ) > 0 ? 1 : -1; - break; - } - n++; - } - return cmp * invert; - } } + diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java index dea433e99e8..d7f9d382e48 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/MapWithOrdinal.java @@ -18,7 +18,9 @@ package org.apache.arrow.vector.util; import java.util.AbstractMap; +import java.util.ArrayList; import java.util.Collection; +import java.util.List; import java.util.Map; import java.util.Set; @@ -241,6 +243,16 @@ public Set keySet() { return delegate.keySet(); } + public List keyList() { + int size = size(); + Set keys = keySet(); + List children = new ArrayList<>(size); + for (K key : keys) { + children.add(getOrdinal(key), key); + } + return children; + } + @Override public Set> entrySet() { return delegate.entrySet(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java new file mode 100644 index 00000000000..7ab7db3117b --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.util.DecimalUtility; +import org.junit.Test; + +import java.math.BigDecimal; +import java.math.BigInteger; + +import static org.junit.Assert.assertEquals; + +public class TestDecimalVector { + + private static long[] intValues; + + static { + intValues = new long[30]; + for (int i = 0; i < intValues.length; i++) { + intValues[i] = 1 << i + 1; + } + } + private int scale = 3; + + @Test + public void test() { + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + NullableDecimalVector decimalVector = new NullableDecimalVector("decimal", allocator, 10, scale); + decimalVector.allocateNew(); + BigDecimal[] values = new BigDecimal[intValues.length]; + for (int i = 0; i < intValues.length; i++) { + BigDecimal decimal = new BigDecimal(BigInteger.valueOf(intValues[i]), scale); + values[i] = decimal; + decimalVector.getMutator().setIndexDefined(i); + DecimalUtility.writeBigDecimalToArrowBuf(decimalVector.getBuffer(), i * 16, decimal); + } + + decimalVector.getMutator().setValueCount(intValues.length); + + for (int i = 0; i < intValues.length; i++) { + BigDecimal value = decimalVector.getAccessor().getObject(i); + assertEquals(values[i], value); + } + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java index 4dee86c9d59..9baebc5a299 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestOversizedAllocationForValueVector.java @@ -20,8 +20,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.holders.UInt4Holder; -import org.apache.arrow.vector.types.MaterializedField; import org.apache.arrow.vector.util.OversizedAllocationException; import org.junit.After; import org.junit.Before; @@ -53,8 +51,7 @@ public void terminate() throws Exception { @Test(expected = OversizedAllocationException.class) public void testFixedVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final UInt4Vector vector = new UInt4Vector(field, allocator); + final UInt4Vector vector = new UInt4Vector(EMPTY_SCHEMA_PATH, allocator); // edge case 1: buffer size = max value capacity final int expectedValueCapacity = BaseValueVector.MAX_ALLOCATION_SIZE / 4; try { @@ -78,8 +75,7 @@ public void testFixedVectorReallocation() { @Test(expected = OversizedAllocationException.class) public void testBitVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final BitVector vector = new BitVector(field, allocator); + final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator); // edge case 1: buffer size ~ max value capacity final int expectedValueCapacity = 1 << 29; try { @@ -109,8 +105,7 @@ public 
void testBitVectorReallocation() { @Test(expected = OversizedAllocationException.class) public void testVariableVectorReallocation() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - final VarCharVector vector = new VarCharVector(field, allocator); + final VarCharVector vector = new VarCharVector(EMPTY_SCHEMA_PATH, allocator); // edge case 1: value count = MAX_VALUE_ALLOCATION final int expectedAllocationInBytes = BaseValueVector.MAX_ALLOCATION_SIZE; final int expectedOffsetSize = 10; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java index e4d28c3f88c..1bb50b73a90 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestUnionVector.java @@ -22,8 +22,6 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.holders.NullableUInt4Holder; -import org.apache.arrow.vector.holders.UInt4Holder; -import org.apache.arrow.vector.types.MaterializedField; import org.apache.arrow.vector.types.Types; import org.junit.After; import org.junit.Before; @@ -46,13 +44,12 @@ public void terminate() throws Exception { @Test public void testUnionVector() throws Exception { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); final NullableUInt4Holder uInt4Holder = new NullableUInt4Holder(); uInt4Holder.value = 100; uInt4Holder.isSet = 1; - try (UnionVector unionVector = new UnionVector(field, allocator, null)) { + try (UnionVector unionVector = new UnionVector(EMPTY_SCHEMA_PATH, allocator, null)) { unionVector.allocateNew(); // write some data diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index ce091ab1ed0..21cdc4f4d8d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -19,15 +19,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.MapVector; -import org.apache.arrow.vector.complex.RepeatedListVector; -import org.apache.arrow.vector.complex.RepeatedMapVector; -import org.apache.arrow.vector.holders.*; -import org.apache.arrow.vector.types.MaterializedField; -import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.BasicTypeHelper; import org.apache.arrow.vector.util.OversizedAllocationException; import org.junit.After; import org.junit.Before; @@ -50,9 +42,9 @@ public void init() { } private final static Charset utf8Charset = Charset.forName("UTF-8"); - private final static byte[] STR1 = new String("AAAAA1").getBytes(utf8Charset); - private final static byte[] STR2 = new String("BBBBBBBBB2").getBytes(utf8Charset); - private final static byte[] STR3 = new String("CCCC3").getBytes(utf8Charset); + private final static byte[] STR1 = "AAAAA1".getBytes(utf8Charset); + private final static byte[] STR2 = "BBBBBBBBB2".getBytes(utf8Charset); + private final static byte[] STR3 = "CCCC3".getBytes(utf8Charset); @After public void terminate() throws Exception { @@ -61,10 +53,9 @@ public void terminate() throws Exception { @Test public 
void testFixedType() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); // Create a new value vector for 1024 integers. - try (final UInt4Vector vector = new UInt4Vector(field, allocator)) { + try (final UInt4Vector vector = new UInt4Vector(EMPTY_SCHEMA_PATH, allocator)) { final UInt4Vector.Mutator m = vector.getMutator(); vector.allocateNew(1024); @@ -86,10 +77,9 @@ public void testFixedType() { @Test public void testNullableVarLen2() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVarCharHolder.TYPE); // Create a new value vector for 1024 integers. - try (final NullableVarCharVector vector = new NullableVarCharVector(field, allocator)) { + try (final NullableVarCharVector vector = new NullableVarCharVector(EMPTY_SCHEMA_PATH, allocator)) { final NullableVarCharVector.Mutator m = vector.getMutator(); vector.allocateNew(1024 * 10, 1024); @@ -115,45 +105,11 @@ public void testNullableVarLen2() { } } - @Test - public void testRepeatedIntVector() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedIntHolder.TYPE); - - // Create a new value vector. - try (final RepeatedIntVector vector1 = new RepeatedIntVector(field, allocator)) { - - // Populate the vector. - final int[] values = {2, 3, 5, 7, 11, 13, 17, 19, 23, 27}; // some tricksy primes - final int nRecords = 7; - final int nElements = values.length; - vector1.allocateNew(nRecords, nRecords * nElements); - final RepeatedIntVector.Mutator mutator = vector1.getMutator(); - for (int recordIndex = 0; recordIndex < nRecords; ++recordIndex) { - mutator.startNewValue(recordIndex); - for (int elementIndex = 0; elementIndex < nElements; ++elementIndex) { - mutator.add(recordIndex, recordIndex * values[elementIndex]); - } - } - mutator.setValueCount(nRecords); - - // Verify the contents. - final RepeatedIntVector.Accessor accessor1 = vector1.getAccessor(); - assertEquals(nRecords, accessor1.getValueCount()); - for (int recordIndex = 0; recordIndex < nRecords; ++recordIndex) { - for (int elementIndex = 0; elementIndex < nElements; ++elementIndex) { - final int value = accessor1.get(recordIndex, elementIndex); - assertEquals(recordIndex * values[elementIndex], value); - } - } - } - } - @Test public void testNullableFixedType() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableUInt4Holder.TYPE); // Create a new value vector for 1024 integers. 
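// A minimal sketch of the constructor change these tests now exercise: vectors
// are created from a name and an allocator directly, with no MaterializedField.
// (Assumes the generated Mutator/Accessor methods used elsewhere in this file.)
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.UInt4Vector;

class NameBasedVectorSketch {
  public static void main(String[] args) {
    try (BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE);
         UInt4Vector vector = new UInt4Vector("ints", allocator)) {
      vector.allocateNew(1024);
      vector.getMutator().setSafe(0, 100);
      vector.getMutator().setValueCount(1);
      System.out.println(vector.getAccessor().get(0)); // 100
    }
  }
}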
- try (final NullableUInt4Vector vector = new NullableUInt4Vector(field, allocator)) { + try (final NullableUInt4Vector vector = new NullableUInt4Vector(EMPTY_SCHEMA_PATH, allocator)) { final NullableUInt4Vector.Mutator m = vector.getMutator(); vector.allocateNew(1024); @@ -222,10 +178,8 @@ public void testNullableFixedType() { @Test public void testNullableFloat() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableFloat4Holder.TYPE); - // Create a new value vector for 1024 integers - try (final NullableFloat4Vector vector = (NullableFloat4Vector) BasicTypeHelper.getNewVector(field, allocator)) { + try (final NullableFloat4Vector vector = (NullableFloat4Vector) MinorType.FLOAT4.getNewVector(EMPTY_SCHEMA_PATH, allocator, null)) { final NullableFloat4Vector.Mutator m = vector.getMutator(); vector.allocateNew(1024); @@ -271,10 +225,8 @@ public void testNullableFloat() { @Test public void testBitVector() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, BitHolder.TYPE); - // Create a new value vector for 1024 integers - try (final BitVector vector = new BitVector(field, allocator)) { + try (final BitVector vector = new BitVector(EMPTY_SCHEMA_PATH, allocator)) { final BitVector.Mutator m = vector.getMutator(); vector.allocateNew(1024); @@ -311,10 +263,8 @@ public void testBitVector() { @Test public void testReAllocNullableFixedWidthVector() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableFloat4Holder.TYPE); - // Create a new value vector for 1024 integers - try (final NullableFloat4Vector vector = (NullableFloat4Vector) BasicTypeHelper.getNewVector(field, allocator)) { + try (final NullableFloat4Vector vector = (NullableFloat4Vector) MinorType.FLOAT4.getNewVector(EMPTY_SCHEMA_PATH, allocator, null)) { final NullableFloat4Vector.Mutator m = vector.getMutator(); vector.allocateNew(1024); @@ -346,10 +296,8 @@ public void testReAllocNullableFixedWidthVector() { @Test public void testReAllocNullableVariableWidthVector() { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVarCharHolder.TYPE); - // Create a new value vector for 1024 integers - try (final NullableVarCharVector vector = (NullableVarCharVector) BasicTypeHelper.getNewVector(field, allocator)) { + try (final NullableVarCharVector vector = (NullableVarCharVector) MinorType.VARCHAR.getNewVector(EMPTY_SCHEMA_PATH, allocator, null)) { final NullableVarCharVector.Mutator m = vector.getMutator(); vector.allocateNew(); @@ -376,69 +324,4 @@ public void testReAllocNullableVariableWidthVector() { } } - @Test - public void testVVInitialCapacity() throws Exception { - final MaterializedField[] fields = new MaterializedField[9]; - final ValueVector[] valueVectors = new ValueVector[9]; - - fields[0] = MaterializedField.create(EMPTY_SCHEMA_PATH, BitHolder.TYPE); - fields[1] = MaterializedField.create(EMPTY_SCHEMA_PATH, IntHolder.TYPE); - fields[2] = MaterializedField.create(EMPTY_SCHEMA_PATH, VarCharHolder.TYPE); - fields[3] = MaterializedField.create(EMPTY_SCHEMA_PATH, NullableVar16CharHolder.TYPE); - fields[4] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedFloat4Holder.TYPE); - fields[5] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedVarBinaryHolder.TYPE); - - fields[6] = MaterializedField.create(EMPTY_SCHEMA_PATH, MapVector.TYPE); - fields[6].addChild(fields[0] /*bit*/); - fields[6].addChild(fields[2] /*varchar*/); - - fields[7] = MaterializedField.create(EMPTY_SCHEMA_PATH, 
RepeatedMapVector.TYPE); - fields[7].addChild(fields[1] /*int*/); - fields[7].addChild(fields[3] /*optional var16char*/); - - fields[8] = MaterializedField.create(EMPTY_SCHEMA_PATH, RepeatedListVector.TYPE); - fields[8].addChild(fields[1] /*int*/); - - final int initialCapacity = 1024; - - try { - for (int i = 0; i < valueVectors.length; i++) { - valueVectors[i] = BasicTypeHelper.getNewVector(fields[i], allocator); - valueVectors[i].setInitialCapacity(initialCapacity); - valueVectors[i].allocateNew(); - } - - for (int i = 0; i < valueVectors.length; i++) { - final ValueVector vv = valueVectors[i]; - final int vvCapacity = vv.getValueCapacity(); - - // this can't be equality because Nullables will be allocated using power of two sized buffers (thus need 1025 - // spots in one vector > power of two is 2048, available capacity will be 2048 => 2047) - assertTrue(String.format("Incorrect value capacity for %s [%d]", vv.getField(), vvCapacity), - initialCapacity <= vvCapacity); - } - } finally { - for (ValueVector v : valueVectors) { - v.close(); - } - } - } - - @Test - public void testListVectorShouldNotThrowOversizedAllocationException() throws Exception { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, - Types.optional(MinorType.LIST)); - ListVector vector = new ListVector(field, allocator, null); - ListVector vectorFrom = new ListVector(field, allocator, null); - vectorFrom.allocateNew(); - - for (int i = 0; i < 10000; i++) { - vector.allocateNew(); - vector.copyFromSafe(0, 0, vectorFrom); - vector.clear(); - } - - vectorFrom.clear(); - vector.clear(); - } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index 4c24444d81d..24f00f14df0 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -27,7 +27,7 @@ import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.holders.UInt4Holder; -import org.apache.arrow.vector.types.MaterializedField; +import org.apache.arrow.vector.types.Types.MinorType; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -49,10 +49,9 @@ public void terminate() throws Exception { @Test public void testPromoteToUnion() throws Exception { - final MaterializedField field = MaterializedField.create(EMPTY_SCHEMA_PATH, UInt4Holder.TYPE); - try (final AbstractMapVector container = new MapVector(field, allocator, null); - final MapVector v = container.addOrGet("test", MapVector.TYPE, MapVector.class); + try (final AbstractMapVector container = new MapVector(EMPTY_SCHEMA_PATH, allocator, null); + final MapVector v = container.addOrGet("test", MinorType.MAP, MapVector.class); final PromotableWriter writer = new PromotableWriter(v, container)) { container.allocateNew(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java new file mode 100644 index 00000000000..bc17a2b2835 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -0,0 +1,270 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.complex.writer; + +import io.netty.buffer.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.impl.UnionReader; +import org.apache.arrow.vector.complex.impl.UnionWriter; +import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.junit.Assert; +import org.junit.Test; + +public class TestComplexWriter { + + static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + + private static final int COUNT = 100; + + @Test + public void simpleNestedTypes() { + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < COUNT; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(COUNT); + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < COUNT; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); + Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + } + + parent.close(); + } + + @Test + public void listScalarType() { + ListVector listVector = new ListVector("list", allocator, null); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + listWriter.writeInt(j); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + Assert.assertEquals(j, listReader.reader().readInteger().intValue()); + } + } + } + + + @Test + public void listMapType() { + ListVector listVector = new ListVector("list", allocator, null); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + 
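// (The pattern shared by all of these list tests: position the writer on a
// row, startList(), write that row's elements, endList(); the reader mirrors
// it with setPosition(row) followed by next() once per element.)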
MapWriter mapWriter = listWriter.map(); + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + mapWriter.start(); + mapWriter.integer("int").writeInt(j); + mapWriter.bigInt("bigInt").writeBigInt(j); + mapWriter.end(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + Assert.assertEquals("record: " + i, j, listReader.reader().reader("int").readInteger().intValue()); + Assert.assertEquals(j, listReader.reader().reader("bigInt").readLong().longValue()); + } + } + } + + @Test + public void listListType() { + ListVector listVector = new ListVector("list", allocator, null); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + FieldReader innerListReader = listReader.reader(); + for (int k = 0; k < i % 13; k++) { + innerListReader.next(); + Assert.assertEquals("record: " + i, k, innerListReader.reader().readInteger().intValue()); + } + } + } + listVector.clear(); + } + + @Test + public void unionListListType() { + ListVector listVector = new ListVector("list", allocator, null); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + UnionListReader listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + for (int j = 0; j < i % 7; j++) { + listReader.next(); + FieldReader innerListReader = listReader.reader(); + for (int k = 0; k < i % 13; k++) { + innerListReader.next(); + if (k % 2 == 0) { + Assert.assertEquals("record: " + i, k, innerListReader.reader().readInteger().intValue()); + } else { + Assert.assertEquals("record: " + i, k, innerListReader.reader().readLong().longValue()); + } + } + } + } + listVector.clear(); + } + + @Test + public void simpleUnion() { + UnionVector vector = new UnionVector("union", allocator, null); + UnionWriter unionWriter = new UnionWriter(vector); + unionWriter.allocate(); + for (int i = 0; i < COUNT; i++) { + unionWriter.setPosition(i); + if (i % 2 == 0) { + unionWriter.writeInt(i); + } else { + unionWriter.writeFloat4((float) i); + } + } + vector.getMutator().setValueCount(COUNT); + UnionReader unionReader = new UnionReader(vector); + for (int i = 0; i < COUNT; i++) { + unionReader.setPosition(i); + if (i % 2 == 0) { + 
Assert.assertEquals(i, i, unionReader.readInteger()); + } else { + Assert.assertEquals((float) i, unionReader.readFloat(), 1e-12); + } + } + vector.close(); + } + + @Test + public void promotableWriter() { + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + for (int i = 0; i < 100; i++) { + BigIntWriter bigIntWriter = rootWriter.bigInt("a"); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + Field field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(Int.TYPE_TYPE, field.getType().getTypeType()); + Int intType = (Int) field.getType(); + + Assert.assertEquals(64, intType.getBitWidth()); + Assert.assertTrue(intType.getIsSigned()); + for (int i = 100; i < 200; i++) { + VarCharWriter varCharWriter = rootWriter.varChar("a"); + varCharWriter.setPosition(i); + byte[] bytes = Integer.toString(i).getBytes(); + ArrowBuf tempBuf = allocator.buffer(bytes.length); + tempBuf.setBytes(0, bytes); + varCharWriter.writeVarChar(0, bytes.length, tempBuf); + } + field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(Union.TYPE_TYPE, field.getType().getTypeType()); + Assert.assertEquals(Int.TYPE_TYPE, field.getChildren().get(0).getType().getTypeType()); + Assert.assertEquals(Utf8.TYPE_TYPE, field.getChildren().get(1).getType().getTypeType()); + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < 100; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("a").readLong().intValue()); + } + for (int i = 100; i < 200; i++) { + rootReader.setPosition(i); + Assert.assertEquals(Integer.toString(i), rootReader.reader("a").readText().toString()); + } + } +} \ No newline at end of file diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java new file mode 100644 index 00000000000..06a1149c0d6 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.pojo; + +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import java.util.List; + +import static org.junit.Assert.assertEquals; + +/** + * Test conversion between Flatbuf and Pojo field representations + */ +public class TestConvert { + + @Test + public void simple() { + Field initialField = new Field("a", true, new Int(32, true), null); + run(initialField); + } + + @Test + public void complex() { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); + childrenBuilder.add(new Field("child2", true, new FloatingPoint(0), ImmutableList.of())); + + Field initialField = new Field("a", true, Tuple.INSTANCE, childrenBuilder.build()); + run(initialField); + } + + @Test + public void schema() { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); + childrenBuilder.add(new Field("child2", true, new FloatingPoint(0), ImmutableList.of())); + Schema initialSchema = new Schema(childrenBuilder.build()); + run(initialSchema); + + } + + private void run(Field initialField) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialField.getField(builder)); + org.apache.arrow.flatbuf.Field flatBufField = org.apache.arrow.flatbuf.Field.getRootAsField(builder.dataBuffer()); + Field finalField = Field.convertField(flatBufField); + assertEquals(initialField, finalField); + } + + private void run(Schema initialSchema) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + builder.finish(initialSchema.getSchema(builder)); + org.apache.arrow.flatbuf.Schema flatBufSchema = org.apache.arrow.flatbuf.Schema.getRootAsSchema(builder.dataBuffer()); + Schema finalSchema = Schema.convertSchema(flatBufSchema); + assertEquals(initialSchema, finalSchema); + } +} From fd2e52491bc39ae5aa0ddb7dbc21109172cea1c2 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Thu, 18 Aug 2016 16:31:32 -0700 Subject: [PATCH 114/210] Revert version to 0.1-SNAPSHOT --- java/format/pom.xml | 2 +- java/memory/pom.xml | 2 +- java/pom.xml | 2 +- java/vector/pom.xml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/format/pom.xml b/java/format/pom.xml index ea27a3072bc..cb11b5ff3c4 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -16,7 +16,7 @@ arrow-java-root org.apache.arrow - 0.1-decimal + 0.1-SNAPSHOT arrow-format diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 12ff4c81d86..44332f5ed14 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -15,7 +15,7 @@ org.apache.arrow arrow-java-root - 0.1-decimal + 0.1-SNAPSHOT arrow-memory arrow-memory diff --git a/java/pom.xml b/java/pom.xml index 92ab109f939..8eb25af7545 100644 --- 
a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
 org.apache.arrow
 arrow-java-root
- 0.1-decimal
+ 0.1-SNAPSHOT
 pom
 Apache Arrow Java Root POM
diff --git a/java/vector/pom.xml b/java/vector/pom.xml
index fac788cef14..1a2921f6ea5 100644
--- a/java/vector/pom.xml
+++ b/java/vector/pom.xml
@@ -15,7 +15,7 @@
 org.apache.arrow
 arrow-java-root
- 0.1-decimal
+ 0.1-SNAPSHOT
 vector
 vectors
@@ -25,7 +25,7 @@
 org.apache.arrow
 arrow-format
- 0.1-decimal
+ ${project.version}
 org.apache.arrow

From 282fcacc86c9232c9dc1b1030e9fc9299bbc3f8d Mon Sep 17 00:00:00 2001
From: Steven Phillips
Date: Fri, 19 Aug 2016 14:28:05 -0700
Subject: [PATCH 115/210] ARROW-265: Pad negative decimal values with 1s
---
 .../codegen/templates/FixedValueVectors.java | 8 +-
 .../codegen/templates/HolderReaderImpl.java | 5 +-
 .../arrow/vector/util/DecimalUtility.java | 579 +-----------------
 .../arrow/vector/TestDecimalVector.java | 7 +-
 4 files changed, 27 insertions(+), 572 deletions(-)

diff --git a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java
index fe2b5c5b5bc..37946f6b76e 100644
--- a/java/vector/src/main/codegen/templates/FixedValueVectors.java
+++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java
@@ -16,6 +16,8 @@
 * limitations under the License.
 */
+import org.apache.arrow.vector.util.DecimalUtility;
+
 import java.lang.Override;
 <@pp.dropOutputFile />
@@ -444,11 +446,7 @@ public void get(int index, Nullable${minor.class}Holder holder) {
 @Override
 public ${friendlyType} getObject(int index) {
- byte[] bytes = new byte[${type.width}];
- int start = ${type.width} * index;
- data.getBytes(start, bytes, 0, ${type.width});
- ${friendlyType} value = new BigDecimal(new BigInteger(bytes), scale);
- return value;
+ return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, index, scale);
 }
 <#else>
diff --git a/java/vector/src/main/codegen/templates/HolderReaderImpl.java b/java/vector/src/main/codegen/templates/HolderReaderImpl.java
index 1ed9287b00e..d66577bc1e4 100644
--- a/java/vector/src/main/codegen/templates/HolderReaderImpl.java
+++ b/java/vector/src/main/codegen/templates/HolderReaderImpl.java
@@ -156,9 +156,11 @@ private Object readSingleObject() {
 <#if type.major == "VarLen">
+ <#if minor.class != "Decimal">
 int length = holder.end - holder.start;
 byte[] value = new byte [length];
 holder.buffer.getBytes(holder.start, value, 0, length);
+ </#if>
 <#if minor.class == "VarBinary">
 return value;
@@ -169,8 +171,7 @@ private Object readSingleObject() {
 text.set(value);
 return text;
 <#elseif minor.class == "Decimal" >
- return new BigDecimal(new BigInteger(value), holder.scale);
-
+ return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(holder.buffer, holder.start, holder.scale);
 <#elseif minor.class == "Interval">
diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
index 4eb0d9f2216..e171e87360d 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java
@@ -66,6 +66,8 @@ public class DecimalUtility {
 100000000000000000l,
 1000000000000000000l};

+ public static final int DECIMAL_BYTE_LENGTH = 16;
+
 /*
 * Simple function that returns the static precomputed
 * power of ten, instead of using Math.pow
@@ -100,14 +102,6 @@ public static long adjustScaleDivide(long input, int factor) {
 }
 }

- /* 
Given the number of actual digits this function returns the - * number of indexes it will occupy in the array of integers - * which are stored in base 1 billion - */ - public static int roundUp(int ndigits) { - return (ndigits + MAX_DIGITS - 1)/MAX_DIGITS; - } - /* Returns a string representation of the given integer * If the length of the given integer is less than the * passed length, this function will prepend zeroes to the string @@ -141,572 +135,33 @@ public static StringBuilder toStringWithZeroes(long number, int desiredLength) { return str; } - public static BigDecimal getBigDecimalFromIntermediate(ByteBuf data, int startIndex, int nDecimalDigits, int scale) { - - // In the intermediate representation we don't pad the scale with zeroes, so set truncate = false - return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, false); - } - - public static BigDecimal getBigDecimalFromSparse(ArrowBuf data, int startIndex, int nDecimalDigits, int scale) { - - // In the sparse representation we pad the scale with zeroes for ease of arithmetic, need to truncate - return getBigDecimalFromArrowBuf(data, startIndex, nDecimalDigits, scale, true); - } - - public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int start, int length, int scale) { - byte[] value = new byte[length]; - bytebuf.getBytes(start, value, 0, length); + public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale) { + byte[] value = new byte[DECIMAL_BYTE_LENGTH]; + final int startIndex = index * DECIMAL_BYTE_LENGTH; + bytebuf.getBytes(startIndex, value, 0, DECIMAL_BYTE_LENGTH); BigInteger unscaledValue = new BigInteger(value); return new BigDecimal(unscaledValue, scale); } - public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int start, int length, int scale) { - byte[] value = new byte[length]; + public static BigDecimal getBigDecimalFromByteBuffer(ByteBuffer bytebuf, int start, int scale) { + byte[] value = new byte[DECIMAL_BYTE_LENGTH]; bytebuf.get(value); BigInteger unscaledValue = new BigInteger(value); return new BigDecimal(unscaledValue, scale); } - public static void writeBigDecimalToArrowBuf(ArrowBuf bytebuf, int startIndex, BigDecimal value) { - byte[] bytes = value.unscaledValue().toByteArray(); - if (bytes.length > 16) { + public static void writeBigDecimalToArrowBuf(BigDecimal value, ArrowBuf bytebuf, int index) { + final byte[] bytes = value.unscaledValue().toByteArray(); + final int startIndex = index * DECIMAL_BYTE_LENGTH; + if (bytes.length > DECIMAL_BYTE_LENGTH) { throw new UnsupportedOperationException("Decimal size greater than 16 bytes"); } - bytebuf.setBytes(startIndex + 16 - bytes.length, bytes, 0, bytes.length); - } - - /* Create a BigDecimal object using the data in the ArrowBuf. - * This function assumes that data is provided in a non-dense format - * It works on both sparse and intermediate representations. - */ - public static BigDecimal getBigDecimalFromArrowBuf(ByteBuf data, int startIndex, int nDecimalDigits, int scale, - boolean truncateScale) { - - // For sparse decimal type we have padded zeroes at the end, strip them while converting to BigDecimal. 
- int actualDigits; - - // Initialize the BigDecimal, first digit in the ArrowBuf has the sign so mask it out - BigInteger decimalDigits = BigInteger.valueOf((data.getInt(startIndex)) & 0x7FFFFFFF); - - BigInteger base = BigInteger.valueOf(DIGITS_BASE); - - for (int i = 1; i < nDecimalDigits; i++) { - - BigInteger temp = BigInteger.valueOf(data.getInt(startIndex + (i * INTEGER_SIZE))); - decimalDigits = decimalDigits.multiply(base); - decimalDigits = decimalDigits.add(temp); - } - - // Truncate any additional padding we might have added - if (truncateScale == true && scale > 0 && (actualDigits = scale % MAX_DIGITS) != 0) { - BigInteger truncate = BigInteger.valueOf((int)Math.pow(10, (MAX_DIGITS - actualDigits))); - decimalDigits = decimalDigits.divide(truncate); - } - - // set the sign - if ((data.getInt(startIndex) & 0x80000000) != 0) { - decimalDigits = decimalDigits.negate(); + final int padLength = DECIMAL_BYTE_LENGTH - bytes.length; + final int padValue = value.signum() == -1 ? 0xFF : 0; + for (int i = 0; i < padLength; i++) { + bytebuf.setByte(startIndex + i, padValue); } - - BigDecimal decimal = new BigDecimal(decimalDigits, scale); - - return decimal; - } - - /* This function returns a BigDecimal object from the dense decimal representation. - * First step is to convert the dense representation into an intermediate representation - * and then invoke getBigDecimalFromArrowBuf() to get the BigDecimal object - */ - public static BigDecimal getBigDecimalFromDense(ArrowBuf data, int startIndex, int nDecimalDigits, int scale, int maxPrecision, int width) { - - /* This method converts the dense representation to - * an intermediate representation. The intermediate - * representation has one more integer than the dense - * representation. - */ - byte[] intermediateBytes = new byte[((nDecimalDigits + 1) * INTEGER_SIZE)]; - - // Start storing from the least significant byte of the first integer - int intermediateIndex = 3; - - int[] mask = {0x03, 0x0F, 0x3F, 0xFF}; - int[] reverseMask = {0xFC, 0xF0, 0xC0, 0x00}; - - int maskIndex; - int shiftOrder; - byte shiftBits; - - // TODO: Some of the logic here is common with casting from Dense to Sparse types, factor out common code - if (maxPrecision == 38) { - maskIndex = 0; - shiftOrder = 6; - shiftBits = 0x00; - intermediateBytes[intermediateIndex++] = (byte) (data.getByte(startIndex) & 0x7F); - } else if (maxPrecision == 28) { - maskIndex = 1; - shiftOrder = 4; - shiftBits = (byte) ((data.getByte(startIndex) & 0x03) << shiftOrder); - intermediateBytes[intermediateIndex++] = (byte) (((data.getByte(startIndex) & 0x3C) & 0xFF) >>> 2); - } else { - throw new UnsupportedOperationException("Dense types with max precision 38 and 28 are only supported"); - } - - int inputIndex = 1; - boolean sign = false; - - if ((data.getByte(startIndex) & 0x80) != 0) { - sign = true; - } - - while (inputIndex < width) { - - intermediateBytes[intermediateIndex] = (byte) ((shiftBits) | (((data.getByte(startIndex + inputIndex) & reverseMask[maskIndex]) & 0xFF) >>> (8 - shiftOrder))); - - shiftBits = (byte) ((data.getByte(startIndex + inputIndex) & mask[maskIndex]) << shiftOrder); - - inputIndex++; - intermediateIndex++; - - if (((inputIndex - 1) % INTEGER_SIZE) == 0) { - shiftBits = (byte) ((shiftBits & 0xFF) >>> 2); - maskIndex++; - shiftOrder -= 2; - } - - } - /* copy the last byte */ - intermediateBytes[intermediateIndex] = shiftBits; - - if (sign == true) { - intermediateBytes[0] = (byte) (intermediateBytes[0] | 0x80); - } - - final ByteBuf intermediate = 
UnpooledByteBufAllocator.DEFAULT.buffer(intermediateBytes.length); - try { - intermediate.setBytes(0, intermediateBytes); - - BigDecimal ret = getBigDecimalFromIntermediate(intermediate, 0, nDecimalDigits + 1, scale); - return ret; - } finally { - intermediate.release(); - } - - } - - public static void getSparseFromBigDecimal(BigDecimal input, ByteBuf data, int startIndex, int scale, int precision, - int nDecimalDigits) { - - // Initialize the buffer - for (int i = 0; i < nDecimalDigits; i++) { - data.setInt(startIndex + (i * INTEGER_SIZE), 0); - } - - boolean sign = false; - - if (input.signum() == -1) { - // negative input - sign = true; - input = input.abs(); - } - - // Truncate the input as per the scale provided - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - - // Separate out the integer part - BigDecimal integerPart = input.setScale(0, BigDecimal.ROUND_DOWN); - - int destIndex = nDecimalDigits - roundUp(scale) - 1; - - // we use base 1 billion integer digits for out integernal representation - BigDecimal base = new BigDecimal(DIGITS_BASE); - - while (integerPart.compareTo(BigDecimal.ZERO) == 1) { - // store the modulo as the integer value - data.setInt(startIndex + (destIndex * INTEGER_SIZE), (integerPart.remainder(base)).intValue()); - destIndex--; - // Divide by base 1 billion - integerPart = (integerPart.divide(base)).setScale(0, BigDecimal.ROUND_DOWN); - } - - /* Sparse representation contains padding of additional zeroes - * so each digit contains MAX_DIGITS for ease of arithmetic - */ - int actualDigits; - if ((actualDigits = (scale % MAX_DIGITS)) != 0) { - // Pad additional zeroes - scale = scale + (MAX_DIGITS - actualDigits); - input = input.setScale(scale, BigDecimal.ROUND_DOWN); - } - - //separate out the fractional part - BigDecimal fractionalPart = input.remainder(BigDecimal.ONE).movePointRight(scale); - - destIndex = nDecimalDigits - 1; - - while (scale > 0) { - // Get next set of MAX_DIGITS (9) store it in the ArrowBuf - fractionalPart = fractionalPart.movePointLeft(MAX_DIGITS); - BigDecimal temp = fractionalPart.remainder(BigDecimal.ONE); - - data.setInt(startIndex + (destIndex * INTEGER_SIZE), (temp.unscaledValue().intValue())); - destIndex--; - - fractionalPart = fractionalPart.setScale(0, BigDecimal.ROUND_DOWN); - scale -= MAX_DIGITS; - } - - // Set the negative sign - if (sign == true) { - data.setInt(startIndex, data.getInt(startIndex) | 0x80000000); - } - - } - - - public static long getDecimal18FromBigDecimal(BigDecimal input, int scale, int precision) { - // Truncate or pad to set the input to the correct scale - input = input.setScale(scale, BigDecimal.ROUND_HALF_UP); - - return (input.unscaledValue().longValue()); - } - - public static BigDecimal getBigDecimalFromPrimitiveTypes(int input, int scale, int precision) { - return BigDecimal.valueOf(input, scale); - } - - public static BigDecimal getBigDecimalFromPrimitiveTypes(long input, int scale, int precision) { - return BigDecimal.valueOf(input, scale); - } - - - public static int compareDenseBytes(ArrowBuf left, int leftStart, boolean leftSign, ArrowBuf right, int rightStart, boolean rightSign, int width) { - - int invert = 1; - - /* If signs are different then simply look at the - * sign of the two inputs and determine which is greater - */ - if (leftSign != rightSign) { - - return((leftSign == true) ? 
-1 : 1); - } else if(leftSign == true) { - /* Both inputs are negative, at the end we will - * have to invert the comparison - */ - invert = -1; - } - - int cmp = 0; - - for (int i = 0; i < width; i++) { - byte leftByte = left.getByte(leftStart + i); - byte rightByte = right.getByte(rightStart + i); - // Unsigned byte comparison - if ((leftByte & 0xFF) > (rightByte & 0xFF)) { - cmp = 1; - break; - } else if ((leftByte & 0xFF) < (rightByte & 0xFF)) { - cmp = -1; - break; - } - } - cmp *= invert; // invert the comparison if both were negative values - - return cmp; - } - - public static int getIntegerFromSparseBuffer(ArrowBuf buffer, int start, int index) { - int value = buffer.getInt(start + (index * 4)); - - if (index == 0) { - /* the first byte contains sign bit, return value without it */ - value = (value & 0x7FFFFFFF); - } - return value; - } - - public static void setInteger(ArrowBuf buffer, int start, int index, int value) { - buffer.setInt(start + (index * 4), value); - } - - public static int compareSparseBytes(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits, boolean absCompare) { - - int invert = 1; - - if (absCompare == false) { - if (leftSign != rightSign) { - return (leftSign == true) ? -1 : 1; - } - - // Both values are negative invert the outcome of the comparison - if (leftSign == true) { - invert = -1; - } - } - - int cmp = compareSparseBytesInner(left, leftStart, leftSign, leftScale, leftPrecision, right, rightStart, rightSign, rightPrecision, rightScale, width, nDecimalDigits); - return cmp * invert; - } - public static int compareSparseBytesInner(ArrowBuf left, int leftStart, boolean leftSign, int leftScale, int leftPrecision, ArrowBuf right, int rightStart, boolean rightSign, int rightPrecision, int rightScale, int width, int nDecimalDigits) { - /* compute the number of integer digits in each decimal */ - int leftInt = leftPrecision - leftScale; - int rightInt = rightPrecision - rightScale; - - /* compute the number of indexes required for storing integer digits */ - int leftIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftInt); - int rightIntRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightInt); - - /* compute number of indexes required for storing scale */ - int leftScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(leftScale); - int rightScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(rightScale); - - /* compute index of the most significant integer digits */ - int leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; - int rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; - - int leftStopIndex = nDecimalDigits - leftScaleRoundedUp; - int rightStopIndex = nDecimalDigits - rightScaleRoundedUp; - - /* Discard the zeroes in the integer part */ - while (leftIndex1 < leftStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { - break; - } - - /* Digit in this location is zero, decrement the actual number - * of integer digits - */ - leftIntRoundedUp--; - leftIndex1++; - } - - /* If we reached the stop index then the number of integers is zero */ - if (leftIndex1 == leftStopIndex) { - leftIntRoundedUp = 0; - } - - while (rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { - break; - } - - /* Digit in this location is zero, 
decrement the actual number - * of integer digits - */ - rightIntRoundedUp--; - rightIndex1++; - } - - if (rightIndex1 == rightStopIndex) { - rightIntRoundedUp = 0; - } - - /* We have the accurate number of non-zero integer digits, - * if the number of integer digits are different then we can determine - * which decimal is larger and needn't go down to comparing individual values - */ - if (leftIntRoundedUp > rightIntRoundedUp) { - return 1; - } - else if (rightIntRoundedUp > leftIntRoundedUp) { - return -1; - } - - /* The number of integer digits are the same, set the each index - * to the first non-zero integer and compare each digit - */ - leftIndex1 = nDecimalDigits - leftScaleRoundedUp - leftIntRoundedUp; - rightIndex1 = nDecimalDigits - rightScaleRoundedUp - rightIntRoundedUp; - - while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { - return 1; - } - else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { - return -1; - } - - leftIndex1++; - rightIndex1++; - } - - /* The integer part of both the decimal's are equal, now compare - * each individual fractional part. Set the index to be at the - * beginning of the fractional part - */ - leftIndex1 = leftStopIndex; - rightIndex1 = rightStopIndex; - - /* Stop indexes will be the end of the array */ - leftStopIndex = nDecimalDigits; - rightStopIndex = nDecimalDigits; - - /* compare the two fractional parts of the decimal */ - while (leftIndex1 < leftStopIndex && rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) > getIntegerFromSparseBuffer(right, rightStart, rightIndex1)) { - return 1; - } - else if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) > getIntegerFromSparseBuffer(left, leftStart, leftIndex1)) { - return -1; - } - - leftIndex1++; - rightIndex1++; - } - - /* Till now the fractional part of the decimals are equal, check - * if one of the decimal has fractional part that is remaining - * and is non-zero - */ - while (leftIndex1 < leftStopIndex) { - if (getIntegerFromSparseBuffer(left, leftStart, leftIndex1) != 0) { - return 1; - } - leftIndex1++; - } - - while(rightIndex1 < rightStopIndex) { - if (getIntegerFromSparseBuffer(right, rightStart, rightIndex1) != 0) { - return -1; - } - rightIndex1++; - } - - /* Both decimal values are equal */ - return 0; - } - - public static BigDecimal getBigDecimalFromByteArray(byte[] bytes, int start, int length, int scale) { - byte[] value = Arrays.copyOfRange(bytes, start, start + length); - BigInteger unscaledValue = new BigInteger(value); - return new BigDecimal(unscaledValue, scale); - } - - public static void roundDecimal(ArrowBuf result, int start, int nDecimalDigits, int desiredScale, int currentScale) { - int newScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(desiredScale); - int origScaleRoundedUp = org.apache.arrow.vector.util.DecimalUtility.roundUp(currentScale); - - if (desiredScale < currentScale) { - - boolean roundUp = false; - - //Extract the first digit to be truncated to check if we need to round up - int truncatedScaleIndex = desiredScale + 1; - if (truncatedScaleIndex <= currentScale) { - int extractDigitIndex = nDecimalDigits - origScaleRoundedUp -1; - extractDigitIndex += org.apache.arrow.vector.util.DecimalUtility.roundUp(truncatedScaleIndex); - int extractDigit = 
getIntegerFromSparseBuffer(result, start, extractDigitIndex); - int temp = org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS - (truncatedScaleIndex % org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS); - if (temp != 0) { - extractDigit = extractDigit / (int) (Math.pow(10, temp)); - } - if ((extractDigit % 10) > 4) { - roundUp = true; - } - } - - // Get the source index beyond which we will truncate - int srcIntIndex = nDecimalDigits - origScaleRoundedUp - 1; - int srcIndex = srcIntIndex + newScaleRoundedUp; - - // Truncate the remaining fractional part, move the integer part - int destIndex = nDecimalDigits - 1; - if (srcIndex != destIndex) { - while (srcIndex >= 0) { - setInteger(result, start, destIndex--, getIntegerFromSparseBuffer(result, start, srcIndex--)); - } - - // Set the remaining portion of the decimal to be zeroes - while (destIndex >= 0) { - setInteger(result, start, destIndex--, 0); - } - srcIndex = nDecimalDigits - 1; - } - - // We truncated the decimal digit. Now we need to truncate within the base 1 billion fractional digit - int truncateFactor = org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS - (desiredScale % org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS); - if (truncateFactor != org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS) { - truncateFactor = (int) Math.pow(10, truncateFactor); - int fractionalDigits = getIntegerFromSparseBuffer(result, start, nDecimalDigits - 1); - fractionalDigits /= truncateFactor; - setInteger(result, start, nDecimalDigits - 1, fractionalDigits * truncateFactor); - } - - // Finally round up the digit if needed - if (roundUp == true) { - srcIndex = nDecimalDigits - 1; - int carry; - if (truncateFactor != org.apache.arrow.vector.util.DecimalUtility.MAX_DIGITS) { - carry = truncateFactor; - } else { - carry = 1; - } - - while (srcIndex >= 0) { - int value = getIntegerFromSparseBuffer(result, start, srcIndex); - value += carry; - - if (value >= org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE) { - setInteger(result, start, srcIndex--, value % org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE); - carry = value / org.apache.arrow.vector.util.DecimalUtility.DIGITS_BASE; - } else { - setInteger(result, start, srcIndex--, value); - carry = 0; - break; - } - } - } - } else if (desiredScale > currentScale) { - // Add fractional digits to the decimal - - // Check if we need to shift the decimal digits to the left - if (newScaleRoundedUp > origScaleRoundedUp) { - int srcIndex = 0; - int destIndex = newScaleRoundedUp - origScaleRoundedUp; - - // Check while extending scale, we are not overwriting integer part - while (srcIndex < destIndex) { - if (getIntegerFromSparseBuffer(result, start, srcIndex++) != 0) { - throw new RuntimeException("Truncate resulting in loss of integer part, reduce scale specified"); - } - } - - srcIndex = 0; - while (destIndex < nDecimalDigits) { - setInteger(result, start, srcIndex++, getIntegerFromSparseBuffer(result, start, destIndex++)); - } - - // Clear the remaining part - while (srcIndex < nDecimalDigits) { - setInteger(result, start, srcIndex++, 0); - } - } - } - } - - public static int getFirstFractionalDigit(int decimal, int scale) { - if (scale == 0) { - return 0; - } - int temp = (int) adjustScaleDivide(decimal, scale - 1); - return Math.abs(temp % 10); - } - - public static int getFirstFractionalDigit(long decimal, int scale) { - if (scale == 0) { - return 0; - } - long temp = adjustScaleDivide(decimal, scale - 1); - return (int) (Math.abs(temp % 10)); - } - - public static 
int getFirstFractionalDigit(ArrowBuf data, int scale, int start, int nDecimalDigits) { - if (scale == 0) { - return 0; - } - - int index = nDecimalDigits - roundUp(scale); - return (int) (adjustScaleDivide(data.getInt(start + (index * INTEGER_SIZE)), MAX_DIGITS - 1)); + bytebuf.setBytes(startIndex + padLength, bytes, 0, bytes.length); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java index 7ab7db3117b..cca35e44a21 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestDecimalVector.java @@ -33,9 +33,10 @@ public class TestDecimalVector { private static long[] intValues; static { - intValues = new long[30]; - for (int i = 0; i < intValues.length; i++) { + intValues = new long[60]; + for (int i = 0; i < intValues.length / 2; i++) { intValues[i] = 1 << i + 1; + intValues[2 * i] = -1 * (1 << i + 1); } } private int scale = 3; @@ -50,7 +51,7 @@ public void test() { BigDecimal decimal = new BigDecimal(BigInteger.valueOf(intValues[i]), scale); values[i] = decimal; decimalVector.getMutator().setIndexDefined(i); - DecimalUtility.writeBigDecimalToArrowBuf(decimalVector.getBuffer(), i * 16, decimal); + DecimalUtility.writeBigDecimalToArrowBuf(decimal, decimalVector.getBuffer(), i); } decimalVector.getMutator().setValueCount(intValues.length); From c2eb1612df34bee7baddc8851d24826d3c33faa6 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Fri, 19 Aug 2016 17:39:36 -0700 Subject: [PATCH 116/210] ARROW-265: Fix few decimal bugs --- .../AbstractPromotableFieldWriter.java | 19 ++++++++++++++++--- .../codegen/templates/FixedValueVectors.java | 2 +- .../main/codegen/templates/MapWriters.java | 2 +- .../org/apache/arrow/vector/types/Types.java | 3 ++- .../arrow/vector/util/DecimalUtility.java | 3 +-- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java index 7e60320cfb8..d21dcd0f646 100644 --- a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -82,7 +82,18 @@ public void write(${name}Holder holder) { getWriter(MinorType.${name?upper_case}).write${minor.class}(<#list fields as field>${field.name}<#if field_has_next>, ); } + <#else> + @Override + public void write(DecimalHolder holder) { + getWriter(MinorType.DECIMAL).write(holder); + } + + public void writeDecimal(int start, ArrowBuf buffer) { + getWriter(MinorType.DECIMAL).writeDecimal(start, buffer); + } + + public void writeNull() { @@ -113,8 +124,11 @@ public ListWriter list(String name) { <#if lowerName == "int" ><#assign lowerName = "integer" /> <#assign upperName = minor.class?upper_case /> <#assign capName = minor.class?cap_first /> - <#if !minor.class?starts_with("Decimal") > - + <#if minor.class?starts_with("Decimal") > + public ${capName}Writer ${lowerName}(String name, int scale, int precision) { + return getWriter(MinorType.MAP).${lowerName}(name, scale, precision); + } + @Override public ${capName}Writer ${lowerName}(String name) { return getWriter(MinorType.MAP).${lowerName}(name); @@ -125,7 +139,6 @@ public ListWriter list(String name) { return getWriter(MinorType.LIST).${lowerName}(); } - public void copyReader(FieldReader reader) { diff --git 
a/java/vector/src/main/codegen/templates/FixedValueVectors.java b/java/vector/src/main/codegen/templates/FixedValueVectors.java index 37946f6b76e..7958222f5c1 100644 --- a/java/vector/src/main/codegen/templates/FixedValueVectors.java +++ b/java/vector/src/main/codegen/templates/FixedValueVectors.java @@ -446,7 +446,7 @@ public void get(int index, Nullable${minor.class}Holder holder) { @Override public ${friendlyType} getObject(int index) { - return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, index, scale); + return org.apache.arrow.vector.util.DecimalUtility.getBigDecimalFromArrowBuf(data, ${type.width} * index, scale); } <#else> diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index af2922826ec..8a8983a1497 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -198,7 +198,7 @@ public void end() { if(writer == null) { ValueVector vector; ValueVector currentVector = container.getChild(name); - ${vectName}Vector v = container.addOrGet(name, MinorType.${upperName}, ${vectName}Vector.class); + ${vectName}Vector v = container.addOrGet(name, MinorType.${upperName}, ${vectName}Vector.class<#if minor.class == "Decimal"> , new int[] {precision, scale}); writer = new PromotableWriter(v, container); vector = v; if (currentVector == null || currentVector != vector) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 5ea1456a051..c34882a8fb1 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -47,6 +47,7 @@ import org.apache.arrow.vector.complex.impl.BigIntWriterImpl; import org.apache.arrow.vector.complex.impl.BitWriterImpl; import org.apache.arrow.vector.complex.impl.DateWriterImpl; +import org.apache.arrow.vector.complex.impl.DecimalWriterImpl; import org.apache.arrow.vector.complex.impl.Float4WriterImpl; import org.apache.arrow.vector.complex.impl.Float8WriterImpl; import org.apache.arrow.vector.complex.impl.IntWriterImpl; @@ -386,7 +387,7 @@ public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); + return new DecimalWriterImpl((NullableDecimalVector) vector); } }, // variable length binary UINT1(new Int(8, false)) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java index e171e87360d..4c439b2cc10 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/DecimalUtility.java @@ -135,9 +135,8 @@ public static StringBuilder toStringWithZeroes(long number, int desiredLength) { return str; } - public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int index, int scale) { + public static BigDecimal getBigDecimalFromArrowBuf(ArrowBuf bytebuf, int startIndex, int scale) { byte[] value = new byte[DECIMAL_BYTE_LENGTH]; - final int startIndex = index * DECIMAL_BYTE_LENGTH; bytebuf.getBytes(startIndex, value, 0, DECIMAL_BYTE_LENGTH); BigInteger unscaledValue = new BigInteger(value); return new BigDecimal(unscaledValue, scale); From 
812201a2db1ebabd0f65ebd2774ec8f0880bb8cb Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Fri, 19 Aug 2016 18:05:16 -0700
Subject: [PATCH 117/210] ARROW-266: [C++] Fix broken build due to Flatbuffers namespace change

Author: Wes McKinney

Closes #122 from wesm/ARROW-266 and squashes the following commits:

6193323 [Wes McKinney] Fix broken build due to Flatbuffers namespace change
---
 cpp/src/arrow/ipc/adapter.cc | 2 +-
 cpp/src/arrow/ipc/metadata-internal.cc | 2 +-
 cpp/src/arrow/ipc/metadata-internal.h | 2 +-
 cpp/src/arrow/ipc/metadata.cc | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 3259980058b..40d372bbd35 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -41,7 +41,7 @@
 namespace arrow {
-namespace flatbuf = apache::arrow::flatbuf;
+namespace flatbuf = org::apache::arrow::flatbuf;
 namespace ipc {
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 8cd416ff585..16ba20f7e90 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -37,7 +37,7 @@
 typedef flatbuffers::Offset Offset;
 namespace arrow {
-namespace flatbuf = apache::arrow::flatbuf;
+namespace flatbuf = org::apache::arrow::flatbuf;
 namespace ipc {
diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h
index 871b5bc4bf6..5faa8c947b5 100644
--- a/cpp/src/arrow/ipc/metadata-internal.h
+++ b/cpp/src/arrow/ipc/metadata-internal.h
@@ -28,7 +28,7 @@
 namespace arrow {
-namespace flatbuf = apache::arrow::flatbuf;
+namespace flatbuf = org::apache::arrow::flatbuf;
 class Buffer;
 struct Field;
diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc
index 4fc8ec50eb7..e510755110e 100644
--- a/cpp/src/arrow/ipc/metadata.cc
+++ b/cpp/src/arrow/ipc/metadata.cc
@@ -33,7 +33,7 @@
 namespace arrow {
-namespace flatbuf = apache::arrow::flatbuf;
+namespace flatbuf = org::apache::arrow::flatbuf;
 namespace ipc {

From 78619686f44da5a28319032551b07ddfadc26468 Mon Sep 17 00:00:00 2001
From: Julien Le Dem
Date: Sat, 20 Aug 2016 10:39:50 -0700
Subject: [PATCH 118/210] ARROW-252: Add implementation guidelines to the documentation

Author: Julien Le Dem

Closes #120 from julienledem/arrow_252_impl_guidelines and squashes the following commits:

caf6994 [Julien Le Dem] ARROW-252: review feedback
6b68ce1 [Julien Le Dem] ARROW-252: Add implementation guidelines to the documentation
---
 format/Guidelines.md | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 format/Guidelines.md

diff --git a/format/Guidelines.md b/format/Guidelines.md
new file mode 100644
index 00000000000..14f10578504
--- /dev/null
+++ b/format/Guidelines.md
@@ -0,0 +1,17 @@
+# Implementation guidelines
+
+An execution engine (or framework, or UDF executor, or storage engine, etc.) can implement only a subset of the Arrow spec and/or extend it given the following constraints:
+
+## Implementing a subset of the spec
+### If only producing (and not consuming) Arrow vectors
+Any subset of the vector spec and the corresponding metadata can be implemented.
+
+### If consuming and producing vectors
+There is a minimal subset of vectors to be supported.
+Production of a subset of vectors and their corresponding metadata is always fine.
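[Editor's illustration, not part of the patch above: the consumption rule that follows asks implementations to convert unsupported input vectors into the supported subset. A minimal Java sketch of the int32-to-int64 widening case is shown below; the class and method names are invented for the example, and a real implementation would operate on Arrow buffers and carry the validity bitmap across unchanged rather than using plain arrays.]

```java
// Hypothetical example only: widening unsupported 32-bit integer input to a
// supported 64-bit representation, as the consumption guideline suggests.
public class WidenToInt64Example {

  // Sign-extends each 32-bit value into a 64-bit slot. In a real Arrow
  // implementation, null positions would be preserved via the validity bitmap.
  static long[] widen(int[] int32Values) {
    long[] int64Values = new long[int32Values.length];
    for (int i = 0; i < int32Values.length; i++) {
      int64Values[i] = int32Values[i]; // implicit sign-extending cast
    }
    return int64Values;
  }

  public static void main(String[] args) {
    long[] widened = widen(new int[] {1, -2, Integer.MAX_VALUE});
    System.out.println(java.util.Arrays.toString(widened)); // [1, -2, 2147483647]
  }
}
```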
+Consumption of vectors should at least convert the unsupported input vectors to the supported subset (for example Timestamp.millis to Timestamp.micros or int32 to int64).
+
+## Extensibility
+An execution engine implementor can also extend their memory representation with their own vectors internally as long as they are never exposed. Before sending data to another system expecting Arrow data these custom vectors should be converted to a type that exists in the Arrow spec.
+An example of this is operating on compressed data.
+These custom vectors are not exchanged externally and there is no support for custom metadata.

From 8960a2ed4c0d400be32003beb183f150e019c4ec Mon Sep 17 00:00:00 2001
From: Julien Le Dem
Date: Sat, 20 Aug 2016 13:02:45 -0700
Subject: [PATCH 119/210] ARROW-255: Finalize Dictionary representation

Author: Julien Le Dem

Closes #119 from julienledem/arrow_255_dictionary and squashes the following commits:

316745d [Julien Le Dem] ARROW-255: fix typo and linter errors
e28a3c8 [Julien Le Dem] ARROW-255: review feedback
8c27943 [Julien Le Dem] ARROW-255: Finalize Dictionary representation
---
 cpp/src/arrow/ipc/metadata-internal.cc | 3 ++-
 cpp/src/arrow/type.h | 11 +++++---
 format/Layout.md | 37 ++++++++++++++++++++++++++
 format/Message.fbs | 6 ++++-
 4 files changed, 52 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 16ba20f7e90..50db730d208 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -220,7 +220,8 @@ static Status FieldToFlatbuffer(
 auto fb_children = fbb.CreateVector(children);
 *offset = flatbuf::CreateField(
- fbb, fb_name, field->nullable, type_enum, type_data, fb_children);
+ fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary,
+ fb_children);
 return Status::OK();
 }
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 4cb37fd1dea..02677d5e18b 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -144,8 +144,13 @@ struct ARROW_EXPORT Field {
 // Fields can be nullable
 bool nullable;
- Field(const std::string& name, const TypePtr& type, bool nullable = true)
- : name(name), type(type), nullable(nullable) {}
+ // optional dictionary id if the field is dictionary encoded
+ // 0 means it's not dictionary encoded
+ int64_t dictionary;
+
+ Field(const std::string& name, const TypePtr& type, bool nullable = true,
+ int64_t dictionary = 0)
+ : name(name), type(type), nullable(nullable), dictionary(dictionary) {}
 bool operator==(const Field& other) const { return this->Equals(other); }
@@ -154,7 +159,7 @@
 bool Equals(const Field& other) const {
 return (this == &other) ||
 (this->name == other.name && this->nullable == other.nullable &&
- this->type->Equals(other.type.get()));
+ this->dictionary == other.dictionary && this->type->Equals(other.type.get()));
 }
 bool Equals(const std::shared_ptr<Field>& other) const { return Equals(*other.get()); }
diff --git a/format/Layout.md b/format/Layout.md
index 5eaefeebf21..a953930e172 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -583,6 +583,43 @@
 even if the null bitmap of the parent union array indicates the slot is null.
 Additionally, a child array may have a non-null slot even if the types array indicates that a slot contains a different type at the index.
+## Dictionary encoding
+
+When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary.
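[Editor's illustration, not part of the patch: a minimal Java sketch of the index-plus-dictionary representation described above, using plain Java collections in place of real Arrow vectors and a DictionaryBatch; the class and method names are invented for the example.]

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical example only: dictionary-encodes a column of values into a
// (dictionary, Int32 indices) pair, mirroring the layout described above.
public class DictionaryEncodeExample {

  static int[] encode(List<String> values, List<String> dictionary) {
    Map<String, Integer> idByValue = new HashMap<>();
    int[] indices = new int[values.size()];
    for (int i = 0; i < values.size(); i++) {
      String value = values.get(i);
      Integer id = idByValue.get(value);
      if (id == null) {
        id = dictionary.size();   // first occurrence defines the dictionary entry
        idByValue.put(value, id);
        dictionary.add(value);
      }
      indices[i] = id;            // each slot stores an index, not the value itself
    }
    return indices;
  }

  public static void main(String[] args) {
    List<String> dictionary = new ArrayList<>();
    int[] indices = encode(Arrays.asList("ab", "ab", "ab", "cde", "ab"), dictionary);
    System.out.println(dictionary);               // [ab, cde]
    System.out.println(Arrays.toString(indices)); // [0, 0, 0, 1, 0]
  }
}
```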
+The Dictionary is received as a DictionaryBatch whose id is referenced by a dictionary attribute defined in the metadata (Message.fbs) in the Field table.
+The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatch.
+When a Schema references a Dictionary id, it must send a DictionaryBatch for this id before any RecordBatch.
+
+As an example, you could have the following data:
+```
+type: List<String>
+
+[
+ ['a', 'b'],
+ ['a', 'b'],
+ ['a', 'b'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['c', 'd', 'e'],
+ ['a', 'b']
+]
+```
+In dictionary-encoded form, this could appear as:
+```
+data List<String> (dictionary-encoded, dictionary id i)
+indices: [0, 0, 0, 1, 1, 1, 1, 0]
+
+dictionary i
+
+type: List<String>
+
+[
+ ['a', 'b'],
+ ['c', 'd', 'e'],
+]
+```
+
 ## References
 Apache Drill Documentation - [Value Vectors][6]
diff --git a/format/Message.fbs b/format/Message.fbs
index 2928207db8c..a78009b6e5f 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -104,6 +104,10 @@ table Field {
 name: string;
 nullable: bool;
 type: Type;
+ // present only if the field is dictionary encoded
+ // will point to a dictionary provided by a DictionaryBatch message
+ dictionary: long;
+
 // children apply only to Nested data types like Struct, List and Union
 children: [Field];
 }
@@ -185,8 +189,8 @@ table RecordBatch {
 /// For sending dictionary encoding information. Any Field can be
 /// dictionary-encoded, but in this case none of its children may be
 /// dictionary-encoded.
+/// There is one dictionary batch per dictionary
 ///
-/// TODO(wesm): To be documented in more detail
 table DictionaryBatch {
 id: long;

From ec51d566708f5d6ea0a94a6d53152dc8cc98d6aa Mon Sep 17 00:00:00 2001
From: Steven Phillips
Date: Mon, 22 Aug 2016 13:10:06 -0700
Subject: [PATCH 120/210] ARROW-269: Include typeVector buffers in UnionVector.getBuffers()
---
 .../main/codegen/templates/UnionVector.java | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java
index e2f19f4b33b..1fef490d4ec 100644
--- a/java/vector/src/main/codegen/templates/UnionVector.java
+++ b/java/vector/src/main/codegen/templates/UnionVector.java
@@ -16,7 +16,9 @@
 * limitations under the License. 
*/ +import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; +import io.netty.buffer.ArrowBuf; import org.apache.arrow.flatbuf.Field; import org.apache.arrow.flatbuf.Type; import org.apache.arrow.flatbuf.Union; @@ -35,6 +37,7 @@ package org.apache.arrow.vector.complex; <#include "/@includes/vv_imports.ftl" /> +import com.google.common.collect.ImmutableList; import java.util.ArrayList; import java.util.Iterator; import org.apache.arrow.vector.complex.impl.ComplexCopier; @@ -219,6 +222,7 @@ public TransferPair makeTransferPair(ValueVector target) { } public void transferTo(org.apache.arrow.vector.complex.UnionVector target) { + typeVector.makeTransferPair(target.typeVector).transfer(); internalMap.makeTransferPair(target.internalMap).transfer(); target.valueCount = valueCount; } @@ -307,20 +311,9 @@ public FieldWriter getWriter() { return mutator.writer; } -// @Override -// public UserBitShared.SerializedField getMetadata() { -// SerializedField.Builder b = getField() // -// .getAsBuilder() // -// .setBufferLength(getBufferSize()) // -// .setValueCount(valueCount); -// -// b.addChild(internalMap.getMetadata()); -// return b.build(); -// } - @Override public int getBufferSize() { - return internalMap.getBufferSize(); + return typeVector.getBufferSize() + internalMap.getBufferSize(); } @Override @@ -339,7 +332,11 @@ public int getBufferSizeFor(final int valueCount) { @Override public ArrowBuf[] getBuffers(boolean clear) { - return internalMap.getBuffers(clear); + ImmutableList.Builder builder = ImmutableList.builder(); + builder.add(typeVector.getBuffers(clear)); + builder.add(internalMap.getBuffers(clear)); + List list = builder.build(); + return list.toArray(new ArrowBuf[list.size()]); } @Override @@ -411,6 +408,7 @@ public class Mutator extends BaseValueVector.BaseMutator { @Override public void setValueCount(int valueCount) { UnionVector.this.valueCount = valueCount; + typeVector.getMutator().setValueCount(valueCount); internalMap.getMutator().setValueCount(valueCount); } From 803afeb502dcdd802fada2ed0d66c145546b8a78 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 26 Aug 2016 08:20:13 -0700 Subject: [PATCH 121/210] ARROW-264: File format This is work in progress Author: Julien Le Dem Closes #123 from julienledem/arrow_264_file_format and squashes the following commits: 252de6d [Julien Le Dem] remove outdated comment 04d797f [Julien Le Dem] maps are not nullable yet e8359b3 [Julien Le Dem] align on 8 byte boundaries; more tests 8b8b823 [Julien Le Dem] refactoring 31e95e6 [Julien Le Dem] fix list vector b824938 [Julien Le Dem] fix types; add licenses; more tests; more complex 2fd3bc1 [Julien Le Dem] cleanup 50fe680 [Julien Le Dem] nested support b0bf6bc [Julien Le Dem] cleanup 4247b1a [Julien Le Dem] fix whitespace d6a1788 [Julien Le Dem] refactoring 81863c5 [Julien Le Dem] fixed loader aa1b766 [Julien Le Dem] better test 2067e01 [Julien Le Dem] update format aacf61e [Julien Le Dem] fix pom b907aa5 [Julien Le Dem] simplify e43f26b [Julien Le Dem] add layout spec 0cc9718 [Julien Le Dem] add vector type ac6902a [Julien Le Dem] ARROW-264: File format 807db51 [Julien Le Dem] move information to schema f2f0596 [Julien Le Dem] Update FieldNode structure to be more explicit and reflect schema --- cpp/src/arrow/ipc/metadata-internal.cc | 1 + format/File.fbs | 28 ++ format/Message.fbs | 21 +- java/format/pom.xml | 1 + .../main/java/io/netty/buffer/ArrowBuf.java | 71 ++-- .../src/main/codegen/data/ArrowTypes.tdd | 4 +- 
.../src/main/codegen/templates/ArrowType.java | 29 +- .../templates/NullableValueVectors.java | 49 ++- .../main/codegen/templates/UnionVector.java | 40 ++- .../arrow/vector/BaseDataValueVector.java | 38 +- .../org/apache/arrow/vector/BufferBacked.java | 31 ++ .../org/apache/arrow/vector/FieldVector.java | 65 ++++ .../org/apache/arrow/vector/ValueVector.java | 6 +- .../org/apache/arrow/vector/VectorLoader.java | 99 ++++++ .../apache/arrow/vector/VectorUnloader.java | 78 +++++ .../org/apache/arrow/vector/ZeroVector.java | 39 ++- .../complex/AbstractContainerVector.java | 21 +- .../vector/complex/AbstractMapVector.java | 42 ++- .../complex/BaseRepeatedValueVector.java | 21 +- .../arrow/vector/complex/ListVector.java | 58 ++- .../arrow/vector/complex/MapVector.java | 59 +++- .../complex/impl/ComplexWriterImpl.java | 2 +- .../vector/complex/impl/PromotableWriter.java | 3 +- .../apache/arrow/vector/file/ArrowBlock.java | 82 +++++ .../apache/arrow/vector/file/ArrowFooter.java | 144 ++++++++ .../apache/arrow/vector/file/ArrowReader.java | 151 ++++++++ .../apache/arrow/vector/file/ArrowWriter.java | 179 ++++++++++ .../file/InvalidArrowFileException.java | 27 ++ .../arrow/vector/schema/ArrowBuffer.java | 81 +++++ .../arrow/vector/schema/ArrowFieldNode.java | 53 +++ .../arrow/vector/schema/ArrowRecordBatch.java | 127 +++++++ .../arrow/vector/schema/ArrowVectorType.java | 47 +++ .../arrow/vector/schema/FBSerializable.java | 24 ++ .../arrow/vector/schema/FBSerializables.java | 37 ++ .../arrow/vector/schema/TypeLayout.java | 208 +++++++++++ .../arrow/vector/schema/VectorLayout.java | 93 +++++ .../org/apache/arrow/vector/types/Types.java | 70 ++-- .../apache/arrow/vector/types/pojo/Field.java | 42 ++- .../arrow/vector/types/pojo/Schema.java | 13 +- .../arrow/vector/TestVectorUnloadLoad.java | 89 +++++ .../ByteArrayReadableSeekableByteChannel.java | 80 +++++ .../arrow/vector/file/TestArrowFile.java | 331 ++++++++++++++++++ .../arrow/vector/file/TestArrowFooter.java | 56 +++ .../vector/file/TestArrowReaderWriter.java | 106 ++++++ .../apache/arrow/vector/pojo/TestConvert.java | 38 +- 45 files changed, 2722 insertions(+), 162 deletions(-) create mode 100644 format/File.fbs create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java create mode 100644 
java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java
 create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java
 create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/ByteArrayReadableSeekableByteChannel.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java

diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 50db730d208..c921e4d8e01 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -219,6 +219,7 @@ static Status FieldToFlatbuffer(
 RETURN_NOT_OK(TypeToFlatbuffer(fbb, field->type, &children, &type_enum, &type_data));
 auto fb_children = fbb.CreateVector(children);
+ // TODO: produce the list of VectorTypes
 *offset = flatbuf::CreateField(
 fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary,
 fb_children);
 return Status::OK();
 }
diff --git a/format/File.fbs b/format/File.fbs
new file mode 100644
index 00000000000..f7ad1e1594a
--- /dev/null
+++ b/format/File.fbs
@@ -0,0 +1,28 @@
+include "Message.fbs";
+
+namespace org.apache.arrow.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Arrow File metadata
+///
+
+table Footer {
+
+  schema: org.apache.arrow.flatbuf.Schema;
+
+  dictionaries: [ Block ];
+
+  recordBatches: [ Block ];
+}
+
+struct Block {
+
+  offset: long;
+
+  metaDataLength: int;
+
+  bodyLength: long;
+
+}
+
+root_type Footer;
diff --git a/format/Message.fbs b/format/Message.fbs
index a78009b6e5f..b02f3fa3869 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -17,7 +17,7 @@ table Tuple {
 }
 table List {
 }
-enum UnionMode:int { Sparse, Dense }
+enum UnionMode:short { Sparse, Dense }
 table Union {
 mode: UnionMode;
@@ -28,7 +28,7 @@ table Int {
 is_signed: bool;
 }
-enum Precision:int {SINGLE, DOUBLE}
+enum Precision:short {SINGLE, DOUBLE}
 table FloatingPoint {
 precision: Precision;
@@ -91,6 +91,17 @@ union Type {
 JSONScalar
 }
+enum VectorType: short {
+  /// used in List type, Dense Union and variable length primitive types (String, Binary)
+  OFFSET,
+  /// fixed length primitive values
+  VALUES,
+  /// Bit vector indicating if each value is null
+  VALIDITY,
+  /// Type vector used in Union type
+  TYPE
+}
+
 /// ----------------------------------------------------------------------
 /// A field represents a named column in a record / row batch or child of a
 /// nested type.
@@ -109,12 +120,16 @@ table Field {
 dictionary: long;
 // children apply only to Nested data types like Struct, List and Union
 children: [Field];
+ /// the buffers produced for this type (as derived from the Type)
+ /// does not include children
+ /// each RecordBatch will return instances of those Buffers. 
+ buffers: [ VectorType ]; } /// ---------------------------------------------------------------------- /// Endianness of the platform that produces the RecordBatch -enum Endianness:int { Little, Big } +enum Endianness:short { Little, Big } /// ---------------------------------------------------------------------- /// A Schema describes the columns in a row batch diff --git a/java/format/pom.xml b/java/format/pom.xml index cb11b5ff3c4..dc5897581b5 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -106,6 +106,7 @@ -o target/generated-sources/ ../../format/Message.fbs + ../../format/File.fbs diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index bbec26aa85c..d10f00247e6 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -17,8 +17,6 @@ */ package io.netty.buffer; -import io.netty.util.internal.PlatformDependent; - import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; @@ -30,16 +28,18 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; +import org.apache.arrow.memory.AllocationManager.BufferLedger; import org.apache.arrow.memory.BaseAllocator; +import org.apache.arrow.memory.BaseAllocator.Verbosity; import org.apache.arrow.memory.BoundsChecking; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.BufferManager; -import org.apache.arrow.memory.AllocationManager.BufferLedger; -import org.apache.arrow.memory.BaseAllocator.Verbosity; import org.apache.arrow.memory.util.HistoricalLog; import com.google.common.base.Preconditions; +import io.netty.util.internal.PlatformDependent; + public final class ArrowBuf extends AbstractByteBuf implements AutoCloseable { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ArrowBuf.class); @@ -307,7 +307,7 @@ public ByteOrder order() { } @Override - public ByteBuf order(ByteOrder endianness) { + public ArrowBuf order(ByteOrder endianness) { return this; } @@ -344,7 +344,7 @@ public ByteBuf copy(int index, int length) { } @Override - public ByteBuf slice() { + public ArrowBuf slice() { return slice(readerIndex(), readableBytes()); } @@ -467,7 +467,7 @@ public boolean equals(Object obj) { } @Override - public ByteBuf retain(int increment) { + public ArrowBuf retain(int increment) { Preconditions.checkArgument(increment > 0, "retain(%d) argument is not positive", increment); if (isEmpty) { @@ -484,7 +484,7 @@ public ByteBuf retain(int increment) { } @Override - public ByteBuf retain() { + public ArrowBuf retain() { return retain(1); } @@ -535,49 +535,49 @@ public short getShort(int index) { } @Override - public ByteBuf setShort(int index, int value) { + public ArrowBuf setShort(int index, int value) { chk(index, 2); PlatformDependent.putShort(addr(index), (short) value); return this; } @Override - public ByteBuf setInt(int index, int value) { + public ArrowBuf setInt(int index, int value) { chk(index, 4); PlatformDependent.putInt(addr(index), value); return this; } @Override - public ByteBuf setLong(int index, long value) { + public ArrowBuf setLong(int index, long value) { chk(index, 8); PlatformDependent.putLong(addr(index), value); return this; } @Override - public ByteBuf setChar(int index, int value) { + public ArrowBuf setChar(int index, int value) { chk(index, 2); PlatformDependent.putShort(addr(index), (short) value); return this; } @Override - public 
ByteBuf setFloat(int index, float value) { + public ArrowBuf setFloat(int index, float value) { chk(index, 4); PlatformDependent.putInt(addr(index), Float.floatToRawIntBits(value)); return this; } @Override - public ByteBuf setDouble(int index, double value) { + public ArrowBuf setDouble(int index, double value) { chk(index, 8); PlatformDependent.putLong(addr(index), Double.doubleToRawLongBits(value)); return this; } @Override - public ByteBuf writeShort(int value) { + public ArrowBuf writeShort(int value) { ensure(2); PlatformDependent.putShort(addr(writerIndex), (short) value); writerIndex += 2; @@ -585,7 +585,7 @@ public ByteBuf writeShort(int value) { } @Override - public ByteBuf writeInt(int value) { + public ArrowBuf writeInt(int value) { ensure(4); PlatformDependent.putInt(addr(writerIndex), value); writerIndex += 4; @@ -593,7 +593,7 @@ public ByteBuf writeInt(int value) { } @Override - public ByteBuf writeLong(long value) { + public ArrowBuf writeLong(long value) { ensure(8); PlatformDependent.putLong(addr(writerIndex), value); writerIndex += 8; @@ -601,7 +601,7 @@ public ByteBuf writeLong(long value) { } @Override - public ByteBuf writeChar(int value) { + public ArrowBuf writeChar(int value) { ensure(2); PlatformDependent.putShort(addr(writerIndex), (short) value); writerIndex += 2; @@ -609,7 +609,7 @@ public ByteBuf writeChar(int value) { } @Override - public ByteBuf writeFloat(float value) { + public ArrowBuf writeFloat(float value) { ensure(4); PlatformDependent.putInt(addr(writerIndex), Float.floatToRawIntBits(value)); writerIndex += 4; @@ -617,7 +617,7 @@ public ByteBuf writeFloat(float value) { } @Override - public ByteBuf writeDouble(double value) { + public ArrowBuf writeDouble(double value) { ensure(8); PlatformDependent.putLong(addr(writerIndex), Double.doubleToRawLongBits(value)); writerIndex += 8; @@ -625,19 +625,19 @@ public ByteBuf writeDouble(double value) { } @Override - public ByteBuf getBytes(int index, byte[] dst, int dstIndex, int length) { + public ArrowBuf getBytes(int index, byte[] dst, int dstIndex, int length) { udle.getBytes(index + offset, dst, dstIndex, length); return this; } @Override - public ByteBuf getBytes(int index, ByteBuffer dst) { + public ArrowBuf getBytes(int index, ByteBuffer dst) { udle.getBytes(index + offset, dst); return this; } @Override - public ByteBuf setByte(int index, int value) { + public ArrowBuf setByte(int index, int value) { chk(index, 1); PlatformDependent.putByte(addr(index), (byte) value); return this; @@ -699,13 +699,13 @@ protected void _setLong(int index, long value) { } @Override - public ByteBuf getBytes(int index, ByteBuf dst, int dstIndex, int length) { + public ArrowBuf getBytes(int index, ByteBuf dst, int dstIndex, int length) { udle.getBytes(index + offset, dst, dstIndex, length); return this; } @Override - public ByteBuf getBytes(int index, OutputStream out, int length) throws IOException { + public ArrowBuf getBytes(int index, OutputStream out, int length) throws IOException { udle.getBytes(index + offset, out, length); return this; } @@ -724,12 +724,12 @@ public int getBytes(int index, GatheringByteChannel out, int length) throws IOEx } @Override - public ByteBuf setBytes(int index, ByteBuf src, int srcIndex, int length) { + public ArrowBuf setBytes(int index, ByteBuf src, int srcIndex, int length) { udle.setBytes(index + offset, src, srcIndex, length); return this; } - public ByteBuf setBytes(int index, ByteBuffer src, int srcIndex, int length) { + public ArrowBuf setBytes(int index, ByteBuffer src, int 
srcIndex, int length) { if (src.isDirect()) { checkIndex(index, length); PlatformDependent.copyMemory(PlatformDependent.directBufferAddress(src) + srcIndex, this.memoryAddress() + index, @@ -749,13 +749,13 @@ public ByteBuf setBytes(int index, ByteBuffer src, int srcIndex, int length) { } @Override - public ByteBuf setBytes(int index, byte[] src, int srcIndex, int length) { + public ArrowBuf setBytes(int index, byte[] src, int srcIndex, int length) { udle.setBytes(index + offset, src, srcIndex, length); return this; } @Override - public ByteBuf setBytes(int index, ByteBuffer src) { + public ArrowBuf setBytes(int index, ByteBuffer src) { udle.setBytes(index + offset, src); return this; } @@ -860,4 +860,17 @@ public void print(StringBuilder sb, int indent, Verbosity verbosity) { } } + @Override + public ArrowBuf readerIndex(int readerIndex) { + super.readerIndex(readerIndex); + return this; + } + + @Override + public ArrowBuf writerIndex(int writerIndex) { + super.writerIndex(writerIndex); + return this; + } + + } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 4ab7f8562f9..2ecad3d3140 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -30,7 +30,7 @@ }, { name: "Union", - fields: [] + fields: [{name: "mode", type: short}] }, { name: "Int", @@ -38,7 +38,7 @@ }, { name: "FloatingPoint", - fields: [{name: precision, type: int}] + fields: [{name: precision, type: short}] }, { name: "Utf8", diff --git a/java/vector/src/main/codegen/templates/ArrowType.java b/java/vector/src/main/codegen/templates/ArrowType.java index 6dfaf216ad0..29dee20040a 100644 --- a/java/vector/src/main/codegen/templates/ArrowType.java +++ b/java/vector/src/main/codegen/templates/ArrowType.java @@ -24,9 +24,8 @@ <@pp.dropOutputFile /> <@pp.changeOutputFile name="/org/apache/arrow/vector/types/pojo/ArrowType.java" /> - - <#include "/@includes/license.ftl" /> + package org.apache.arrow.vector.types.pojo; import com.google.flatbuffers.FlatBufferBuilder; @@ -38,7 +37,13 @@ public abstract class ArrowType { public abstract byte getTypeType(); public abstract int getType(FlatBufferBuilder builder); + public abstract T accept(ArrowTypeVisitor visitor); + public static interface ArrowTypeVisitor { + <#list arrowTypes.types as type> + T visit(${type.name} type); + + } <#list arrowTypes.types as type> <#assign name = type.name> @@ -70,9 +75,14 @@ public byte getTypeType() { @Override public int getType(FlatBufferBuilder builder) { + <#list type.fields as field> + <#if field.type == "String"> + int ${field.name} = builder.createString(this.${field.name}); + + org.apache.arrow.flatbuf.${type.name}.start${type.name}(builder); <#list type.fields as field> - org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, <#if field.type == "String">builder.createString(${field.name})<#else>${field.name}); + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, ${field.name}); return org.apache.arrow.flatbuf.${type.name}.end${type.name}(builder); } @@ -83,6 +93,14 @@ public int getType(FlatBufferBuilder builder) { } + public String toString() { + return "${name}{" + <#list fields as field> + + ", " + ${field.name} + + + "}"; + } + @Override public int hashCode() { return Objects.hash(<#list type.fields as field>${field.name}<#if field_has_next>, ); @@ -102,6 +120,11 @@ public boolean equals(Object obj) { } + + @Override + public T accept(ArrowTypeVisitor visitor) { + return 
visitor.visit(this); + } } diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index df508979c48..6b1aa040a5b 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -29,6 +29,9 @@ package org.apache.arrow.vector; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import java.util.Collections; + <#include "/@includes/vv_imports.ftl" /> /** @@ -39,7 +42,7 @@ * NB: this class is automatically generated from ${.template_name} and ValueVectorTypes.tdd using FreeMarker. */ @SuppressWarnings("unused") -public final class ${className} extends BaseDataValueVector implements <#if type.major == "VarLen">VariableWidth<#else>FixedWidthVector, NullableVector{ +public final class ${className} extends BaseDataValueVector implements <#if type.major == "VarLen">VariableWidth<#else>FixedWidthVector, NullableVector, FieldVector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(${className}.class); private final FieldReader reader = new ${minor.class}ReaderImpl(Nullable${minor.class}Vector.this); @@ -54,6 +57,8 @@ public final class ${className} extends BaseDataValueVector implements <#if type private final Mutator mutator; private final Accessor accessor; + private final List innerVectors; + <#if minor.class == "Decimal"> private final int precision; private final int scale; @@ -66,6 +71,10 @@ public final class ${className} extends BaseDataValueVector implements <#if type mutator = new Mutator(); accessor = new Accessor(); field = new Field(name, true, new Decimal(precision, scale), null); + innerVectors = Collections.unmodifiableList(Arrays.asList( + bits, + values + )); } <#else> public ${className}(String name, BufferAllocator allocator) { @@ -88,9 +97,9 @@ public final class ${className} extends BaseDataValueVector implements <#if type <#elseif minor.class == "Time"> field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Time(), null); <#elseif minor.class == "Float4"> - field = new Field(name, true, new FloatingPoint(0), null); + field = new Field(name, true, new FloatingPoint(org.apache.arrow.flatbuf.Precision.SINGLE), null); <#elseif minor.class == "Float8"> - field = new Field(name, true, new FloatingPoint(1), null); + field = new Field(name, true, new FloatingPoint(org.apache.arrow.flatbuf.Precision.DOUBLE), null); <#elseif minor.class == "TimeStamp"> field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(""), null); <#elseif minor.class == "IntervalDay"> @@ -104,9 +113,43 @@ public final class ${className} extends BaseDataValueVector implements <#if type <#elseif minor.class == "Bit"> field = new Field(name, true, new Bool(), null); + innerVectors = Collections.unmodifiableList(Arrays.asList( + bits, + <#if type.major = "VarLen"> + values.offsetVector, + + values + )); } + @Override + public List getFieldInnerVectors() { + return innerVectors; + } + + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("primitive type vector ${className} can not have children: " + children); + } + } + + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), 
ownBuffers); + // TODO: do something with the sizes in fieldNode? + } + + public List getFieldBuffers() { + return org.apache.arrow.vector.BaseDataValueVector.unload(getFieldInnerVectors()); + } + @Override public Field getField() { return field; diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 1fef490d4ec..72125fa50fb 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -42,6 +42,10 @@ import java.util.Iterator; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.schema.ArrowFieldNode; + +import static org.apache.arrow.flatbuf.UnionMode.Sparse; + /* * This class is generated using freemarker and the ${.template_name} template. @@ -57,7 +61,7 @@ * For performance reasons, UnionVector stores a cached reference to each subtype vector, to avoid having to do the map lookup * each time the vector is accessed. */ -public class UnionVector implements ValueVector { +public class UnionVector implements FieldVector { private String name; private BufferAllocator allocator; @@ -95,6 +99,34 @@ public MinorType getMinorType() { return MinorType.UNION; } + @Override + public void initializeChildrenFromFields(List children) { + getMap().initializeChildrenFromFields(children); + } + + @Override + public List getChildrenFromFields() { + return getMap().getChildrenFromFields(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + // TODO + throw new UnsupportedOperationException(); + } + + @Override + public List getFieldBuffers() { + // TODO + throw new UnsupportedOperationException(); + } + + @Override + public List getFieldInnerVectors() { + // TODO + throw new UnsupportedOperationException(); + } + public MapVector getMap() { if (mapVector == null) { int vectorCount = internalMap.size(); @@ -203,7 +235,7 @@ public Field getField() { for (ValueVector v : internalMap.getChildren()) { childFields.add(v.getField()); } - return new Field(name, true, new ArrowType.Union(), childFields); + return new Field(name, true, new ArrowType.Union(Sparse), childFields); } @Override @@ -237,10 +269,10 @@ public void copyFromSafe(int inIndex, int outIndex, UnionVector from) { copyFrom(inIndex, outIndex, from); } - public ValueVector addVector(ValueVector v) { + public FieldVector addVector(FieldVector v) { String name = v.getMinorType().name().toLowerCase(); Preconditions.checkState(internalMap.getChild(name) == null, String.format("%s vector already exists", name)); - final ValueVector newVector = internalMap.addOrGet(name, v.getMinorType(), v.getClass()); + final FieldVector newVector = internalMap.addOrGet(name, v.getMinorType(), v.getClass()); v.makeTransferPair(newVector).transfer(); internalMap.putChild(name, newVector); if (callBack != null) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java index 05b7cf10067..c22258d4265 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseDataValueVector.java @@ -17,15 +17,38 @@ */ package org.apache.arrow.vector; -import io.netty.buffer.ArrowBuf; +import java.util.ArrayList; +import java.util.List; import org.apache.arrow.memory.BufferAllocator; +import io.netty.buffer.ArrowBuf; + 
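To make the new contract concrete: a generated nullable vector exposes its inner vectors in TypeLayout order, so the list returned by getFieldBuffers() lines up one-to-one with them. A minimal sketch of the round trip, assuming the generated NullableVarCharVector class and an allocator already in scope (both illustrative, not shown in this hunk):

  // Sketch only, not part of this patch: round-trip a nullable vector's own buffers.
  // For a VarLen type the inner vectors are [bits, offsets, values], so
  // getFieldBuffers() returns three ArrowBufs in that same order.
  NullableVarCharVector source = new NullableVarCharVector("col", allocator);
  // ... populate source through its mutator ...
  int rowCount = source.getAccessor().getValueCount();
  List<ArrowBuf> own = source.getFieldBuffers();
  NullableVarCharVector target = new NullableVarCharVector("col", allocator);
  target.loadFieldBuffers(new ArrowFieldNode(rowCount, 0), own); // 0 nulls assumed for the sketch

Note that the buffers are shared (retained), not copied, so both vectors see the same memory after the load.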
-public abstract class BaseDataValueVector extends BaseValueVector { +public abstract class BaseDataValueVector extends BaseValueVector implements BufferBacked { protected final static byte[] emptyByteArray = new byte[]{}; // Nullable vectors use this + public static void load(List vectors, List buffers) { + int expectedSize = vectors.size(); + if (buffers.size() != expectedSize) { + throw new IllegalArgumentException("Illegal buffer count, expected " + expectedSize + ", got: " + buffers.size()); + } + for (int i = 0; i < expectedSize; i++) { + vectors.get(i).load(buffers.get(i)); + } + } + + public static List unload(List vectors) { + List result = new ArrayList<>(vectors.size()); + for (BufferBacked vector : vectors) { + result.add(vector.unLoad()); + } + return result; + } + + // TODO: Nullable vectors extend BaseDataValueVector but do not use the data field + // We should fix the inheritance tree protected ArrowBuf data; public BaseDataValueVector(String name, BufferAllocator allocator) { @@ -82,6 +105,17 @@ public ArrowBuf getBuffer() { return data; } + @Override + public void load(ArrowBuf data) { + this.data.release(); + this.data = data.retain(allocator); + } + + @Override + public ArrowBuf unLoad() { + return this.data.readerIndex(0); + } + /** * This method has a similar effect of allocateNew() without actually clearing and reallocating * the value vector. The purpose is to move the value vector to a "mutate" state diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java new file mode 100644 index 00000000000..d1c262d2265 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/BufferBacked.java @@ -0,0 +1,31 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import io.netty.buffer.ArrowBuf; + +/** + * Content is backed by a buffer and can be loaded/unloaded + */ +public interface BufferBacked { + + void load(ArrowBuf data); + + ArrowBuf unLoad(); + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java new file mode 100644 index 00000000000..b28433cfd0d --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/FieldVector.java @@ -0,0 +1,65 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import java.util.List; + +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.types.pojo.Field; + +import io.netty.buffer.ArrowBuf; + +/** + * A vector corresponding to a Field in the schema + * It has inner vectors backed by buffers (validity, offsets, data, ...) + */ +public interface FieldVector extends ValueVector { + + /** + * Initializes the child vectors + * to be later loaded with loadBuffers + * @param children the schema + */ + void initializeChildrenFromFields(List children); + + /** + * the returned list is the same size as the list passed to initializeChildrenFromFields + * @return the children according to schema (empty for primitive types) + */ + List getChildrenFromFields(); + + /** + * loads data in the vectors + * (ownBuffers must be the same size as getFieldVectors()) + * @param fieldNode the fieldNode + * @param ownBuffers the buffers for this Field (own buffers only, children not included) + */ + void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers); + + /** + * (same size as getFieldVectors() since it is their content) + * @return the buffers containing the data for this vector (ready for reading) + */ + List getFieldBuffers(); + + /** + * @return the inner vectors for this field as defined by the TypeLayout + */ + List getFieldInnerVectors(); + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java index 35321c947db..ba7790e47ef 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java @@ -19,14 +19,14 @@ import java.io.Closeable; -import io.netty.buffer.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.TransferPair; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.TransferPair; + +import io.netty.buffer.ArrowBuf; /** * An abstraction that is used to store a sequence of values in an individual column. diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java new file mode 100644 index 00000000000..58ac68b8282 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import static com.google.common.base.Preconditions.checkArgument; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.schema.VectorLayout; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.google.common.collect.Iterators; + +import io.netty.buffer.ArrowBuf; + +/** + * Loads buffers into vectors + */ +public class VectorLoader { + private final List fieldVectors; + private final List fields; + + /** + * will create children in root based on schema + * @param schema the expected schema + * @param root the root to add vectors to based on schema + */ + public VectorLoader(Schema schema, FieldVector root) { + super(); + this.fields = schema.getFields(); + root.initializeChildrenFromFields(fields); + this.fieldVectors = root.getChildrenFromFields(); + if (this.fieldVectors.size() != fields.size()) { + throw new IllegalArgumentException("The root vector did not create the right number of children. found " + fieldVectors.size() + " expected " + fields.size()); + } + } + + /** + * Loads the record batch in the vectors + * will not close the record batch + * @param recordBatch the batch to load + */ + public void load(ArrowRecordBatch recordBatch) { + Iterator buffers = recordBatch.getBuffers().iterator(); + Iterator nodes = recordBatch.getNodes().iterator(); + for (int i = 0; i < fields.size(); ++i) { + Field field = fields.get(i); + FieldVector fieldVector = fieldVectors.get(i); + loadBuffers(fieldVector, field, buffers, nodes); + } + if (nodes.hasNext() || buffers.hasNext()) { + throw new IllegalArgumentException("not all nodes and buffers were consumed.
nodes: " + Iterators.toString(nodes) + " buffers: " + Iterators.toString(buffers)); + } + } + + private void loadBuffers(FieldVector vector, Field field, Iterator buffers, Iterator nodes) { + ArrowFieldNode fieldNode = nodes.next(); + List typeLayout = field.getTypeLayout().getVectors(); + List ownBuffers = new ArrayList<>(typeLayout.size()); + for (int j = 0; j < typeLayout.size(); j++) { + ownBuffers.add(buffers.next()); + } + try { + vector.loadFieldBuffers(fieldNode, ownBuffers); + } catch (RuntimeException e) { + throw new IllegalArgumentException("Could not load buffers for field " + field, e); + } + List children = field.getChildren(); + if (children.size() > 0) { + List childrenFromFields = vector.getChildrenFromFields(); + checkArgument(children.size() == childrenFromFields.size(), "should have as many children as in the schema: found " + childrenFromFields.size() + " expected " + children.size()); + for (int i = 0; i < childrenFromFields.size(); i++) { + Field child = children.get(i); + FieldVector fieldVector = childrenFromFields.get(i); + loadBuffers(fieldVector, child, buffers, nodes); + } + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java new file mode 100644 index 00000000000..e4d37bf47d1 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.arrow.vector; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.types.pojo.Schema; + +import io.netty.buffer.ArrowBuf; + +public class VectorUnloader { + + private final Schema schema; + private final int valueCount; + private final List vectors; + + public VectorUnloader(FieldVector parent) { + super(); + this.schema = new Schema(parent.getField().getChildren()); + this.valueCount = parent.getAccessor().getValueCount(); + this.vectors = parent.getChildrenFromFields(); + } + + public Schema getSchema() { + return schema; + } + + public ArrowRecordBatch getRecordBatch() { + List nodes = new ArrayList<>(); + List buffers = new ArrayList<>(); + for (FieldVector vector : vectors) { + appendNodes(vector, nodes, buffers); + } + return new ArrowRecordBatch(valueCount, nodes, buffers); + } + + private void appendNodes(FieldVector vector, List nodes, List buffers) { + Accessor accessor = vector.getAccessor(); + int nullCount = 0; + // TODO: should not have to do that + // we can do that a lot more efficiently (for example with Long.bitCount(i)) + for (int i = 0; i < accessor.getValueCount(); i++) { + if (accessor.isNull(i)) { + nullCount ++; + } + } + nodes.add(new ArrowFieldNode(accessor.getValueCount(), nullCount)); + List fieldBuffers = vector.getFieldBuffers(); + List expectedBuffers = vector.getField().getTypeLayout().getVectorTypes(); + if (fieldBuffers.size() != expectedBuffers.size()) { + throw new IllegalArgumentException("wrong number of buffers for field " + vector.getField() + ". 
found: " + fieldBuffers); + } + buffers.addAll(fieldBuffers); + for (FieldVector child : vector.getChildrenFromFields()) { + appendNodes(child, nodes, buffers); + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java index 705a24b02fe..c2482adefec 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/ZeroVector.java @@ -17,25 +17,23 @@ */ package org.apache.arrow.vector; -import com.google.flatbuffers.FlatBufferBuilder; -import io.netty.buffer.ArrowBuf; - import java.util.Collections; import java.util.Iterator; +import java.util.List; -import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.impl.NullReader; import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.TransferPair; -import com.google.common.collect.Iterators; +import io.netty.buffer.ArrowBuf; -public class ZeroVector implements ValueVector { +public class ZeroVector implements FieldVector { public final static ZeroVector INSTANCE = new ZeroVector(); private final String name = "[DEFAULT]"; @@ -175,4 +173,33 @@ public Mutator getMutator() { public FieldReader getReader() { return NullReader.INSTANCE; } + + @Override + public void initializeChildrenFromFields(List children) { + if (!children.isEmpty()) { + throw new IllegalArgumentException("Zero vector has no children"); + } + } + + @Override + public List getChildrenFromFields() { + return Collections.emptyList(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + if (!ownBuffers.isEmpty()) { + throw new IllegalArgumentException("Zero vector has no buffers"); + } + } + + @Override + public List getFieldBuffers() { + return Collections.emptyList(); + } + + @Override + public List getFieldInnerVectors() { + return Collections.emptyList(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java index ed7797576d6..2f68886a169 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractContainerVector.java @@ -17,22 +17,13 @@ */ package org.apache.arrow.vector.complex; -import java.util.Collection; - -import javax.annotation.Nullable; - -import org.apache.arrow.flatbuf.Field; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.CallBack; -import com.google.common.base.Function; -import com.google.common.base.Preconditions; -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; - /** * Base class for composite vectors. * @@ -65,8 +56,8 @@ public BufferAllocator getAllocator() { /** * Returns a {@link org.apache.arrow.vector.ValueVector} corresponding to the given field name if exists or null. 
*/ - public ValueVector getChild(String name) { - return getChild(name, ValueVector.class); + public FieldVector getChild(String name) { + return getChild(name, FieldVector.class); } /** @@ -81,7 +72,7 @@ public void close() { protected T typeify(ValueVector v, Class clazz) { if (clazz.isAssignableFrom(v.getClass())) { - return (T) v; + return clazz.cast(v); } throw new IllegalStateException(String.format("Vector requested [%s] was different from type stored [%s]. Arrow doesn't yet support heterogeneous types.", clazz.getSimpleName(), v.getClass().getSimpleName())); } @@ -94,10 +85,10 @@ protected boolean supportsDirectRead() { public abstract int size(); // add a new vector with the input MajorType or return the existing vector if we already added one with the same type - public abstract T addOrGet(String name, MinorType minorType, Class clazz, int... precisionScale); + public abstract T addOrGet(String name, MinorType minorType, Class clazz, int... precisionScale); // return the child vector with the input name - public abstract T getChild(String name, Class clazz); + public abstract T getChild(String name, Class clazz); // return the child vector's ordinal in the composite container public abstract VectorWithOrdinal getChildVectorWithOrdinal(String name); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java index 5964f800791..23b4997f4f5 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/AbstractMapVector.java @@ -17,23 +17,24 @@ */ package org.apache.arrow.vector.complex; -import com.google.common.collect.ImmutableList; -import io.netty.buffer.ArrowBuf; - import java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; import java.util.List; -import org.apache.arrow.flatbuf.Field; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.MapWithOrdinal; import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; +import io.netty.buffer.ArrowBuf; + /* * Base class for MapVectors. Currently used by RepeatedMapVector and MapVector */ @@ -41,7 +42,7 @@ public abstract class AbstractMapVector extends AbstractContainerVector { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(AbstractContainerVector.class); // Maintains a map with key as field name and value as the vector itself - private final MapWithOrdinal vectors = new MapWithOrdinal<>(); + private final MapWithOrdinal vectors = new MapWithOrdinal<>(); protected AbstractMapVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, allocator, callBack); @@ -109,19 +110,19 @@ public boolean allocateNewSafe() { * @return resultant {@link org.apache.arrow.vector.ValueVector} */ @Override - public T addOrGet(String name, MinorType minorType, Class clazz, int...
precisionScale) { final ValueVector existing = getChild(name); boolean create = false; if (existing == null) { create = true; } else if (clazz.isAssignableFrom(existing.getClass())) { - return (T) existing; + return clazz.cast(existing); } else if (nullFilled(existing)) { existing.clear(); create = true; } if (create) { - final T vector = (T) minorType.getNewVector(name, allocator, callBack, precisionScale); + final T vector = clazz.cast(minorType.getNewVector(name, allocator, callBack, precisionScale)); putChild(name, vector); if (callBack!=null) { callBack.doWork(); @@ -153,7 +154,7 @@ public ValueVector getChildByOrdinal(int id) { * field name if exists or null. */ @Override - public T getChild(String name, Class clazz) { + public T getChild(String name, Class clazz) { final ValueVector v = vectors.get(name.toLowerCase()); if (v == null) { return null; @@ -161,12 +162,25 @@ public T getChild(String name, Class clazz) { return typeify(v, clazz); } + protected ValueVector add(String name, MinorType minorType, int... precisionScale) { + final ValueVector existing = getChild(name); + if (existing != null) { + throw new IllegalStateException(String.format("Vector already exists: Existing[%s], Requested[%s] ", existing.getClass().getSimpleName(), minorType)); + } + FieldVector vector = minorType.getNewVector(name, allocator, callBack, precisionScale); + putChild(name, vector); + if (callBack!=null) { + callBack.doWork(); + } + return vector; + } + /** * Inserts the vector with the given name if it does not exist else replaces it with the new value. * * Note that this method does not enforce any vector type check nor throws a schema change exception. */ - protected void putChild(String name, ValueVector vector) { + protected void putChild(String name, FieldVector vector) { putVector(name, vector); } @@ -175,7 +189,7 @@ protected void putChild(String name, ValueVector vector) { * @param name field name * @param vector vector to be inserted */ - protected void putVector(String name, ValueVector vector) { + protected void putVector(String name, FieldVector vector) { final ValueVector old = vectors.put( Preconditions.checkNotNull(name, "field name cannot be null").toLowerCase(), Preconditions.checkNotNull(vector, "vector cannot be null") @@ -189,9 +203,9 @@ protected void putVector(String name, ValueVector vector) { /** * Returns a sequence of underlying child vectors. 
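With the container API narrowed to FieldVector, typed child access needs no casts at the call site. A small sketch (the names, and NullableIntVector as one of the generated classes, are illustrative):

  // Sketch only: addOrGet returns the existing child when the requested type matches.
  MapVector map = new MapVector("m", allocator, null);
  NullableIntVector a = map.addOrGet("a", MinorType.INT, NullableIntVector.class);
  NullableIntVector b = map.getChild("a", NullableIntVector.class); // same instance as a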
*/ - protected List getChildren() { + protected List getChildren() { int size = vectors.size(); - List children = new ArrayList<>(); + List children = new ArrayList<>(); for (int i = 0; i < size; i++) { children.add(vectors.getByOrdinal(i)); } @@ -216,7 +230,7 @@ public int size() { @Override public Iterator iterator() { - return vectors.values().iterator(); + return Collections.unmodifiableCollection(vectors.values()).iterator(); } /** diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java index 42262741df9..517d20c77a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java @@ -17,8 +17,6 @@ */ package org.apache.arrow.vector.complex; -import io.netty.buffer.ArrowBuf; - import java.util.Collections; import java.util.Iterator; @@ -26,29 +24,32 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.AddOrGetResult; import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.util.SchemaChangeRuntimeException; import com.google.common.base.Preconditions; import com.google.common.collect.ObjectArrays; -import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.SchemaChangeRuntimeException; + +import io.netty.buffer.ArrowBuf; public abstract class BaseRepeatedValueVector extends BaseValueVector implements RepeatedValueVector { - public final static ValueVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; + public final static FieldVector DEFAULT_DATA_VECTOR = ZeroVector.INSTANCE; public final static String OFFSETS_VECTOR_NAME = "$offsets$"; public final static String DATA_VECTOR_NAME = "$data$"; protected final UInt4Vector offsets; - protected ValueVector vector; + protected FieldVector vector; protected BaseRepeatedValueVector(String name, BufferAllocator allocator) { this(name, allocator, DEFAULT_DATA_VECTOR); } - protected BaseRepeatedValueVector(String name, BufferAllocator allocator, ValueVector vector) { + protected BaseRepeatedValueVector(String name, BufferAllocator allocator, FieldVector vector) { super(name, allocator); this.offsets = new UInt4Vector(OFFSETS_VECTOR_NAME, allocator); this.vector = Preconditions.checkNotNull(vector, "data vector cannot be null"); @@ -83,7 +84,7 @@ public UInt4Vector getOffsetVector() { } @Override - public ValueVector getDataVector() { + public FieldVector getDataVector() { return vector; } @@ -121,7 +122,7 @@ public int getBufferSizeFor(int valueCount) { @Override public Iterator iterator() { - return Collections.singleton(getDataVector()).iterator(); + return Collections.singleton(getDataVector()).iterator(); } @Override @@ -167,7 +168,7 @@ public AddOrGetResult addOrGetVector(MinorType minorT return new AddOrGetResult<>((T)vector, created); } - protected void replaceDataVector(ValueVector v) { + protected void replaceDataVector(FieldVector v) { vector.clear(); vector = v; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index c6c6b090db6..2984c362514 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -18,15 +18,18 @@ ******************************************************************************/ package org.apache.arrow.vector.complex; -import com.google.common.collect.ImmutableList; -import com.google.flatbuffers.FlatBufferBuilder; -import io.netty.buffer.ArrowBuf; +import static java.util.Collections.singletonList; +import java.util.Arrays; +import java.util.Collections; import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.AddOrGetResult; +import org.apache.arrow.vector.BaseDataValueVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; @@ -36,18 +39,24 @@ import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.FieldWriter; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringArrayList; import org.apache.arrow.vector.util.TransferPair; +import com.google.common.collect.ImmutableList; import com.google.common.collect.ObjectArrays; -public class ListVector extends BaseRepeatedValueVector { +import io.netty.buffer.ArrowBuf; + +public class ListVector extends BaseRepeatedValueVector implements FieldVector { - UInt4Vector offsets; + final UInt4Vector offsets; final UInt1Vector bits; + private final List innerVectors; private Mutator mutator = new Mutator(); private Accessor accessor = new Accessor(); private UnionListWriter writer; @@ -57,12 +66,46 @@ public class ListVector extends BaseRepeatedValueVector { public ListVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, allocator); this.bits = new UInt1Vector("$bits$", allocator); - offsets = getOffsetVector(); + this.offsets = getOffsetVector(); + this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits, offsets)); this.writer = new UnionListWriter(this); this.reader = new UnionListReader(this); this.callBack = callBack; } + @Override + public void initializeChildrenFromFields(List children) { + if (children.size() != 1) { + throw new IllegalArgumentException("Lists have only one child. 
Found: " + children); + } + Field field = children.get(0); + MinorType minorType = Types.getMinorTypeForArrowType(field.getType()); + AddOrGetResult addOrGetVector = addOrGetVector(minorType); + if (!addOrGetVector.isCreated()) { + throw new IllegalArgumentException("Child vector already existed: " + addOrGetVector.getVector()); + } + } + + @Override + public List getChildrenFromFields() { + return singletonList(getDataVector()); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + } + + @Override + public List getFieldBuffers() { + return BaseDataValueVector.unload(getFieldInnerVectors()); + } + + @Override + public List getFieldInnerVectors() { + return innerVectors; + } + public UnionListWriter getWriter() { return writer; } @@ -86,7 +129,7 @@ public void copyFrom(int inIndex, int outIndex, ListVector from) { } @Override - public ValueVector getDataVector() { + public FieldVector getDataVector() { return vector; } @@ -298,4 +341,5 @@ public void setValueCount(int valueCount) { bits.getMutator().setValueCount(valueCount); } } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index 0cb613e2f7a..e3696588e60 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -17,10 +17,10 @@ */ package org.apache.arrow.vector.complex; -import io.netty.buffer.ArrowBuf; - import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -28,13 +28,17 @@ import javax.annotation.Nullable; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BaseDataValueVector; import org.apache.arrow.vector.BaseValueVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; @@ -45,7 +49,9 @@ import com.google.common.collect.Ordering; import com.google.common.primitives.Ints; -public class MapVector extends AbstractMapVector { +import io.netty.buffer.ArrowBuf; + +public class MapVector extends AbstractMapVector implements FieldVector { //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MapVector.class); private final SingleMapReaderImpl reader = new SingleMapReaderImpl(MapVector.this); @@ -53,6 +59,9 @@ public class MapVector extends AbstractMapVector { private final Mutator mutator = new Mutator(); int valueCount; + // TODO: validity vector + private final List innerVectors = Collections.unmodifiableList(Arrays.asList()); + public MapVector(String name, BufferAllocator allocator, CallBack callBack){ super(name, allocator, callBack); } @@ -120,7 +129,7 @@ public ArrowBuf[] getBuffers(boolean clear) { int 
expectedSize = getBufferSize(); int actualSize = super.getBufferSize(); - Preconditions.checkArgument(expectedSize == actualSize); + Preconditions.checkArgument(expectedSize == actualSize, expectedSize + " != " + actualSize); return super.getBuffers(clear); } @@ -159,7 +168,7 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { this.to.ephPair = null; int i = 0; - ValueVector vector; + FieldVector vector; for (String child:from.getChildFieldNames()) { int preSize = to.size(); vector = from.getChild(child); @@ -175,7 +184,7 @@ protected MapTransferPair(MapVector from, MapVector to, boolean allocate) { // (This is similar to what happens in ScanBatch where the children cannot be added till they are // read). To take care of this, we ensure that the hashCode of the MaterializedField does not // include the hashCode of the children but is based only on MaterializedField$key. - final ValueVector newVector = to.addOrGet(child, vector.getMinorType(), vector.getClass()); + final FieldVector newVector = to.addOrGet(child, vector.getMinorType(), vector.getClass()); if (allocate && to.size() != preSize) { newVector.allocateNew(); } @@ -315,13 +324,45 @@ public MinorType getMinorType() { @Override public void close() { - final Collection vectors = getChildren(); - for (final ValueVector v : vectors) { + final Collection vectors = getChildren(); + for (final FieldVector v : vectors) { v.close(); } vectors.clear(); + valueCount = 0; super.close(); } + + @Override + public void initializeChildrenFromFields(List children) { + for (Field field : children) { + MinorType minorType = Types.getMinorTypeForArrowType(field.getType()); + FieldVector vector = (FieldVector)this.add(field.getName(), minorType); + vector.initializeChildrenFromFields(field.getChildren()); + } + } + + @Override + public List getChildrenFromFields() { + return getChildren(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + // TODO: something with fieldNode? 
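+ // note: the fieldNode carries this map's row count and null count; one option (a sketch, not implemented here) is to mirror that row count into valueCount rather than dropping the node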
+ } + + @Override + public List getFieldBuffers() { + return BaseDataValueVector.unload(getFieldInnerVectors()); + } + + @Override + public List getFieldInnerVectors() { + return innerVectors; + } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 4d2adfb3256..89bfefc8f19 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -22,9 +22,9 @@ import org.apache.arrow.vector.complex.StateTool; import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; import com.google.common.base.Preconditions; -import org.apache.arrow.vector.types.pojo.Field; public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWriter { // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ComplexWriterImpl.class); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index 586b1283fe8..c282688530b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -17,6 +17,7 @@ */ package org.apache.arrow.vector.complex.impl; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.AbstractMapVector; @@ -129,7 +130,7 @@ private FieldWriter promoteToUnion() { } else if (listVector != null) { unionVector = listVector.promoteToUnion(); } - unionVector.addVector(tp.getTo()); + unionVector.addVector((FieldVector)tp.getTo()); writer = new UnionWriter(unionVector); writer.setPosition(idx()); for (int i = 0; i < idx(); i++) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java new file mode 100644 index 00000000000..90fb02b0597 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowBlock.java @@ -0,0 +1,82 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.file; + +import org.apache.arrow.flatbuf.Block; +import org.apache.arrow.vector.schema.FBSerializable; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class ArrowBlock implements FBSerializable { + + private final long offset; + private final int metadataLength; + private final long bodyLength; + + public ArrowBlock(long offset, int metadataLength, long bodyLength) { + super(); + this.offset = offset; + this.metadataLength = metadataLength; + this.bodyLength = bodyLength; + } + + public long getOffset() { + return offset; + } + + public int getMetadataLength() { + return metadataLength; + } + + public long getBodyLength() { + return bodyLength; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return Block.createBlock(builder, offset, metadataLength, bodyLength); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (int) (bodyLength ^ (bodyLength >>> 32)); + result = prime * result + metadataLength; + result = prime * result + (int) (offset ^ (offset >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ArrowBlock other = (ArrowBlock) obj; + if (bodyLength != other.bodyLength) + return false; + if (metadataLength != other.metadataLength) + return false; + if (offset != other.offset) + return false; + return true; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java new file mode 100644 index 00000000000..01e175b31b8 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowFooter.java @@ -0,0 +1,144 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
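An ArrowBlock pins a message to an absolute position in the file: the flatbuffer metadata sits at offset and the body follows immediately after it, which is exactly how the reader later slices the batch apart. The arithmetic, as a sketch (block is an illustrative variable):

  // Sketch only: absolute file positions implied by one block entry.
  long metadataStart = block.getOffset();
  long bodyStart = metadataStart + block.getMetadataLength();
  long bodyEnd = bodyStart + block.getBodyLength();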
+ */ +package org.apache.arrow.vector.file; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.arrow.flatbuf.Block; +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.vector.schema.FBSerializable; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class ArrowFooter implements FBSerializable { + + private final Schema schema; + + private final List dictionaries; + + private final List recordBatches; + + public ArrowFooter(Schema schema, List dictionaries, List recordBatches) { + super(); + this.schema = schema; + this.dictionaries = dictionaries; + this.recordBatches = recordBatches; + } + + public ArrowFooter(Footer footer) { + this( + Schema.convertSchema(footer.schema()), + dictionaries(footer), + recordBatches(footer) + ); + } + + private static List recordBatches(Footer footer) { + List recordBatches = new ArrayList<>(); + Block tempBLock = new Block(); + int recordBatchesLength = footer.recordBatchesLength(); + for (int i = 0; i < recordBatchesLength; i++) { + Block block = footer.recordBatches(tempBLock, i); + recordBatches.add(new ArrowBlock(block.offset(), block.metaDataLength(), block.bodyLength())); + } + return recordBatches; + } + + private static List dictionaries(Footer footer) { + List dictionaries = new ArrayList<>(); + Block tempBLock = new Block(); + int dictionariesLength = footer.dictionariesLength(); + for (int i = 0; i < dictionariesLength; i++) { + Block block = footer.dictionaries(tempBLock, i); + dictionaries.add(new ArrowBlock(block.offset(), block.metaDataLength(), block.bodyLength())); + } + return dictionaries; + } + + public Schema getSchema() { + return schema; + } + + public List getDictionaries() { + return dictionaries; + } + + public List getRecordBatches() { + return recordBatches; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + int schemaIndex = schema.getSchema(builder); + Footer.startDictionariesVector(builder, dictionaries.size()); + int dicsOffset = endVector(builder, dictionaries); + Footer.startRecordBatchesVector(builder, recordBatches.size()); + int rbsOffset = endVector(builder, recordBatches); + Footer.startFooter(builder); + Footer.addSchema(builder, schemaIndex); + Footer.addDictionaries(builder, dicsOffset); + Footer.addRecordBatches(builder, rbsOffset); + return Footer.endFooter(builder); + } + + private int endVector(FlatBufferBuilder builder, List blocks) { + for (ArrowBlock block : blocks) { + block.writeTo(builder); + } + return builder.endVector(); + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((dictionaries == null) ? 0 : dictionaries.hashCode()); + result = prime * result + ((recordBatches == null) ? 0 : recordBatches.hashCode()); + result = prime * result + ((schema == null) ? 
0 : schema.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ArrowFooter other = (ArrowFooter) obj; + if (dictionaries == null) { + if (other.dictionaries != null) + return false; + } else if (!dictionaries.equals(other.dictionaries)) + return false; + if (recordBatches == null) { + if (other.recordBatches != null) + return false; + } else if (!recordBatches.equals(other.recordBatches)) + return false; + if (schema == null) { + if (other.schema != null) + return false; + } else if (!schema.equals(other.schema)) + return false; + return true; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java new file mode 100644 index 00000000000..bbcd3e9f470 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowReader.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
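The footer is what buys random access: the file ends with the serialized footer, a 4-byte little-endian footer length, and the trailing MAGIC bytes, so everything can be located by seeking from the tail. A sketch of that seek, assuming a SeekableByteChannel named channel and the 6-byte MAGIC constant (it mirrors readFooter() in the reader below):

  // Sketch only: the file tail is [footer flatbuffer][int32 footerLength LE][MAGIC].
  ByteBuffer lenBuf = ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN);
  channel.position(channel.size() - MAGIC.length - 4);
  channel.read(lenBuf);
  lenBuf.flip();
  int footerLength = lenBuf.getInt();
  long footerOffset = channel.size() - MAGIC.length - 4 - footerLength;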
+ */ +package org.apache.arrow.vector.file; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.arrow.flatbuf.Buffer; +import org.apache.arrow.flatbuf.FieldNode; +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.flatbuf.RecordBatch; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.netty.buffer.ArrowBuf; + +public class ArrowReader implements AutoCloseable { + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowReader.class); + + private static final byte[] MAGIC = "ARROW1".getBytes(); + + private final SeekableByteChannel in; + + private final BufferAllocator allocator; + + private ArrowFooter footer; + + public ArrowReader(SeekableByteChannel in, BufferAllocator allocator) { + super(); + this.in = in; + this.allocator = allocator; + } + + private int readFully(ArrowBuf buffer, int l) throws IOException { + int n = readFully(buffer.nioBuffer(buffer.writerIndex(), l)); + buffer.writerIndex(n); + if (n != l) { + throw new IllegalStateException(n + " != " + l); + } + return n; + } + + private int readFully(ByteBuffer buffer) throws IOException { + int total = 0; + int n; + do { + n = in.read(buffer); + total += n; + } while (n >= 0 && buffer.remaining() > 0); + buffer.flip(); + return total; + } + + private static int bytesToInt(byte[] bytes) { + return ((int)(bytes[3] & 255) << 24) + + ((int)(bytes[2] & 255) << 16) + + ((int)(bytes[1] & 255) << 8) + + ((int)(bytes[0] & 255) << 0); + } + + public ArrowFooter readFooter() throws IOException { + if (footer == null) { + if (in.size() <= (MAGIC.length * 2 + 4)) { + throw new InvalidArrowFileException("file too small: " + in.size()); + } + ByteBuffer buffer = ByteBuffer.allocate(4 + MAGIC.length); + long footerLengthOffset = in.size() - buffer.remaining(); + in.position(footerLengthOffset); + readFully(buffer); + byte[] array = buffer.array(); + if (!Arrays.equals(MAGIC, Arrays.copyOfRange(array, 4, array.length))) { + throw new InvalidArrowFileException("missing Magic number " + Arrays.toString(buffer.array())); + } + int footerLength = bytesToInt(array); + if (footerLength <= 0 || footerLength + MAGIC.length * 2 + 4 > in.size()) { + throw new InvalidArrowFileException("invalid footer length: " + footerLength); + } + long footerOffset = footerLengthOffset - footerLength; + LOGGER.debug(String.format("Footer starts at %d, length: %d", footerOffset, footerLength)); + ByteBuffer footerBuffer = ByteBuffer.allocate(footerLength); + in.position(footerOffset); + readFully(footerBuffer); + Footer footerFB = Footer.getRootAsFooter(footerBuffer); + this.footer = new ArrowFooter(footerFB); + } + return footer; + } + + // TODO: read dictionaries + + public ArrowRecordBatch readRecordBatch(ArrowBlock recordBatchBlock) throws IOException { + LOGGER.debug(String.format("RecordBatch at %d, metadata: %d, body: %d", recordBatchBlock.getOffset(), recordBatchBlock.getMetadataLength(), recordBatchBlock.getBodyLength())); + int l = (int)(recordBatchBlock.getMetadataLength() + recordBatchBlock.getBodyLength()); + if (l < 0) { + throw new InvalidArrowFileException("block invalid: " + recordBatchBlock); + } + final ArrowBuf buffer = allocator.buffer(l); + LOGGER.debug("allocated buffer " + buffer); + 
in.position(recordBatchBlock.getOffset()); + int n = readFully(buffer, l); + if (n != l) { + throw new IllegalStateException(n + " != " + l); + } + RecordBatch recordBatchFB = RecordBatch.getRootAsRecordBatch(buffer.nioBuffer().asReadOnlyBuffer()); + int nodesLength = recordBatchFB.nodesLength(); + final ArrowBuf body = buffer.slice(recordBatchBlock.getMetadataLength(), (int)recordBatchBlock.getBodyLength()); + List<ArrowFieldNode> nodes = new ArrayList<>(); + for (int i = 0; i < nodesLength; ++i) { + FieldNode node = recordBatchFB.nodes(i); + nodes.add(new ArrowFieldNode(node.length(), node.nullCount())); + } + List<ArrowBuf> buffers = new ArrayList<>(); + for (int i = 0; i < recordBatchFB.buffersLength(); ++i) { + Buffer bufferFB = recordBatchFB.buffers(i); + LOGGER.debug(String.format("Buffer in RecordBatch at %d, length: %d", bufferFB.offset(), bufferFB.length())); + ArrowBuf vectorBuffer = body.slice((int)bufferFB.offset(), (int)bufferFB.length()); + buffers.add(vectorBuffer); + } + ArrowRecordBatch arrowRecordBatch = new ArrowRecordBatch(recordBatchFB.length(), nodes, buffers); + LOGGER.debug("released buffer " + buffer); + buffer.release(); + return arrowRecordBatch; + } + + public void close() throws IOException { + in.close(); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java new file mode 100644 index 00000000000..9881a229c23 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/ArrowWriter.java @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.arrow.vector.file; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.vector.schema.ArrowBuffer; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.schema.FBSerializable; +import org.apache.arrow.vector.types.pojo.Schema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.flatbuffers.FlatBufferBuilder; + +import io.netty.buffer.ArrowBuf; + +public class ArrowWriter implements AutoCloseable { + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowWriter.class); + + private static final byte[] MAGIC = "ARROW1".getBytes(); + + private final WritableByteChannel out; + + private final Schema schema; + + private final List<ArrowBlock> recordBatches = new ArrayList<>(); + + private long currentPosition = 0; + + private boolean started = false; + + public ArrowWriter(WritableByteChannel out, Schema schema) { + this.out = out; + this.schema = schema; + } + + private void start() throws IOException { + writeMagic(); + } + + private long write(byte[] buffer) throws IOException { + return write(ByteBuffer.wrap(buffer)); + } + + private long writeZeros(int zeroCount) throws IOException { + return write(new byte[zeroCount]); + } + + private long align() throws IOException { + if (currentPosition % 8 != 0) { // align on 8 byte boundaries + return writeZeros(8 - (int)(currentPosition % 8)); + } + return 0; + } + + private long write(ByteBuffer buffer) throws IOException { + long length = buffer.remaining(); + out.write(buffer); + currentPosition += length; + return length; + } + + private static byte[] intToBytes(int value) { + byte[] outBuffer = new byte[4]; + outBuffer[3] = (byte)(value >>> 24); + outBuffer[2] = (byte)(value >>> 16); + outBuffer[1] = (byte)(value >>> 8); + outBuffer[0] = (byte)(value >>> 0); + return outBuffer; + } + + private long writeIntLittleEndian(int v) throws IOException { + return write(intToBytes(v)); + } + + // TODO: write dictionaries + + public void writeRecordBatch(ArrowRecordBatch recordBatch) throws IOException { + checkStarted(); + align(); + // write metadata header + long offset = currentPosition; + write(recordBatch); + align(); + // write body + long bodyOffset = currentPosition; + List<ArrowBuf> buffers = recordBatch.getBuffers(); + List<ArrowBuffer> buffersLayout = recordBatch.getBuffersLayout(); + if (buffers.size() != buffersLayout.size()) { + throw new IllegalStateException("the layout does not match: " + buffers.size() + " != " + buffersLayout.size()); + } + for (int i = 0; i < buffers.size(); i++) { + ArrowBuf buffer = buffers.get(i); + ArrowBuffer layout = buffersLayout.get(i); + long startPosition = bodyOffset + layout.getOffset(); + if (startPosition != currentPosition) { + writeZeros((int)(startPosition - currentPosition)); + } + write(buffer); + if (currentPosition != startPosition + layout.getSize()) { + throw new IllegalStateException("wrong buffer size: " + currentPosition + " != " + (startPosition + layout.getSize())); + } + } + int metadataLength = (int)(bodyOffset - offset); + if (metadataLength <= 0) { + throw new InvalidArrowFileException("invalid recordBatch"); + } + long bodyLength = currentPosition - bodyOffset; + LOGGER.debug(String.format("RecordBatch at %d, metadata: %d, body: %d", offset, metadataLength, bodyLength)); + // add metadata to footer + recordBatches.add(new ArrowBlock(offset, metadataLength,
bodyLength)); + } + + private void write(ArrowBuf buffer) throws IOException { + write(buffer.nioBuffer(buffer.readerIndex(), buffer.readableBytes())); + } + + private void checkStarted() throws IOException { + if (!started) { + started = true; + start(); + } + } + + public void close() throws IOException { + try { + long footerStart = currentPosition; + writeFooter(); + int footerLength = (int)(currentPosition - footerStart); + if (footerLength <= 0 ) { + throw new InvalidArrowFileException("invalid footer"); + } + writeIntLittleEndian(footerLength); + LOGGER.debug(String.format("Footer starts at %d, length: %d", footerStart, footerLength)); + writeMagic(); + } finally { + out.close(); + } + } + + private void writeMagic() throws IOException { + write(MAGIC); + LOGGER.debug(String.format("magic written, now at %d", currentPosition)); + } + + private void writeFooter() throws IOException { + // TODO: dictionaries + write(new ArrowFooter(schema, Collections.emptyList(), recordBatches)); + } + + private long write(FBSerializable writer) throws IOException { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int root = writer.writeTo(builder); + builder.finish(root); + return write(builder.dataBuffer()); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java new file mode 100644 index 00000000000..3ec75dcb12a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/InvalidArrowFileException.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.file; + +public class InvalidArrowFileException extends RuntimeException { + private static final long serialVersionUID = 1L; + + public InvalidArrowFileException(String message) { + super(message); + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java new file mode 100644 index 00000000000..3aa3e52582b --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowBuffer.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.schema; + +import org.apache.arrow.flatbuf.Buffer; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class ArrowBuffer implements FBSerializable { + + private int page; + private long offset; + private long size; + + public ArrowBuffer(int page, long offset, long size) { + super(); + this.page = page; + this.offset = offset; + this.size = size; + } + + public int getPage() { + return page; + } + + public long getOffset() { + return offset; + } + + public long getSize() { + return size; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + (int) (offset ^ (offset >>> 32)); + result = prime * result + page; + result = prime * result + (int) (size ^ (size >>> 32)); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + ArrowBuffer other = (ArrowBuffer) obj; + if (offset != other.offset) + return false; + if (page != other.page) + return false; + if (size != other.size) + return false; + return true; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return Buffer.createBuffer(builder, page, offset, size); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java new file mode 100644 index 00000000000..71dd0abc6bc --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowFieldNode.java @@ -0,0 +1,53 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.schema; + +import org.apache.arrow.flatbuf.FieldNode; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class ArrowFieldNode implements FBSerializable { + + private final int length; + private final int nullCount; + + public ArrowFieldNode(int length, int nullCount) { + super(); + this.length = length; + this.nullCount = nullCount; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + return FieldNode.createFieldNode(builder, length, nullCount); + } + + public int getNullCount() { + return nullCount; + } + + public int getLength() { + return length; + } + + @Override + public String toString() { + return "ArrowFieldNode [length=" + length + ", nullCount=" + nullCount + "]"; + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java new file mode 100644 index 00000000000..9162efd29f8 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowRecordBatch.java @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.schema; + +import static org.apache.arrow.vector.schema.FBSerializables.writeAllStructsToVector; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.flatbuf.RecordBatch; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.flatbuffers.FlatBufferBuilder; + +import io.netty.buffer.ArrowBuf; + +public class ArrowRecordBatch implements FBSerializable, AutoCloseable { + private static final Logger LOGGER = LoggerFactory.getLogger(ArrowRecordBatch.class); + + /** number of records */ + private final int length; + + /** Nodes correspond to the pre-ordered flattened logical schema */ + private final List<ArrowFieldNode> nodes; + + private final List<ArrowBuf> buffers; + + private final List<ArrowBuffer> buffersLayout; + + private boolean closed = false; + + /** + * @param length how many rows in this batch + * @param nodes field level info + * @param buffers will be retained until this recordBatch is closed + */ + public ArrowRecordBatch(int length, List<ArrowFieldNode> nodes, List<ArrowBuf> buffers) { + super(); + this.length = length; + this.nodes = nodes; + this.buffers = buffers; + List<ArrowBuffer> arrowBuffers = new ArrayList<>(); + long offset = 0; + for (ArrowBuf arrowBuf : buffers) { + arrowBuf.retain(); + long size = arrowBuf.readableBytes(); + arrowBuffers.add(new ArrowBuffer(0, offset, size)); + LOGGER.debug(String.format("Buffer in RecordBatch at %d, length: %d", offset, size)); + offset += size; + if (offset % 8 != 0) { // align on 8 byte boundaries + offset += 8 - (offset % 8); + } + } + this.buffersLayout = Collections.unmodifiableList(arrowBuffers); + } + + public int getLength() { + return length; + } + + /** + * @return the FieldNodes corresponding to the schema + */ + public List<ArrowFieldNode> getNodes() { + return nodes; + } + + /** + * @return the buffers containing the data + */ + public List<ArrowBuf> getBuffers() { + if (closed) { + throw new IllegalStateException("already closed"); + } + return buffers; + } + + /** + * @return the serialized layout if we send the buffers on the wire + */ + public List<ArrowBuffer> getBuffersLayout() { + return buffersLayout; + } + + @Override + public int writeTo(FlatBufferBuilder builder) { + RecordBatch.startNodesVector(builder, nodes.size()); + int nodesOffset = writeAllStructsToVector(builder, nodes); + RecordBatch.startBuffersVector(builder, buffers.size()); + int buffersOffset = writeAllStructsToVector(builder, buffersLayout); + RecordBatch.startRecordBatch(builder); + RecordBatch.addLength(builder, length); + RecordBatch.addNodes(builder, nodesOffset); + RecordBatch.addBuffers(builder, buffersOffset); + return RecordBatch.endRecordBatch(builder); + } + + /** + * releases the buffers + */ + public void close() { + if (!closed) { + closed = true; + for (ArrowBuf arrowBuf : buffers) { + arrowBuf.release(); + } + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java new file mode 100644 index 00000000000..e3d3e34e0ae --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java @@ -0,0 +1,47 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership.
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.schema; + +import org.apache.arrow.flatbuf.VectorType; + +public class ArrowVectorType { + + public static final ArrowVectorType VALUES = new ArrowVectorType(VectorType.VALUES); + public static final ArrowVectorType OFFSET = new ArrowVectorType(VectorType.OFFSET); + public static final ArrowVectorType VALIDITY = new ArrowVectorType(VectorType.VALIDITY); + public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); + + private final short type; + + public ArrowVectorType(short type) { + this.type = type; + } + + public short getType() { + return type; + } + + @Override + public String toString() { + try { + return VectorType.name(type); + } catch (ArrayIndexOutOfBoundsException e) { + return "Unknown type " + type; + } + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java new file mode 100644 index 00000000000..d23ed91948e --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializable.java @@ -0,0 +1,24 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.schema; + +import com.google.flatbuffers.FlatBufferBuilder; + +public interface FBSerializable { + int writeTo(FlatBufferBuilder builder); +} \ No newline at end of file diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java new file mode 100644 index 00000000000..31c17ad6df0 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/FBSerializables.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.schema; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class FBSerializables { + + public static int writeAllStructsToVector(FlatBufferBuilder builder, List<? extends FBSerializable> all) { + // struct vectors have to be created in reverse order + List<FBSerializable> reversed = new ArrayList<>(all); + Collections.reverse(reversed); + for (FBSerializable element : reversed) { + element.writeTo(builder); + } + return builder.endVector(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java new file mode 100644 index 00000000000..1275e0eb5dc --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -0,0 +1,208 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.arrow.vector.schema; + +import static java.util.Arrays.asList; +import static org.apache.arrow.flatbuf.Precision.DOUBLE; +import static org.apache.arrow.flatbuf.Precision.SINGLE; +import static org.apache.arrow.vector.schema.VectorLayout.booleanVector; +import static org.apache.arrow.vector.schema.VectorLayout.byteVector; +import static org.apache.arrow.vector.schema.VectorLayout.dataVector; +import static org.apache.arrow.vector.schema.VectorLayout.offsetVector; +import static org.apache.arrow.vector.schema.VectorLayout.typeVector; +import static org.apache.arrow.vector.schema.VectorLayout.validityVector; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.flatbuf.UnionMode; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; +import org.apache.arrow.vector.types.pojo.ArrowType.Binary; +import org.apache.arrow.vector.types.pojo.ArrowType.Bool; +import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; +import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; +import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.IntervalDay; +import org.apache.arrow.vector.types.pojo.ArrowType.IntervalYear; +import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Time; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; +import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; +import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; + +/** + * The layout of vectors for a given type + * It defines its own vectors followed by the vectors for the children + * if it is a nested type (Tuple, List, Union) + */ +public class TypeLayout { + + public static TypeLayout getTypeLayout(final ArrowType arrowType) { + TypeLayout layout = arrowType.accept(new ArrowTypeVisitor<TypeLayout>() { + + @Override public TypeLayout visit(Int type) { + return newFixedWidthTypeLayout(dataVector(type.getBitWidth())); + } + + @Override public TypeLayout visit(Union type) { + List<VectorLayout> vectors; + switch (type.getMode()) { + case UnionMode.Dense: + vectors = asList( + // TODO: validate this + validityVector(), + typeVector(), + offsetVector() // offset to find the vector + ); + break; + case UnionMode.Sparse: + vectors = asList( + validityVector(), + typeVector() + ); + break; + default: + throw new UnsupportedOperationException("Unsupported Union Mode: " + type.getMode()); + } + return new TypeLayout(vectors); + } + + @Override public TypeLayout visit(Tuple type) { + List<VectorLayout> vectors = asList( + // TODO: add validity vector in Map +// validityVector() + ); + return new TypeLayout(vectors); + } + + @Override public TypeLayout visit(Timestamp type) { + return newFixedWidthTypeLayout(dataVector(64)); + } + + @Override public TypeLayout visit(org.apache.arrow.vector.types.pojo.ArrowType.List type) { + List<VectorLayout> vectors = asList( + validityVector(), + offsetVector() + ); + return new TypeLayout(vectors); + } + + @Override public TypeLayout visit(FloatingPoint type) { + int bitWidth; + switch (type.getPrecision()) { + case SINGLE: + bitWidth = 32; + break; + case DOUBLE: + bitWidth = 64; + break; + default: + throw new UnsupportedOperationException("Unsupported Precision: " + type.getPrecision()); + } + return newFixedWidthTypeLayout(dataVector(bitWidth));
+ } + + @Override public TypeLayout visit(Decimal type) { + // TODO: check size + return newFixedWidthTypeLayout(dataVector(64)); // actually depends on the type fields + } + + @Override public TypeLayout visit(Bool type) { + return newFixedWidthTypeLayout(booleanVector()); + } + + @Override public TypeLayout visit(Binary type) { + return newVariableWidthTypeLayout(); + } + + @Override public TypeLayout visit(Utf8 type) { + return newVariableWidthTypeLayout(); + } + + private TypeLayout newVariableWidthTypeLayout() { + return newPrimitiveTypeLayout(validityVector(), offsetVector(), byteVector()); + } + + private TypeLayout newPrimitiveTypeLayout(VectorLayout... vectors) { + return new TypeLayout(asList(vectors)); + } + + public TypeLayout newFixedWidthTypeLayout(VectorLayout dataVector) { + return newPrimitiveTypeLayout(validityVector(), dataVector); + } + + @Override + public TypeLayout visit(Null type) { + return new TypeLayout(Collections.<VectorLayout>emptyList()); + } + + @Override + public TypeLayout visit(Date type) { + return newFixedWidthTypeLayout(dataVector(64)); + } + + @Override + public TypeLayout visit(Time type) { + return newFixedWidthTypeLayout(dataVector(64)); + } + + @Override + public TypeLayout visit(IntervalDay type) { // TODO: check size + return newFixedWidthTypeLayout(dataVector(64)); + } + + @Override + public TypeLayout visit(IntervalYear type) { // TODO: check size + return newFixedWidthTypeLayout(dataVector(64)); + } + }); + return layout; + } + + private final List<VectorLayout> vectors; + + public TypeLayout(List<VectorLayout> vectors) { + super(); + this.vectors = vectors; + } + + public TypeLayout(VectorLayout... vectors) { + this(asList(vectors)); + } + + + public List<VectorLayout> getVectors() { + return vectors; + } + + public List<ArrowVectorType> getVectorTypes() { + List<ArrowVectorType> types = new ArrayList<>(vectors.size()); + for (VectorLayout vector : vectors) { + types.add(vector.getType()); + } + return types; + } + + public String toString() { + return "TypeLayout{" + vectors + "}"; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java new file mode 100644 index 00000000000..421ebcb8376 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.arrow.vector.schema; + +import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET; +import static org.apache.arrow.vector.schema.ArrowVectorType.TYPE; +import static org.apache.arrow.vector.schema.ArrowVectorType.VALIDITY; +import static org.apache.arrow.vector.schema.ArrowVectorType.VALUES; + +public class VectorLayout { + + private static final VectorLayout VALIDITY_VECTOR = new VectorLayout(VALIDITY, 1); + private static final VectorLayout OFFSET_VECTOR = new VectorLayout(OFFSET, 32); + private static final VectorLayout TYPE_VECTOR = new VectorLayout(TYPE, 32); + private static final VectorLayout BOOLEAN_VECTOR = new VectorLayout(VALUES, 1); + private static final VectorLayout VALUES_64 = new VectorLayout(VALUES, 64); + private static final VectorLayout VALUES_32 = new VectorLayout(VALUES, 32); + private static final VectorLayout VALUES_16 = new VectorLayout(VALUES, 16); + private static final VectorLayout VALUES_8 = new VectorLayout(VALUES, 8); + + public static VectorLayout typeVector() { + return TYPE_VECTOR; + } + + public static VectorLayout offsetVector() { + return OFFSET_VECTOR; + } + + public static VectorLayout dataVector(int typeBitWidth) { + switch (typeBitWidth) { + case 8: + return VALUES_8; + case 16: + return VALUES_16; + case 32: + return VALUES_32; + case 64: + return VALUES_64; + default: + throw new IllegalArgumentException("only 8, 16, 32, or 64 bits supported"); + } + } + + public static VectorLayout booleanVector() { + return BOOLEAN_VECTOR; + } + + public static VectorLayout validityVector() { + return VALIDITY_VECTOR; + } + + public static VectorLayout byteVector() { + return dataVector(8); + } + + private final int typeBitWidth; + + private final ArrowVectorType type; + + private VectorLayout(ArrowVectorType type, int typeBitWidth) { + super(); + this.type = type; + this.typeBitWidth = typeBitWidth; + } + + public int getTypeBitWidth() { + return typeBitWidth; + } + + public ArrowVectorType getType() { + return type; + } + + @Override + public String toString() { + return String.format("{width=%s,type=%s}", typeBitWidth, type); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index c34882a8fb1..4d0d9ee114a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -17,8 +17,14 @@ */ package org.apache.arrow.vector.types; +import java.util.HashMap; +import java.util.Map; + +import org.apache.arrow.flatbuf.Precision; import org.apache.arrow.flatbuf.Type; +import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullableBigIntVector; import org.apache.arrow.vector.NullableBitVector; import org.apache.arrow.vector.NullableDateVector; @@ -38,7 +44,6 @@ import org.apache.arrow.vector.NullableUInt8Vector; import org.apache.arrow.vector.NullableVarBinaryVector; import org.apache.arrow.vector.NullableVarCharVector; -import org.apache.arrow.vector.SmallIntVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.ListVector; @@ -85,9 +90,6 @@ import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; -import java.util.HashMap; -import java.util.Map; - public class Types { public static final Field NULL_FIELD = new 
Field("", true, Null.INSTANCE, null); @@ -104,8 +106,8 @@ public class Types { public static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); public static final Field INTERVALDAY_FIELD = new Field("", true, IntervalDay.INSTANCE, null); public static final Field INTERVALYEAR_FIELD = new Field("", true, IntervalYear.INSTANCE, null); - public static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(0), null); - public static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(1), null); + public static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(Precision.SINGLE), null); + public static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(Precision.DOUBLE), null); public static final Field LIST_FIELD = new Field("", true, List.INSTANCE, null); public static final Field VARCHAR_FIELD = new Field("", true, Utf8.INSTANCE, null); public static final Field VARBINARY_FIELD = new Field("", true, Binary.INSTANCE, null); @@ -120,7 +122,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return ZeroVector.INSTANCE; } @@ -136,7 +138,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new MapVector(name, allocator, callBack); } @@ -153,7 +155,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableTinyIntVector(name, allocator); } @@ -169,8 +171,8 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { - return new SmallIntVector(name, allocator); + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + return new NullableSmallIntVector(name, allocator); } @Override @@ -185,7 +187,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableIntVector(name, allocator); } @@ -201,7 +203,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableBigIntVector(name, allocator); } @@ -217,7 +219,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... 
precisionScale) { return new NullableDateVector(name, allocator); } @@ -233,7 +235,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableTimeVector(name, allocator); } @@ -249,7 +251,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableTimeStampVector(name, allocator); } @@ -265,7 +267,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableIntervalDayVector(name, allocator); } @@ -281,7 +283,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableIntervalDayVector(name, allocator); } @@ -290,14 +292,14 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new IntervalYearWriterImpl((NullableIntervalYearVector) vector); } }, - FLOAT4(new FloatingPoint(0)) { + FLOAT4(new FloatingPoint(Precision.SINGLE)) { @Override public Field getField() { return FLOAT4_FIELD; } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableFloat4Vector(name, allocator); } @@ -306,14 +308,14 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new Float4WriterImpl((NullableFloat4Vector) vector); } }, // 4 byte ieee 754 - FLOAT8(new FloatingPoint(1)) { + FLOAT8(new FloatingPoint(Precision.DOUBLE)) { @Override public Field getField() { return FLOAT8_FIELD; } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableFloat8Vector(name, allocator); } @@ -329,7 +331,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableBitVector(name, allocator); } @@ -345,7 +347,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableVarCharVector(name, allocator); } @@ -361,7 +363,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... 
precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableVarBinaryVector(name, allocator); } @@ -381,7 +383,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableDecimalVector(name, allocator, precisionScale[0], precisionScale[1]); } @@ -397,7 +399,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableUInt1Vector(name, allocator); } @@ -413,7 +415,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableUInt2Vector(name, allocator); } @@ -429,7 +431,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableUInt4Vector(name, allocator); } @@ -445,7 +447,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new NullableUInt8Vector(name, allocator); } @@ -461,7 +463,7 @@ public Field getField() { } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new ListVector(name, allocator, callBack); } @@ -470,14 +472,14 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new UnionListWriter((ListVector) vector); } }, - UNION(Union.INSTANCE) { + UNION(new Union(UnionMode.Sparse)) { @Override public Field getField() { throw new UnsupportedOperationException("Cannot get simple field for Union type"); } @Override - public ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { + public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { return new UnionVector(name, allocator, callBack); } @@ -499,7 +501,7 @@ public ArrowType getType() { public abstract Field getField(); - public abstract ValueVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale); + public abstract FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... 
precisionScale); public abstract FieldWriter getNewFieldWriter(ValueVector vector); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index 49d0503e470..36712b9bea3 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -18,19 +18,24 @@ package org.apache.arrow.vector.types.pojo; -import com.google.common.collect.ImmutableList; -import com.google.flatbuffers.FlatBufferBuilder; +import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; + +import java.util.ArrayList; import java.util.List; import java.util.Objects; -import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; +import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.schema.TypeLayout; + +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; public class Field { private final String name; private final boolean nullable; private final ArrowType type; private final List<Field> children; + private final TypeLayout typeLayout; public Field(String name, boolean nullable, ArrowType type, List<Field> children) { this.name = name; @@ -41,18 +46,32 @@ public Field(String name, boolean nullable, ArrowType type, List<Field> children } else { this.children = children; } + this.typeLayout = TypeLayout.getTypeLayout(type); } public static Field convertField(org.apache.arrow.flatbuf.Field field) { String name = field.name(); boolean nullable = field.nullable(); ArrowType type = getTypeForField(field); + List<ArrowVectorType> buffers = new ArrayList<>(); + for (int i = 0; i < field.buffersLength(); ++i) { + buffers.add(new ArrowVectorType(field.buffers(i))); + } ImmutableList.Builder<Field> childrenBuilder = ImmutableList.builder(); + for (int i = 0; i < field.childrenLength(); i++) { + childrenBuilder.add(convertField(field.children(i))); + } List<Field> children = childrenBuilder.build(); - return new Field(name, nullable, type, children); + Field result = new Field(name, nullable, type, children); + TypeLayout typeLayout = result.getTypeLayout(); + if (typeLayout.getVectors().size() != field.buffersLength()) { + List<ArrowVectorType> types = new ArrayList<>(); + for (int i = 0; i < field.buffersLength(); i++) { + types.add(new ArrowVectorType(field.buffers(i))); + } + throw new IllegalArgumentException("Deserialized field does not match expected vectors.
expected: " + typeLayout.getVectorTypes() + " got " + types); + } + return result; } public int getField(FlatBufferBuilder builder) { @@ -63,12 +82,18 @@ public int getField(FlatBufferBuilder builder) { childrenData[i] = children.get(i).getField(builder); } int childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, childrenData); + short[] buffersData = new short[typeLayout.getVectors().size()]; + for (int i = 0; i < buffersData.length; i++) { + buffersData[i] = typeLayout.getVectors().get(i).getType().getType(); + } + int buffersOffset = org.apache.arrow.flatbuf.Field.createBuffersVector(builder, buffersData ); org.apache.arrow.flatbuf.Field.startField(builder); org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); org.apache.arrow.flatbuf.Field.addNullable(builder, nullable); org.apache.arrow.flatbuf.Field.addTypeType(builder, type.getTypeType()); org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); + org.apache.arrow.flatbuf.Field.addBuffers(builder, buffersOffset); return org.apache.arrow.flatbuf.Field.endField(builder); } @@ -88,6 +113,10 @@ public List getChildren() { return children; } + public TypeLayout getTypeLayout() { + return typeLayout; + } + @Override public boolean equals(Object obj) { if (!(obj instanceof Field)) { @@ -102,4 +131,9 @@ public boolean equals(Object obj) { (this.children.size() == 0 && that.children == null)); } + + @Override + public String toString() { + return String.format("Field{name=%s, type=%s, children=%s, layout=%s}", name, type, children, typeLayout); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java index 9e2894170b2..231be9bd55c 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -18,15 +18,13 @@ package org.apache.arrow.vector.types.pojo; -import com.google.common.collect.ImmutableList; -import com.google.flatbuffers.FlatBufferBuilder; +import static org.apache.arrow.vector.types.pojo.Field.convertField; -import java.nio.ByteBuffer; import java.util.List; import java.util.Objects; -import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; -import static org.apache.arrow.vector.types.pojo.Field.convertField; +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; public class Schema { private List fields; @@ -71,4 +69,9 @@ public static Schema convertSchema(org.apache.arrow.flatbuf.Schema schema) { List fields = childrenBuilder.build(); return new Schema(fields); } + + @Override + public String toString() { + return "Schema" + fields; + } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java new file mode 100644 index 00000000000..85bb2cfc99f --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -0,0 +1,89 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import java.io.IOException; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; +import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.Test; + +public class TestVectorUnloadLoad { + + static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + + @Test + public void test() throws IOException { + int count = 10000; + Schema schema; + + try ( + BufferAllocator originalVectorsAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", originalVectorsAllocator, null)) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + + VectorUnloader vectorUnloader = new VectorUnloader((MapVector)parent.getChild("root")); + schema = vectorUnloader.getSchema(); + + try ( + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + MapVector newParent = new MapVector("parent", finalVectorsAllocator, null)) { + MapVector root = newParent.addOrGet("root", MinorType.MAP, MapVector.class); + VectorLoader vectorLoader = new VectorLoader(schema, root); + + vectorLoader.load(recordBatch); + + MapReader rootReader = new SingleMapReaderImpl(newParent).reader("root"); + for (int i = 0; i < count; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); + Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + } + } + } + } + + @AfterClass + public static void afterClass() { + allocator.close(); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/ByteArrayReadableSeekableByteChannel.java b/java/vector/src/test/java/org/apache/arrow/vector/file/ByteArrayReadableSeekableByteChannel.java new file mode 100644 index 00000000000..7c423d5881a --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/ByteArrayReadableSeekableByteChannel.java @@ -0,0 +1,80 @@ +/** + * Licensed to the 
Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.file; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.SeekableByteChannel; + +public class ByteArrayReadableSeekableByteChannel implements SeekableByteChannel { + private byte[] byteArray; + private int position = 0; + + public ByteArrayReadableSeekableByteChannel(byte[] byteArray) { + if (byteArray == null) { + throw new NullPointerException(); + } + this.byteArray = byteArray; + } + + @Override + public boolean isOpen() { + return byteArray != null; + } + + @Override + public void close() throws IOException { + byteArray = null; + } + + @Override + public int read(final ByteBuffer dst) throws IOException { + int remainingInBuf = byteArray.length - this.position; + int length = Math.min(dst.remaining(), remainingInBuf); + dst.put(this.byteArray, this.position, length); + this.position += length; + return length; + } + + @Override + public long position() throws IOException { + return this.position; + } + + @Override + public SeekableByteChannel position(final long newPosition) throws IOException { + this.position = (int)newPosition; + return this; + } + + @Override + public long size() throws IOException { + return this.byteArray.length; + } + + @Override + public int write(final ByteBuffer src) throws IOException { + throw new UnsupportedOperationException("Read only"); + } + + @Override + public SeekableByteChannel truncate(final long size) throws IOException { + throw new UnsupportedOperationException("Read only"); + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java new file mode 100644 index 00000000000..11de0a2ef00 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -0,0 +1,331 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.file; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; +import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.schema.ArrowBuffer; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import io.netty.buffer.ArrowBuf; + +public class TestArrowFile { + private static final int COUNT = 10; + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void tearDown() { + allocator.close(); + } + + @Test + public void testWrite() throws IOException { + File file = new File("target/mytest_write.arrow"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null)) { + writeData(count, parent); + write((MapVector)parent.getChild("root"), file); + } + } + + @Test + public void testWriteComplex() throws IOException { + File file = new File("target/mytest_write_complex.arrow"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null)) { + writeComplexData(count, parent); + validateComplexContent(count, parent); + write((MapVector)parent.getChild("root"), file); + } + } + + private void writeComplexData(int count, MapVector parent) { + ArrowBuf varchar = allocator.buffer(3); + varchar.readerIndex(0); + varchar.setByte(0, 'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + ListWriter listWriter = rootWriter.list("list"); + MapWriter mapWriter = rootWriter.map("map"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + mapWriter.setPosition(i); + mapWriter.start(); + 
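// one nested "timestamp" entry is written into the map for each row +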
mapWriter.timeStamp("timestamp").writeTimeStamp(i); + mapWriter.end(); + } + writer.setValueCount(count); + varchar.release(); + } + + + private void writeData(int count, MapVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + } + + @Test + public void testWriteRead() throws IOException { + File file = new File("target/mytest.arrow"); + int count = COUNT; + + // write + try ( + BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", originalVectorAllocator, null)) { + writeData(count, parent); + write((MapVector)parent.getChild("root"), file); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(file); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null) + ) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + System.out.println("reading schema: " + schema); + + // initialize vectors + + MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); + + VectorLoader vectorLoader = new VectorLoader(schema, root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + Assert.assertEquals(0, rbBlock.getOffset() % 8); + Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + List buffersLayout = recordBatch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + vectorLoader.load(recordBatch); + } + + validateContent(count, parent); + } + } + } + + private void validateContent(int count, MapVector parent) { + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < count; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); + Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + } + } + + @Test + public void testWriteReadComplex() throws IOException { + File file = new File("target/mytest_complex.arrow"); + int count = COUNT; + + // write + try ( + BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", originalVectorAllocator, null)) { + writeComplexData(count, parent); + write((MapVector)parent.getChild("root"), file); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(file); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", 
vectorAllocator, null) + ) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + System.out.println("reading schema: " + schema); + + // initialize vectors + + MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); + + VectorLoader vectorLoader = new VectorLoader(schema, root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + validateComplexContent(count, parent); + } + } + } + + public void printVectors(List vectors) { + for (FieldVector vector : vectors) { + System.out.println(vector.getField().getName()); + Accessor accessor = vector.getAccessor(); + int valueCount = accessor.getValueCount(); + for (int i = 0; i < valueCount; i++) { + System.out.println(accessor.getObject(i)); + } + } + } + + private void validateComplexContent(int count, MapVector parent) { + printVectors(parent.getChildrenFromFields()); + + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < count; i++) { + rootReader.setPosition(i); + Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); + Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + Assert.assertEquals(i % 3, rootReader.reader("list").size()); + Assert.assertEquals(i, rootReader.reader("map").reader("timestamp").readDateTime().getMillis() % COUNT); + } + } + + private void write(MapVector parent, File file) throws FileNotFoundException, IOException { + VectorUnloader vectorUnloader = new VectorUnloader(parent); + Schema schema = vectorUnloader.getSchema(); + System.out.println("writing schema: " + schema); + try ( + FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + ) { + arrowWriter.writeRecordBatch(recordBatch); + } + } + + @Test + public void testWriteReadMultipleRBs() throws IOException { + File file = new File("target/mytest_multiple.arrow"); + int count = COUNT; + + // write + try ( + BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", originalVectorAllocator, null); + FileOutputStream fileOutputStream = new FileOutputStream(file);) { + writeData(count, parent); + VectorUnloader vectorUnloader = new VectorUnloader(parent.getChild("root")); + Schema schema = vectorUnloader.getSchema(); + Assert.assertEquals(2, schema.getFields().size()); + try (ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema);) { + try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch()) { + arrowWriter.writeRecordBatch(recordBatch); + } + parent.allocateNew(); + writeData(count, parent); + try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch()) { + arrowWriter.writeRecordBatch(recordBatch); + } + } + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(file); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null); + ) { + ArrowFooter 
footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + System.out.println("reading schema: " + schema); + MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); + VectorLoader vectorLoader = new VectorLoader(schema, root); + List recordBatches = footer.getRecordBatches(); + Assert.assertEquals(2, recordBatches.size()); + for (ArrowBlock rbBlock : recordBatches) { + Assert.assertEquals(0, rbBlock.getOffset() % 8); + Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + List buffersLayout = recordBatch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + vectorLoader.load(recordBatch); + validateContent(count, parent); + } + } + } + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java new file mode 100644 index 00000000000..707dba2af98 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFooter.java @@ -0,0 +1,56 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.file; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; +import java.util.Collections; + +import org.apache.arrow.flatbuf.Footer; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; + +import com.google.flatbuffers.FlatBufferBuilder; + +public class TestArrowFooter { + + @Test + public void test() { + Schema schema = new Schema(asList( + new Field("a", true, new ArrowType.Int(8, true), Collections.emptyList()) + )); + ArrowFooter footer = new ArrowFooter(schema, Collections.emptyList(), Collections.emptyList()); + ArrowFooter newFooter = roundTrip(footer); + assertEquals(footer, newFooter); + } + + + private ArrowFooter roundTrip(ArrowFooter footer) { + FlatBufferBuilder builder = new FlatBufferBuilder(); + int i = footer.writeTo(builder); + builder.finish(i); + ByteBuffer dataBuffer = builder.dataBuffer(); + ArrowFooter newFooter = new ArrowFooter(Footer.getRootAsFooter(dataBuffer)); + return newFooter; + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java new file mode 100644 index 00000000000..f90329aca11 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowReaderWriter.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.file; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.file.ArrowBlock; +import org.apache.arrow.vector.file.ArrowFooter; +import org.apache.arrow.vector.file.ArrowReader; +import org.apache.arrow.vector.file.ArrowWriter; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Before; +import org.junit.Test; + +import io.netty.buffer.ArrowBuf; + +public class TestArrowReaderWriter { + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + ArrowBuf buf(byte[] bytes) { + ArrowBuf buffer = allocator.buffer(bytes.length); + buffer.writeBytes(bytes); + return buffer; + } + + byte[] array(ArrowBuf buf) { + byte[] bytes = new byte[buf.readableBytes()]; + buf.readBytes(bytes); + return bytes; + } + + @Test + public void test() throws IOException { + Schema schema = new Schema(asList(new Field("testField", true, new ArrowType.Int(8, true), Collections.emptyList()))); + byte[] validity = new byte[] { (byte)255, 0}; + // second half is "undefined" + byte[] values = new byte[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + try (ArrowWriter writer = new ArrowWriter(Channels.newChannel(out), schema)) { + ArrowBuf validityb = buf(validity); + ArrowBuf valuesb = buf(values); + writer.writeRecordBatch(new ArrowRecordBatch(16, asList(new ArrowFieldNode(16, 8)), asList(validityb, valuesb))); + } + + byte[] byteArray = out.toByteArray(); + + try (ArrowReader reader = new ArrowReader(new ByteArrayReadableSeekableByteChannel(byteArray), allocator)) { + ArrowFooter footer = reader.readFooter(); + Schema readSchema = footer.getSchema(); + assertEquals(schema, readSchema); + assertTrue(readSchema.getFields().get(0).getTypeLayout().getVectorTypes().toString(), readSchema.getFields().get(0).getTypeLayout().getVectors().size() > 0); + // TODO: dictionaries + List recordBatches = footer.getRecordBatches(); + assertEquals(1, recordBatches.size()); + ArrowRecordBatch recordBatch = reader.readRecordBatch(recordBatches.get(0)); + List nodes = recordBatch.getNodes(); + assertEquals(1, nodes.size()); + ArrowFieldNode node = nodes.get(0); + assertEquals(16, node.getLength()); + assertEquals(8, node.getNullCount()); + List buffers = recordBatch.getBuffers(); + assertEquals(2, buffers.size()); + assertArrayEquals(validity, array(buffers.get(0))); + assertArrayEquals(values, array(buffers.get(1))); + + } + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 06a1149c0d6..61327f1970e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -17,19 +17,24 @@ */ package 
org.apache.arrow.vector.pojo; -import com.google.common.collect.ImmutableList; -import com.google.flatbuffers.FlatBufferBuilder; +import static org.apache.arrow.flatbuf.Precision.DOUBLE; +import static org.apache.arrow.flatbuf.Precision.SINGLE; +import static org.junit.Assert.assertEquals; + +import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; +import org.apache.arrow.vector.types.pojo.ArrowType.List; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Test; -import java.util.List; - -import static org.junit.Assert.assertEquals; +import com.google.common.collect.ImmutableList; +import com.google.flatbuffers.FlatBufferBuilder; /** * Test conversion between Flatbuf and Pojo field representations @@ -46,7 +51,7 @@ public void simple() { public void complex() { ImmutableList.Builder childrenBuilder = ImmutableList.builder(); childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); - childrenBuilder.add(new Field("child2", true, new FloatingPoint(0), ImmutableList.of())); + childrenBuilder.add(new Field("child2", true, new FloatingPoint(SINGLE), ImmutableList.of())); Field initialField = new Field("a", true, Tuple.INSTANCE, childrenBuilder.build()); run(initialField); @@ -56,10 +61,29 @@ public void complex() { public void schema() { ImmutableList.Builder childrenBuilder = ImmutableList.builder(); childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); - childrenBuilder.add(new Field("child2", true, new FloatingPoint(0), ImmutableList.of())); + childrenBuilder.add(new Field("child2", true, new FloatingPoint(SINGLE), ImmutableList.of())); Schema initialSchema = new Schema(childrenBuilder.build()); run(initialSchema); + } + @Test + public void nestedSchema() { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); + childrenBuilder.add(new Field("child2", true, new FloatingPoint(SINGLE), ImmutableList.of())); + childrenBuilder.add(new Field("child3", true, new Tuple(), ImmutableList.of( + new Field("child3.1", true, Utf8.INSTANCE, null), + new Field("child3.2", true, new FloatingPoint(DOUBLE), ImmutableList.of()) + ))); + childrenBuilder.add(new Field("child4", true, new List(), ImmutableList.of( + new Field("child4.1", true, Utf8.INSTANCE, null) + ))); + childrenBuilder.add(new Field("child5", true, new Union(UnionMode.Sparse), ImmutableList.of( + new Field("child5.1", true, new Timestamp("UTC"), null), + new Field("child5.2", true, new FloatingPoint(DOUBLE), ImmutableList.of()) + ))); + Schema initialSchema = new Schema(childrenBuilder.build()); + run(initialSchema); } private void run(Field initialField) { From 907cc5a1295c4e9227ac50abf5babbe497f1edd1 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 28 Aug 2016 13:43:01 -0400 Subject: [PATCH 122/210] ARROW-262: Start metadata specification document The purpose of this patch is to: * Provide exposition and a place to clarify / provide examples illustrating the canonical metadata * Begin providing definitions of logical types * Where relevant, the data header metadata generated by a particular logical type (for example: 
strings produce one fewer buffer
  compared with List<UInt8> even though the effective memory layout is the
  same as the nested type without any nulls in its child array)

This is not a complete specification and will require follow-up JIRAs to
address more logical types and fill other gaps.

Author: Wes McKinney

Closes #121 from wesm/ARROW-262 and squashes the following commits:

bba5e82 [Wes McKinney] int->short
8cc52fd [Wes McKinney] Drafting Metadata specification document
---
 format/Message.fbs |   3 +-
 format/Metadata.md | 258 +++++++++++++++++++++++++++++++++++++++++++++
 format/README.md   |   1 +
 3 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 format/Metadata.md

diff --git a/format/Message.fbs b/format/Message.fbs
index b02f3fa3869..71428b58103 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -28,12 +28,13 @@ table Int {
   is_signed: bool;
 }
 
-enum Precision:short {SINGLE, DOUBLE}
+enum Precision:short {HALF, SINGLE, DOUBLE}
 
 table FloatingPoint {
   precision: Precision;
 }
 
+/// Unicode with UTF-8 encoding
 table Utf8 {
 }
 
diff --git a/format/Metadata.md b/format/Metadata.md
new file mode 100644
index 00000000000..e227b8d4afd
--- /dev/null
+++ b/format/Metadata.md
@@ -0,0 +1,258 @@
+# Metadata: Logical types, schemas, data headers
+
+This is documentation for the Arrow metadata specification, which enables
+systems to communicate the
+
+* Logical array types (which are implemented using the physical memory layouts
+  specified in [Layout.md][1])
+
+* Schemas for table-like collections of Arrow data structures
+
+* "Data headers" indicating the physical locations of memory buffers sufficient
+  to reconstruct Arrow data structures without copying memory.
+
+## Canonical implementation
+
+We are using [Flatbuffers][2] for low-overhead reading and writing of the Arrow
+metadata. See [Message.fbs][3].
+
+## Schemas
+
+The `Schema` type describes a table-like structure consisting of any number of
+Arrow arrays, each of which can be interpreted as a column in the table. A
+schema by itself does not describe the physical structure of any particular set
+of data.
+
+A schema consists of a sequence of **fields**, which are metadata describing
+the columns. The Flatbuffers IDL for a field is:
+
+```
+table Field {
+  // Name is not required, e.g. in a List
+  name: string;
+  nullable: bool;
+  type: Type;
+  children: [Field];
+}
+```
+
+The `type` is the logical type of the field. Nested types, such as List,
+Struct, and Union, have a sequence of child fields.
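+
+As a concrete, non-normative sketch, such a schema can be built with the Java
+implementation's pojo classes (`Schema`, `Field` and `ArrowType`, exercised by
+the tests earlier in this series); the class and column names below are
+illustrative only:
+
+```
+import static java.util.Arrays.asList;
+
+import java.util.Collections;
+
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.Schema;
+
+public class SchemaExample {
+  public static void main(String[] args) {
+    // A nullable 32-bit signed integer column with no children.
+    Field a = new Field("a", true, new ArrowType.Int(32, true),
+        Collections.<Field>emptyList());
+    // A nullable UTF-8 string column.
+    Field b = new Field("b", true, ArrowType.Utf8.INSTANCE,
+        Collections.<Field>emptyList());
+    Schema schema = new Schema(asList(a, b));
+    System.out.println(schema);
+  }
+}
+```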
+
+## Record data headers
+
+A record batch is a collection of top-level named, equal length Arrow arrays
+(or vectors). If one of the arrays contains nested data, its child arrays are
+not required to be the same length as the top-level arrays.
+
+A record batch can be thought of as a realization of a particular schema. The
+metadata describing a particular record batch is called a "data header". Here
+is the Flatbuffers IDL for a record batch data header:
+
+```
+table RecordBatch {
+  length: int;
+  nodes: [FieldNode];
+  buffers: [Buffer];
+}
+```
+
+The `nodes` and `buffers` fields are produced by a depth-first traversal /
+flattening of a schema (possibly containing nested types) for a given in-memory
+data set.
+
+### Buffers
+
+A buffer is metadata describing a contiguous memory region relative to some
+virtual address space. This may include:
+
+* Shared memory, e.g. a memory-mapped file
+* An RPC message received in-memory
+* Data in a file
+
+The key form of the Buffer type is:
+
+```
+struct Buffer {
+  offset: long;
+  length: long;
+}
+```
+
+In the context of a record batch, each field has some number of buffers
+associated with it, which are derived from its physical memory layout.
+
+Each logical type (separate from its children, if it is a nested type) has a
+deterministic number of buffers associated with it. These will be specified in
+the logical types section.
+
+### Field metadata
+
+The `FieldNode` values contain metadata about each level in a nested type
+hierarchy.
+
+```
+struct FieldNode {
+  /// The number of value slots in the Arrow array at this level of a nested
+  /// tree
+  length: int;
+
+  /// The number of observed nulls.
+  null_count: int;
+}
+```
+
+## Flattening of nested data
+
+Nested types are flattened in the record batch in depth-first order. When
+visiting each field in the nested type tree, the metadata is appended to the
+top-level `fields` array and the buffers associated with that field (but not
+its children) are appended to the `buffers` array.
+
+For example, let's consider the schema
+
+```
+col1: Struct<a: Int32, b: List<Int64>, c: Float64>
+col2: Utf8
+```
+
+The flattened version of this is:
+
+```
+FieldNode 0: Struct name='col1'
+FieldNode 1: Int32 name='a'
+FieldNode 2: List name='b'
+FieldNode 3: Int64 name='item'  # arbitrary
+FieldNode 4: Float64 name='c'
+FieldNode 5: Utf8 name='col2'
+```
+
+For the buffers produced, we would have the following (as described in more
+detail for each type below):
+
+```
+buffer 0: field 0 validity bitmap
+
+buffer 1: field 1 validity bitmap
+buffer 2: field 1 values
+
+buffer 3: field 2 validity bitmap
+buffer 4: field 2 list offsets
+
+buffer 5: field 3 validity bitmap
+buffer 6: field 3 values
+
+buffer 7: field 4 validity bitmap
+buffer 8: field 4 values
+
+buffer 9: field 5 validity bitmap
+buffer 10: field 5 offsets
+buffer 11: field 5 data
+```
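+
+The depth-first ordering above can be reproduced with a few lines of code. The
+following non-normative Java sketch uses a deliberately simplified node type
+(not the Arrow pojo `Field` class) to show just the traversal:
+
+```
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class FlattenExample {
+  // Simplified stand-in for field metadata: a name plus child nodes.
+  static class Node {
+    final String name;
+    final List<Node> children;
+    Node(String name, Node... children) {
+      this.name = name;
+      this.children = Arrays.asList(children);
+    }
+  }
+
+  // Pre-order (parent before children) traversal, matching the FieldNode
+  // numbering in the example above.
+  static void flatten(Node node, List<String> out) {
+    out.add(node.name);
+    for (Node child : node.children) {
+      flatten(child, out);
+    }
+  }
+
+  public static void main(String[] args) {
+    Node col1 = new Node("col1",
+        new Node("a"), new Node("b", new Node("item")), new Node("c"));
+    Node col2 = new Node("col2");
+    List<String> order = new ArrayList<>();
+    flatten(col1, order);
+    flatten(col2, order);
+    System.out.println(order); // [col1, a, b, item, c, col2]
+  }
+}
+```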
+
+## Logical types
+
+A logical type consists of a type name and metadata along with an explicit
+mapping to a physical memory representation. These may fall into some different
+categories:
+
+* Types represented as fixed-width primitive arrays (for example: C-style
+  integers and floating point numbers)
+
+* Types having equivalent memory layout to a physical nested type (e.g. strings
+  use the list representation, but logically are not nested types)
+
+### Integers
+
+In the first version of Arrow we provide the standard C integer types in sizes
+from 8-bit through 64-bit, both signed and unsigned:
+
+* Signed types: Int8, Int16, Int32, Int64
+* Unsigned types: UInt8, UInt16, UInt32, UInt64
+
+The IDL looks like:
+
+```
+table Int {
+  bitWidth: int;
+  is_signed: bool;
+}
+```
+
+The integer endianness is currently set globally at the schema level. If a
+schema is set to be little-endian, then all integer types occurring within must
+be little-endian. Integers that are part of other data representations, such as
+list offsets and union types, must have the same endianness as the entire
+record batch.
+
+### Floating point numbers
+
+We provide 3 types of floating point numbers as fixed bit-width primitive
+arrays:
+
+- Half precision, 16-bit width
+- Single precision, 32-bit width
+- Double precision, 64-bit width
+
+The IDL looks like:
+
+```
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+```
+
+### Boolean
+
+The Boolean logical type is represented as a 1-bit wide primitive physical
+type. The bits are numbered using least-significant bit (LSB) ordering.
+
+Like other fixed bit-width primitive types, boolean data appears as 2 buffers
+in the data header (one bitmap for the validity vector and one for the values).
+
+### List
+
+The `List` logical type is the logical (and identically-named) counterpart to
+the List physical type.
+
+In data header form, the list field node contains 2 buffers:
+
+* Validity bitmap
+* List offsets
+
+The buffers associated with a list's child field are handled recursively
+according to the child logical type (e.g. `List<Utf8>` vs. `List<Boolean>`).
+
+### Utf8 and Binary
+
+We specify two logical types for variable length bytes:
+
+* `Utf8` data is Unicode values with UTF-8 encoding
+* `Binary` is any other variable length bytes
+
+These types both have the same memory layout as the nested type `List<UInt8>`,
+with the constraint that the inner bytes can contain no null values. From a
+logical type perspective they are primitive, not nested types.
+
+In data header form, while `List<UInt8>` would appear as 2 field nodes (`List`
+and `UInt8`) and 4 buffers (2 for each of the nodes, as per above), these types
+have a simplified representation: a single field node (of `Utf8` or `Binary`
+logical type, which have no children) and 3 buffers:
+
+* Validity bitmap
+* List offsets
+* Byte data
+
+### Decimal
+
+TBD
+
+### Timestamp
+
+TBD
+
+## Dictionary encoding
+
+TBD
+
+[1]: https://github.com/apache/arrow/blob/master/format/Layout.md
+[2]: http://github.com/google/flatbuffers
+[3]: https://github.com/apache/arrow/blob/master/format/Message.fbs

diff --git a/format/README.md b/format/README.md
index c84e00772c3..3b0e50364d8 100644
--- a/format/README.md
+++ b/format/README.md
@@ -6,6 +6,7 @@
 
 Currently, the Arrow specification consists of these pieces:
 
+- Metadata specification (see Metadata.md)
 - Physical memory layout specification (see Layout.md)
 - Metadata serialized representation (see Message.fbs)

From e081a4c27a5a592251f9f325a05479d4120e30e6 Mon Sep 17 00:00:00 2001
From: Julien Le Dem
Date: Sun, 28 Aug 2016 13:45:34 -0400
Subject: [PATCH 123/210] ARROW-271: Update Field structure to be more explicit

This is a proposal. I have not updated the code depending on this yet.
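
To make the proposed layout metadata concrete, a minimal non-normative sketch
(assuming the `TypeLayout.getTypeLayout` factory and the pojo types shown in
the diff below) that prints the per-type vector layouts:

```
import org.apache.arrow.vector.schema.TypeLayout;
import org.apache.arrow.vector.types.pojo.ArrowType;

public class LayoutExample {
  public static void main(String[] args) {
    // For Utf8, expected to print a validity bitmap (1 bit per slot),
    // 32-bit list offsets and 8-bit data slots, along the lines of
    // TypeLayout{[{width=1,type=VALIDITY}, {width=32,type=OFFSET}, {width=8,type=DATA}]}
    System.out.println(TypeLayout.getTypeLayout(ArrowType.Utf8.INSTANCE));

    // A fixed-width integer carries a validity bitmap plus 32-bit data slots.
    System.out.println(TypeLayout.getTypeLayout(new ArrowType.Int(32, true)));
  }
}
```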
Author: Julien Le Dem

Closes #124 from julienledem/record_batch and squashes the following commits:

8e42d74 [Julien Le Dem] ARROW-271: Update Field structure to be more explicit add bit_width to vector layout
---
 format/Message.fbs                                 | 26 ++++++---
 .../templates/NullableValueVectors.java            |  6 ++-
 .../arrow/vector/schema/ArrowVectorType.java       |  2 +-
 .../arrow/vector/schema/TypeLayout.java            | 22 +++++++-
 .../arrow/vector/schema/VectorLayout.java          | 54 +++++++++++++++----
 .../apache/arrow/vector/types/pojo/Field.java      | 43 ++++++++-------
 .../apache/arrow/vector/pojo/TestConvert.java      |  2 +
 7 files changed, 115 insertions(+), 40 deletions(-)

diff --git a/format/Message.fbs b/format/Message.fbs
index 71428b58103..9c957248977 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -92,17 +92,31 @@ union Type {
   JSONScalar
 }
 
+/// ----------------------------------------------------------------------
+/// The possible types of a vector
+
 enum VectorType: short {
-  /// used in List type Dense Union and variable length primitive types (String, Binary)
+  /// used in List type, Dense Union and variable length primitive types (String, Binary)
   OFFSET,
-  /// fixed length primitive values
-  VALUES,
+  /// actual data, either fixed width primitive types in slots or variable width delimited by an OFFSET vector
+  DATA,
-  /// Bit vector indicated if each value is null
+  /// Bit vector indicating if each value is null
   VALIDITY,
   /// Type vector used in Union type
   TYPE
 }
 
+/// ----------------------------------------------------------------------
+/// represents the physical layout of a buffer
+/// buffers have fixed width slots of a given type
+
+table VectorLayout {
+  /// the width of a slot in the buffer (typically 1, 8, 16, 32 or 64)
+  bit_width: short;
+  /// the purpose of the vector
+  type: VectorType;
+}
+
 /// ----------------------------------------------------------------------
 /// A field represents a named column in a record / row batch or child of a
 /// nested type.
@@ -121,10 +135,10 @@ table Field {
   dictionary: long;
   // children apply only to Nested data types like Struct, List and Union
   children: [Field];
-  /// the buffers produced for this type (as derived from the Type)
+  /// layout of buffers produced for this type (as derived from the Type)
   /// does not include children
   /// each recordbatch will return instances of those Buffers.
-  buffers: [ VectorType ];
+  layout: [ VectorLayout ];
 }
 
diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java
index 6b1aa040a5b..bb2c0012160 100644
--- a/java/vector/src/main/codegen/templates/NullableValueVectors.java
+++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java
@@ -34,6 +34,8 @@
 
 <#include "/@includes/vv_imports.ftl" />
 
+import org.apache.arrow.flatbuf.Precision;
+
 /**
  * Nullable${minor.class} implements a vector of values which could be null. Elements in the vector
  * are first checked against a fixed length vector of boolean values.
Then the element is retrieved @@ -97,9 +99,9 @@ public final class ${className} extends BaseDataValueVector implements <#if type <#elseif minor.class == "Time"> field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Time(), null); <#elseif minor.class == "Float4"> - field = new Field(name, true, new FloatingPoint(org.apache.arrow.flatbuf.Precision.SINGLE), null); + field = new Field(name, true, new FloatingPoint(Precision.SINGLE), null); <#elseif minor.class == "Float8"> - field = new Field(name, true, new FloatingPoint(org.apache.arrow.flatbuf.Precision.DOUBLE), null); + field = new Field(name, true, new FloatingPoint(Precision.DOUBLE), null); <#elseif minor.class == "TimeStamp"> field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(""), null); <#elseif minor.class == "IntervalDay"> diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java index e3d3e34e0ae..9b7fa45bb9a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java @@ -21,7 +21,7 @@ public class ArrowVectorType { - public static final ArrowVectorType VALUES = new ArrowVectorType(VectorType.VALUES); + public static final ArrowVectorType DATA = new ArrowVectorType(VectorType.DATA); public static final ArrowVectorType OFFSET = new ArrowVectorType(VectorType.OFFSET); public static final ArrowVectorType VALIDITY = new ArrowVectorType(VectorType.VALIDITY); public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 1275e0eb5dc..15cd49865bd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -49,6 +49,8 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import com.google.common.base.Preconditions; + /** * The layout of vectors for a given type * It defines its own vectors followed by the vectors for the children @@ -182,7 +184,7 @@ public TypeLayout visit(IntervalYear type) { // TODO: check size public TypeLayout(List vectors) { super(); - this.vectors = vectors; + this.vectors = Preconditions.checkNotNull(vectors); } public TypeLayout(VectorLayout... 
vectors) { @@ -205,4 +207,22 @@ public List getVectorTypes() { public String toString() { return "TypeLayout{" + vectors + "}"; } + + @Override + public int hashCode() { + return vectors.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + TypeLayout other = (TypeLayout) obj; + return vectors.equals(other.vectors); + } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java index 421ebcb8376..532e9d2328b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -17,21 +17,24 @@ */ package org.apache.arrow.vector.schema; +import static org.apache.arrow.vector.schema.ArrowVectorType.DATA; import static org.apache.arrow.vector.schema.ArrowVectorType.OFFSET; import static org.apache.arrow.vector.schema.ArrowVectorType.TYPE; import static org.apache.arrow.vector.schema.ArrowVectorType.VALIDITY; -import static org.apache.arrow.vector.schema.ArrowVectorType.VALUES; -public class VectorLayout { +import com.google.common.base.Preconditions; +import com.google.flatbuffers.FlatBufferBuilder; + +public class VectorLayout implements FBSerializable { private static final VectorLayout VALIDITY_VECTOR = new VectorLayout(VALIDITY, 1); private static final VectorLayout OFFSET_VECTOR = new VectorLayout(OFFSET, 32); private static final VectorLayout TYPE_VECTOR = new VectorLayout(TYPE, 32); - private static final VectorLayout BOOLEAN_VECTOR = new VectorLayout(VALUES, 1); - private static final VectorLayout VALUES_64 = new VectorLayout(VALUES, 64); - private static final VectorLayout VALUES_32 = new VectorLayout(VALUES, 32); - private static final VectorLayout VALUES_16 = new VectorLayout(VALUES, 16); - private static final VectorLayout VALUES_8 = new VectorLayout(VALUES, 8); + private static final VectorLayout BOOLEAN_VECTOR = new VectorLayout(DATA, 1); + private static final VectorLayout VALUES_64 = new VectorLayout(DATA, 64); + private static final VectorLayout VALUES_32 = new VectorLayout(DATA, 32); + private static final VectorLayout VALUES_16 = new VectorLayout(DATA, 16); + private static final VectorLayout VALUES_8 = new VectorLayout(DATA, 8); public static VectorLayout typeVector() { return TYPE_VECTOR; @@ -68,14 +71,21 @@ public static VectorLayout byteVector() { return dataVector(8); } - private final int typeBitWidth; + private final short typeBitWidth; private final ArrowVectorType type; private VectorLayout(ArrowVectorType type, int typeBitWidth) { super(); - this.type = type; - this.typeBitWidth = typeBitWidth; + this.type = Preconditions.checkNotNull(type); + this.typeBitWidth = (short)typeBitWidth; + if (typeBitWidth <= 0) { + throw new IllegalArgumentException("bitWidth invalid: " + typeBitWidth); + } + } + + public VectorLayout(org.apache.arrow.flatbuf.VectorLayout layout) { + this(new ArrowVectorType(layout.type()), layout.bitWidth()); } public int getTypeBitWidth() { @@ -90,4 +100,28 @@ public ArrowVectorType getType() { public String toString() { return String.format("{width=%s,type=%s}", typeBitWidth, type); } + + @Override + public int hashCode() { + return 31 * (31 + type.hashCode()) + typeBitWidth; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if 
(getClass() != obj.getClass()) + return false; + VectorLayout other = (VectorLayout) obj; + return type.equals(other.type) && (typeBitWidth == other.typeBitWidth); + } + + @Override + public int writeTo(FlatBufferBuilder builder) {; + return org.apache.arrow.flatbuf.VectorLayout.createVectorLayout(builder, typeBitWidth, type.getType()); + } + + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index 36712b9bea3..cfa1ed40aeb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -20,12 +20,11 @@ import static org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; -import java.util.ArrayList; import java.util.List; import java.util.Objects; -import org.apache.arrow.vector.schema.ArrowVectorType; import org.apache.arrow.vector.schema.TypeLayout; +import org.apache.arrow.vector.schema.VectorLayout; import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; @@ -37,7 +36,7 @@ public class Field { private final List children; private final TypeLayout typeLayout; - public Field(String name, boolean nullable, ArrowType type, List children) { + private Field(String name, boolean nullable, ArrowType type, List children, TypeLayout typeLayout) { this.name = name; this.nullable = nullable; this.type = type; @@ -46,34 +45,37 @@ public Field(String name, boolean nullable, ArrowType type, List children } else { this.children = children; } - this.typeLayout = TypeLayout.getTypeLayout(type); + this.typeLayout = typeLayout; + } + + public Field(String name, boolean nullable, ArrowType type, List children) { + this(name, nullable, type, children, TypeLayout.getTypeLayout(type)); } public static Field convertField(org.apache.arrow.flatbuf.Field field) { String name = field.name(); boolean nullable = field.nullable(); ArrowType type = getTypeForField(field); - List buffers = new ArrayList<>(); - for (int i = 0; i < field.buffersLength(); ++i) { - buffers.add(new ArrowVectorType(field.buffers(i))); + ImmutableList.Builder layout = ImmutableList.builder(); + for (int i = 0; i < field.layoutLength(); ++i) { + layout.add(new org.apache.arrow.vector.schema.VectorLayout(field.layout(i))); } ImmutableList.Builder childrenBuilder = ImmutableList.builder(); for (int i = 0; i < field.childrenLength(); i++) { childrenBuilder.add(convertField(field.children(i))); } List children = childrenBuilder.build(); - Field result = new Field(name, nullable, type, children); - TypeLayout typeLayout = result.getTypeLayout(); - if (typeLayout.getVectors().size() != field.buffersLength()) { - List types = new ArrayList<>(); - for (int i = 0; i < field.buffersLength(); i++) { - types.add(new ArrowVectorType(field.buffers(i))); - } - throw new IllegalArgumentException("Deserialized field does not match expected vectors. expected: " + typeLayout.getVectorTypes() + " got " + types); - } + Field result = new Field(name, nullable, type, children, new TypeLayout(layout.build())); return result; } + public void validate() { + TypeLayout expectedLayout = TypeLayout.getTypeLayout(type); + if (!expectedLayout.equals(typeLayout)) { + throw new IllegalArgumentException("Deserialized field does not match expected vectors. 
expected: " + expectedLayout + " got " + typeLayout); + } + } + public int getField(FlatBufferBuilder builder) { int nameOffset = builder.createString(name); int typeOffset = type.getType(builder); @@ -82,18 +84,19 @@ public int getField(FlatBufferBuilder builder) { childrenData[i] = children.get(i).getField(builder); } int childrenOffset = org.apache.arrow.flatbuf.Field.createChildrenVector(builder, childrenData); - short[] buffersData = new short[typeLayout.getVectors().size()]; + int[] buffersData = new int[typeLayout.getVectors().size()]; for (int i = 0; i < buffersData.length; i++) { - buffersData[i] = typeLayout.getVectors().get(i).getType().getType(); + VectorLayout vectorLayout = typeLayout.getVectors().get(i); + buffersData[i] = vectorLayout.writeTo(builder); } - int buffersOffset = org.apache.arrow.flatbuf.Field.createBuffersVector(builder, buffersData ); + int layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, buffersData); org.apache.arrow.flatbuf.Field.startField(builder); org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); org.apache.arrow.flatbuf.Field.addNullable(builder, nullable); org.apache.arrow.flatbuf.Field.addTypeType(builder, type.getTypeType()); org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); org.apache.arrow.flatbuf.Field.addChildren(builder, childrenOffset); - org.apache.arrow.flatbuf.Field.addBuffers(builder, buffersOffset); + org.apache.arrow.flatbuf.Field.addLayout(builder, layoutOffset); return org.apache.arrow.flatbuf.Field.endField(builder); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 61327f1970e..e557cc84f3b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -22,6 +22,8 @@ import static org.junit.Assert.assertEquals; import org.apache.arrow.flatbuf.UnionMode; +import static org.junit.Assert.assertEquals; + import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.List; From 0a411fd29ed1baac6f1524be82fc15e08f2b28db Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 28 Aug 2016 15:25:35 -0400 Subject: [PATCH 124/210] ARROW-242: Support Timestamp Data Type For the Pandas<->Parquet bridge this is a lossy conversion but must be explicitly activated by the user. Regarding Parquet 1.0: Yes, the logical type is not supported but should be simply ignored by the reader. Implementation for INT96 timestamps is not in the scope of this PR. Author: Uwe L. Korn Closes #107 from xhochy/arrow-242 and squashes the following commits: 8db6968 [Uwe L. Korn] Add missing include 34126b1 [Uwe L. 
Korn] ARROW-242: Support Timestamp Data Type --- .../parquet/parquet-reader-writer-test.cc | 12 +- cpp/src/arrow/parquet/parquet-schema-test.cc | 23 +++- cpp/src/arrow/parquet/reader.cc | 1 + cpp/src/arrow/parquet/schema.cc | 13 ++- cpp/src/arrow/parquet/writer.cc | 1 + cpp/src/arrow/types/construct.cc | 3 +- cpp/src/arrow/types/datetime.h | 12 +- cpp/src/arrow/types/primitive.cc | 1 + cpp/src/arrow/types/primitive.h | 11 ++ python/pyarrow/array.pyx | 40 ++++++- python/pyarrow/includes/libarrow.pxd | 1 + python/pyarrow/tests/test_convert_pandas.py | 24 +++- python/pyarrow/tests/test_parquet.py | 4 +- python/src/pyarrow/adapters/pandas.cc | 107 ++++++++++++++++-- 14 files changed, 232 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc index bfc27d26d63..d7b39dda377 100644 --- a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc +++ b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc @@ -137,6 +137,15 @@ struct test_traits { const int64_t test_traits::value(-1024); +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; + static constexpr LogicalType::type logical_enum = LogicalType::TIMESTAMP_MILLIS; + static int64_t const value; +}; + +const int64_t test_traits::value(14695634030000); + template <> struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; @@ -248,7 +257,8 @@ class TestParquetIO : public ::testing::Test { // Parquet version 1.0. typedef ::testing::Types TestTypes; + Int32Type, UInt64Type, Int64Type, TimestampType, FloatType, DoubleType, + StringType> TestTypes; TYPED_TEST_CASE(TestParquetIO, TestTypes); diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index 819cdd3ec43..a2bcd3e05c3 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -22,6 +22,7 @@ #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/types/datetime.h" #include "arrow/types/decimal.h" #include "arrow/util/status.h" @@ -45,6 +46,9 @@ const auto INT64 = std::make_shared(); const auto FLOAT = std::make_shared(); const auto DOUBLE = std::make_shared(); const auto UTF8 = std::make_shared(); +const auto TIMESTAMP_MS = std::make_shared(TimestampType::Unit::MILLI); +// TODO: This requires parquet-cpp implementing the MICROS enum value +// const auto TIMESTAMP_US = std::make_shared(TimestampType::Unit::MICRO); const auto BINARY = std::make_shared(std::make_shared("", UINT8)); const auto DECIMAL_8_4 = std::make_shared(8, 4); @@ -89,6 +93,14 @@ TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); arrow_fields.push_back(std::make_shared("int64", INT64, false)); + parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, + ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); + arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); + + // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, + // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); + // arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_US, false)); + parquet_fields.push_back( PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); arrow_fields.push_back(std::make_shared("float", FLOAT)); @@ -153,9 +165,6 @@ TEST_F(TestConvertParquetSchema, UnsupportedThings) { 
unsupported_nodes.push_back(PrimitiveNode::Make( "int32", Repetition::OPTIONAL, ParquetType::INT32, LogicalType::DATE)); - unsupported_nodes.push_back(PrimitiveNode::Make( - "int64", Repetition::OPTIONAL, ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); - for (const NodePtr& node : unsupported_nodes) { ASSERT_RAISES(NotImplemented, ConvertSchema({node})); } @@ -209,6 +218,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); arrow_fields.push_back(std::make_shared("int64", INT64, false)); + parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, + ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); + arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); + + // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, + // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); + // arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_US, false)); + parquet_fields.push_back( PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); arrow_fields.push_back(std::make_shared("float", FLOAT)); diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index e92967e5363..9f6212570dc 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -368,6 +368,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr* TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) TYPED_BATCH_CASE(STRING, StringType, ::parquet::ByteArrayType) + TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type) default: return Status::NotImplemented(field_->type->ToString()); } diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index a79342afe2f..cd91df32271 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -52,6 +52,7 @@ const auto INT64 = std::make_shared(); const auto FLOAT = std::make_shared(); const auto DOUBLE = std::make_shared(); const auto UTF8 = std::make_shared(); +const auto TIMESTAMP_MS = std::make_shared(TimestampType::Unit::MILLI); const auto BINARY = std::make_shared(std::make_shared("", UINT8)); TypePtr MakeDecimalType(const PrimitiveNode* node) { @@ -133,6 +134,9 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { case LogicalType::DECIMAL: *out = MakeDecimalType(node); break; + case LogicalType::TIMESTAMP_MILLIS: + *out = TIMESTAMP_MS; + break; default: return Status::NotImplemented("Unhandled logical type for int64"); break; @@ -289,10 +293,15 @@ Status FieldToNode(const std::shared_ptr& field, type = ParquetType::INT32; logical_type = LogicalType::DATE; break; - case Type::TIMESTAMP: + case Type::TIMESTAMP: { + auto timestamp_type = static_cast(field->type.get()); + if (timestamp_type->unit != TimestampType::Unit::MILLI) { + return Status::NotImplemented( + "Other timestamp units than millisecond are not yet support with parquet."); + } type = ParquetType::INT64; logical_type = LogicalType::TIMESTAMP_MILLIS; - break; + } break; case Type::TIMESTAMP_DOUBLE: type = ParquetType::INT64; // This is specified as seconds since the UNIX epoch diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index f9514aa2ad2..ddee573fa1e 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -240,6 +240,7 @@ Status FileWriter::Impl::WriteFlatColumnChunk( TYPED_BATCH_CASE(INT32, 
Int32Type, ::parquet::Int32Type) TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type) TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type) + TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type) TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) default: diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 5ae9c5ab6d4..0b71ea96551 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -51,6 +51,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(INT32, Int32Builder); BUILDER_CASE(UINT64, UInt64Builder); BUILDER_CASE(INT64, Int64Builder); + BUILDER_CASE(TIMESTAMP, TimestampBuilder); BUILDER_CASE(BOOL, BooleanBuilder); @@ -105,7 +106,7 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index b782455546c..241a126d100 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -18,6 +18,8 @@ #ifndef ARROW_TYPES_DATETIME_H #define ARROW_TYPES_DATETIME_H +#include + #include "arrow/type.h" namespace arrow { @@ -34,15 +36,23 @@ struct DateType : public DataType { static char const* name() { return "date"; } }; -struct TimestampType : public DataType { +struct ARROW_EXPORT TimestampType : public DataType { enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; + typedef int64_t c_type; + static constexpr Type::type type_enum = Type::TIMESTAMP; + + int value_size() const override { return sizeof(int64_t); } + Unit unit; explicit TimestampType(Unit unit = Unit::MILLI) : DataType(Type::TIMESTAMP), unit(unit) {} TimestampType(const TimestampType& other) : TimestampType(other.unit) {} + virtual ~TimestampType() {} + + std::string ToString() const override { return "timestamp"; } static char const* name() { return "timestamp"; } }; diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index f4b47f9d2f5..375e94f2bc1 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -158,6 +158,7 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; +template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 770de765f1f..c643783f681 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/type.h" +#include "arrow/types/datetime.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -100,6 +101,7 @@ NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type); NUMERIC_ARRAY_DECL(Int32Array, Int32Type); NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type); NUMERIC_ARRAY_DECL(Int64Array, Int64Type); +NUMERIC_ARRAY_DECL(TimestampArray, TimestampType); NUMERIC_ARRAY_DECL(FloatArray, FloatType); 
NUMERIC_ARRAY_DECL(DoubleArray, DoubleType); @@ -235,7 +237,15 @@ struct type_traits { static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; + +template <> +struct type_traits { + typedef TimestampArray ArrayType; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } +}; template <> + struct type_traits { typedef FloatArray ArrayType; @@ -260,6 +270,7 @@ typedef NumericBuilder Int8Builder; typedef NumericBuilder Int16Builder; typedef NumericBuilder Int32Builder; typedef NumericBuilder Int64Builder; +typedef NumericBuilder TimestampBuilder; typedef NumericBuilder FloatBuilder; typedef NumericBuilder DoubleBuilder; diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 619e5ef7e39..5229b429f58 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -19,6 +19,8 @@ # distutils: language = c++ # cython: embedsignature = True +import numpy as np + from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow @@ -186,6 +188,7 @@ cdef dict _array_classes = { Type_DOUBLE: DoubleArray, Type_LIST: ListArray, Type_STRING: StringArray, + Type_TIMESTAMP: Int64Array, } cdef object box_arrow_array(const shared_ptr[CArray]& sp_array): @@ -217,11 +220,28 @@ def from_pylist(object list_obj, DataType type=None): return box_arrow_array(sp_array) -def from_pandas_series(object series, object mask=None): +def from_pandas_series(object series, object mask=None, timestamps_to_ms=False): + """ + Convert pandas.Series to an Arrow Array. + + Parameters + ---------- + series: pandas.Series or numpy.ndarray + + mask: pandas.Series or numpy.ndarray + array to mask null entries in the series + + timestamps_to_ms: bool + Convert datetime columns to ms resolution. This is needed for + compability with other functionality like Parquet I/O which + only supports milliseconds. + """ cdef: shared_ptr[CArray] out series_values = series_as_ndarray(series) + if series_values.dtype.type == np.datetime64 and timestamps_to_ms: + series_values = series_values.astype('datetime64[ms]') if mask is None: check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(), @@ -234,14 +254,28 @@ def from_pandas_series(object series, object mask=None): return box_arrow_array(out) -def from_pandas_dataframe(object df, name=None): +def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False): + """ + Convert pandas.DataFrame to an Arrow Table + + Parameters + ---------- + df: pandas.DataFrame + + name: str + + timestamps_to_ms: bool + Convert datetime columns to ms resolution. This is needed for + compability with other functionality like Parquet I/O which + only supports milliseconds. 
+ """ cdef: list names = [] list arrays = [] for name in df.columns: col = df[name] - arr = from_pandas_series(col) + arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms) names.append(name) arrays.append(arr) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 91ce069df8f..854d07d691d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -38,6 +38,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type_FLOAT" arrow::Type::FLOAT" Type_DOUBLE" arrow::Type::DOUBLE" + Type_TIMESTAMP" arrow::Type::TIMESTAMP" Type_STRING" arrow::Type::STRING" Type_LIST" arrow::Type::LIST" diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 6dc9c689e24..55302996f45 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -33,8 +33,9 @@ def setUp(self): def tearDown(self): pass - def _check_pandas_roundtrip(self, df, expected=None): - table = A.from_pandas_dataframe(df) + def _check_pandas_roundtrip(self, df, expected=None, + timestamps_to_ms=False): + table = A.from_pandas_dataframe(df, timestamps_to_ms=timestamps_to_ms) result = table.to_pandas() if expected is None: expected = df @@ -164,6 +165,25 @@ def test_strings(self): expected = pd.DataFrame({'strings': values * repeats}) self._check_pandas_roundtrip(df, expected) + def test_timestamps_notimezone(self): + df = pd.DataFrame({ + 'datetime64': np.array([ + '2007-07-13T01:23:34.123', + '2006-01-13T12:34:56.432', + '2010-08-13T05:46:57.437'], + dtype='datetime64[ms]') + }) + self._check_pandas_roundtrip(df, timestamps_to_ms=True) + + df = pd.DataFrame({ + 'datetime64': np.array([ + '2007-07-13T01:23:34.123456789', + '2006-01-13T12:34:56.432539784', + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ns]') + }) + self._check_pandas_roundtrip(df, timestamps_to_ms=False) + # def test_category(self): # repeats = 1000 # values = [b'foo', None, u'bar', 'qux', np.nan] diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index de9cfbb46e1..d89d947b7b6 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -57,11 +57,13 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): 'float32': np.arange(size, dtype=np.float32), 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, + # Pandas only support ns resolution, Arrow at the moment only ms + 'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'), 'str': [str(x) for x in range(size)], 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None] }) filename = tmpdir.join('pandas_rountrip.parquet') - arrow_table = A.from_pandas_dataframe(df) + arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True) A.parquet.write_table(arrow_table, filename.strpath, version="2.0") table_read = pyarrow.parquet.read_table(filename.strpath) df_read = table_read.to_pandas() diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 8dcc2b1c92e..a4e7fb6f3bb 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -38,6 +38,7 @@ namespace pyarrow { using arrow::Array; using arrow::Column; +using arrow::DataType; namespace util = arrow::util; // ---------------------------------------------------------------------- @@ -50,7 +51,7 @@ struct npy_traits { template <> struct npy_traits { typedef uint8_t value_type; - 
@@ -50,7 +51,7 @@ struct npy_traits {
 template <>
 struct npy_traits<NPY_BOOL> {
   typedef uint8_t value_type;
-  using ArrayType = arrow::BooleanArray;
+  using TypeClass = arrow::BooleanType;
 
   static constexpr bool supports_nulls = false;
   static inline bool isnull(uint8_t v) {
@@ -62,7 +63,7 @@
   template <>                                        \
   struct npy_traits<NPY_##TYPE> {                    \
     typedef T value_type;                            \
-    using ArrayType = arrow::CapType##Array;         \
+    using TypeClass = arrow::CapType##Type;          \
                                                      \
     static constexpr bool supports_nulls = false;    \
     static inline bool isnull(T v) {                 \
@@ -82,7 +83,7 @@ NPY_INT_DECL(UINT64, UInt64, uint64_t);
 template <>
 struct npy_traits<NPY_FLOAT32> {
   typedef float value_type;
-  using ArrayType = arrow::FloatArray;
+  using TypeClass = arrow::FloatType;
 
   static constexpr bool supports_nulls = true;
 
@@ -94,7 +95,7 @@ struct npy_traits {
 template <>
 struct npy_traits<NPY_FLOAT64> {
   typedef double value_type;
-  using ArrayType = arrow::DoubleArray;
+  using TypeClass = arrow::DoubleType;
 
   static constexpr bool supports_nulls = true;
 
@@ -103,6 +104,22 @@
   }
 };
 
+template <>
+struct npy_traits<NPY_DATETIME> {
+  typedef int64_t value_type;
+  using TypeClass = arrow::TimestampType;
+
+  static constexpr bool supports_nulls = true;
+
+  static inline bool isnull(int64_t v) {
+    // NaT = -2**63
+    //     = -0x8000000000000000
+    //     = -9223372036854775808;
+    //     = std::numeric_limits<int64_t>::min()
+    return v == std::numeric_limits<int64_t>::min();
+  }
+};
+
 template <>
 struct npy_traits<NPY_OBJECT> {
   typedef PyObject* value_type;
@@ -206,6 +223,8 @@
     return Status::OK();
   }
 
+  Status MakeDataType(std::shared_ptr<DataType>* out);
+
   arrow::MemoryPool* pool_;
 
   PyArrayObject* arr_;
@@ -253,6 +272,39 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap)
   return null_count;
 }
 
+template <int TYPE>
+inline Status ArrowSerializer<TYPE>::MakeDataType(std::shared_ptr<DataType>* out) {
+  out->reset(new typename npy_traits<TYPE>::TypeClass());
+  return Status::OK();
+}
+
+template <>
+inline Status ArrowSerializer<NPY_DATETIME>::MakeDataType(std::shared_ptr<DataType>* out) {
+  PyArray_Descr* descr = PyArray_DESCR(arr_);
+  auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+  arrow::TimestampType::Unit unit;
+
+  switch (date_dtype->meta.base) {
+    case NPY_FR_s:
+      unit = arrow::TimestampType::Unit::SECOND;
+      break;
+    case NPY_FR_ms:
+      unit = arrow::TimestampType::Unit::MILLI;
+      break;
+    case NPY_FR_us:
+      unit = arrow::TimestampType::Unit::MICRO;
+      break;
+    case NPY_FR_ns:
+      unit = arrow::TimestampType::Unit::NANO;
+      break;
+    default:
+      return Status::ValueError("Unknown NumPy datetime unit");
+  }
+
+  out->reset(new arrow::TimestampType(unit));
+  return Status::OK();
+}
+
 template <int TYPE>
 inline Status ArrowSerializer<TYPE>::Convert(std::shared_ptr<Array>* out) {
   typedef npy_traits<TYPE> traits;
@@ -269,9 +321,9 @@
   }
 
   RETURN_NOT_OK(ConvertData());
-  *out = std::make_shared<typename traits::ArrayType>(length_, data_, null_count,
-      null_bitmap_);
-
+  std::shared_ptr<DataType> type;
+  RETURN_NOT_OK(MakeDataType(&type));
+  RETURN_ARROW_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out));
   return Status::OK();
 }
@@ -402,6 +454,7 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo,
       TO_ARROW_CASE(UINT64);
       TO_ARROW_CASE(FLOAT32);
       TO_ARROW_CASE(FLOAT64);
+      TO_ARROW_CASE(DATETIME);
       TO_ARROW_CASE(OBJECT);
     default:
       std::stringstream ss;
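// NOTE: the NPY_DATETIME specialization above is the only MakeDataType
// overload that inspects the ndarray itself: NumPy keeps the datetime64 unit
// in PyArray_DatetimeDTypeMetaData, and the switch maps it onto the matching
// arrow::TimestampType::Unit. No separate validity buffer exists on the NumPy
// side; NaT is encoded in-band as INT64_MIN, which is exactly what
// npy_traits<NPY_DATETIME>::isnull tests for.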
@@ -476,6 +529,17 @@ struct arrow_traits {
   typedef typename npy_traits<NPY_FLOAT64>::value_type T;
 };
 
+template <>
+struct arrow_traits<arrow::Type::TIMESTAMP> {
+  static constexpr int npy_type = NPY_DATETIME;
+  static constexpr bool supports_nulls = true;
+  static constexpr int64_t na_value = std::numeric_limits<int64_t>::min();
+  static constexpr bool is_boolean = false;
+  static constexpr bool is_integer = true;
+  static constexpr bool is_floating = false;
+  typedef typename npy_traits<NPY_DATETIME>::value_type T;
+};
+
 template <>
 struct arrow_traits<arrow::Type::STRING> {
   static constexpr int npy_type = NPY_OBJECT;
@@ -494,6 +558,30 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) {
 #endif
 }
 
+inline void set_numpy_metadata(int type, DataType* datatype, PyArrayObject* out) {
+  if (type == NPY_DATETIME) {
+    auto timestamp_type = static_cast<arrow::TimestampType*>(datatype);
+    // We only support ms resolution at the moment
+    PyArray_Descr* descr = PyArray_DESCR(out);
+    auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+
+    switch (timestamp_type->unit) {
+      case arrow::TimestampType::Unit::SECOND:
+        date_dtype->meta.base = NPY_FR_s;
+        break;
+      case arrow::TimestampType::Unit::MILLI:
+        date_dtype->meta.base = NPY_FR_ms;
+        break;
+      case arrow::TimestampType::Unit::MICRO:
+        date_dtype->meta.base = NPY_FR_us;
+        break;
+      case arrow::TimestampType::Unit::NANO:
+        date_dtype->meta.base = NPY_FR_ns;
+        break;
+    }
+  }
+}
+
 template <int TYPE>
 class ArrowDeserializer {
  public:
@@ -522,6 +610,8 @@
       return Status::OK();
     }
 
+    set_numpy_metadata(type, col_->type().get(), out_);
+
     return Status::OK();
   }
@@ -538,6 +628,8 @@
       return Status::OK();
     }
 
+    set_numpy_metadata(type, col_->type().get(), out_);
+
     if (PyArray_SetBaseObject(out_, py_ref_) == -1) {
       // Error occurred, trust that SetBaseObject set the error state
       return Status::OK();
     }
@@ -713,6 +805,7 @@ Status ArrowToPandas(const std::shared_ptr<Column>& col, PyObject* py_ref,
       FROM_ARROW_CASE(FLOAT);
       FROM_ARROW_CASE(DOUBLE);
       FROM_ARROW_CASE(STRING);
+      FROM_ARROW_CASE(TIMESTAMP);
     default:
       return Status::NotImplemented("Arrow type reading not implemented");
   }

From e197b2d6e41d0cf6be7c097d6b09c3be29d12cc0 Mon Sep 17 00:00:00 2001
From: Julien Le Dem
Date: Mon, 29 Aug 2016 16:08:23 -0700
Subject: [PATCH 125/210] ARROW-279: rename vector module to arrow-vector

Author: Julien Le Dem

Closes #127 from julienledem/rename_vector and squashes the following commits:

cf8a2aa [Julien Le Dem] ARROW-279: rename vector module to arrow-vector
---
 java/memory/pom.xml | 2 +-
 java/vector/pom.xml | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/java/memory/pom.xml b/java/memory/pom.xml
index 44332f5ed14..b91b5981559 100644
--- a/java/memory/pom.xml
+++ b/java/memory/pom.xml
@@ -18,7 +18,7 @@
     <version>0.1-SNAPSHOT</version>
   </parent>
   <artifactId>arrow-memory</artifactId>
-  <name>arrow-memory</name>
+  <name>Arrow Memory</name>

diff --git a/java/vector/pom.xml b/java/vector/pom.xml
index 1a2921f6ea5..08f9bc8da4e 100644
--- a/java/vector/pom.xml
+++ b/java/vector/pom.xml
@@ -17,8 +17,8 @@
     <artifactId>arrow-java-root</artifactId>
     <version>0.1-SNAPSHOT</version>
   </parent>
-  <artifactId>vector</artifactId>
-  <name>vectors</name>
+  <artifactId>arrow-vector</artifactId>
+  <name>Arrow Vectors</name>

From 2d8ec789365f3c0f82b1f22d76160d5af150dd31 Mon Sep 17 00:00:00 2001
From: Julien Le Dem
Date: Tue, 6 Sep 2016 11:46:56 -0700
Subject: [PATCH 126/210] ARROW-274: Add NullableMapVector to support nullable
 maps

Author: Julien Le Dem

Closes #128 from julienledem/nullable_map and squashes the following commits:

d98580a [Julien Le Dem] review feedback
ee1dd45 [Julien Le Dem] Fix complex writers/readers
8780f48 [Julien Le Dem] ARROW-274: Add NullableMapVector to support nullable maps
---
 .../main/codegen/templates/MapWriters.java    | 55 ++--
 .../codegen/templates/UnionListWriter.java    |  2 +
 .../main/codegen/templates/UnionVector.java   |  6 +-
 .../main/codegen/templates/UnionWriter.java   |  2 +-
 .../apache/arrow/vector/NullableVector.java   |  2 +-
 .../apache/arrow/vector/VectorUnloader.java   |  4 +-
 .../arrow/vector/complex/MapVector.java       | 53 +----
.../vector/complex/NullableMapVector.java | 260 ++++++++++++++++++ .../complex/impl/AbstractBaseReader.java | 7 +- .../complex/impl/ComplexWriterImpl.java | 11 +- .../complex/impl/NullableMapReaderImpl.java | 45 +++ .../complex/impl/SingleMapReaderImpl.java | 4 +- .../arrow/vector/schema/TypeLayout.java | 3 +- .../org/apache/arrow/vector/types/Types.java | 8 +- .../arrow/vector/TestVectorUnloadLoad.java | 5 +- .../complex/impl/TestPromotableWriter.java | 4 +- .../complex/writer/TestComplexWriter.java | 33 ++- .../arrow/vector/file/TestArrowFile.java | 39 +-- .../apache/arrow/vector/pojo/TestConvert.java | 2 - 19 files changed, 408 insertions(+), 137 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 8a8983a1497..7f319a9ca34 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -17,14 +17,13 @@ */ <@pp.dropOutputFile /> -<#list ["Single"] as mode> +<#list ["Nullable", "Single"] as mode> <@pp.changeOutputFile name="/org/apache/arrow/vector/complex/impl/${mode}MapWriter.java" /> +<#assign index = "idx()"> <#if mode == "Single"> <#assign containerClass = "MapVector" /> -<#assign index = "idx()"> <#else> -<#assign containerClass = "RepeatedMapVector" /> -<#assign index = "currentChildIndex"> +<#assign containerClass = "NullableMapVector" /> <#include "/@includes/license.ftl" /> @@ -49,9 +48,13 @@ public class ${mode}MapWriter extends AbstractFieldWriter { protected final ${containerClass} container; private final Map fields = Maps.newHashMap(); - <#if mode == "Repeated">private int currentChildIndex = 0; public ${mode}MapWriter(${containerClass} container) { + <#if mode == "Single"> + if (container instanceof NullableMapVector) { + throw new IllegalArgumentException("Invalid container: " + container); + } + this.container = container; } @@ -75,12 +78,12 @@ public MapWriter map(String name) { FieldWriter writer = fields.get(name.toLowerCase()); if(writer == null){ int vectorCount=container.size(); - MapVector vector = container.addOrGet(name, MinorType.MAP, MapVector.class); + NullableMapVector vector = container.addOrGet(name, MinorType.MAP, NullableMapVector.class); writer = new PromotableWriter(vector, container); if(vectorCount != container.size()) { writer.allocate(); } - writer.setPosition(${index}); + writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); } return writer; @@ -117,40 +120,12 @@ public ListWriter list(String name) { if (container.size() > vectorCount) { writer.allocate(); } - writer.setPosition(${index}); + writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); } return writer; } - <#if mode == "Repeated"> - public void start() { - // update the repeated vector to state that there is current+1 objects. - final RepeatedMapHolder h = new RepeatedMapHolder(); - final RepeatedMapVector map = (RepeatedMapVector) container; - final RepeatedMapVector.Mutator mutator = map.getMutator(); - - // Make sure that the current vector can support the end position of this list. 
- if(container.getValueCapacity() <= idx()) { - mutator.setValueCount(idx()+1); - } - - map.getAccessor().get(idx(), h); - if (h.start >= h.end) { - container.getMutator().startNewValue(idx()); - } - currentChildIndex = container.getMutator().add(idx()); - for(final FieldWriter w : fields.values()) { - w.setPosition(currentChildIndex); - } - } - - - public void end() { - // noop - } - <#else> - public void setValueCount(int count) { container.getMutator().setValueCount(count); } @@ -165,14 +140,16 @@ public void setPosition(int index) { @Override public void start() { + <#if mode == "Single"> + <#else> + container.getMutator().setIndexDefined(idx()); + } @Override public void end() { } - - <#list vv.types as type><#list type.minor as minor> <#assign lowerName = minor.class?uncap_first /> <#if lowerName == "int" ><#assign lowerName = "integer" /> @@ -204,7 +181,7 @@ public void end() { if (currentVector == null || currentVector != vector) { vector.allocateNewSafe(); } - writer.setPosition(${index}); + writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); } return writer; diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 49d57e716bc..d502803d716 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -160,11 +160,13 @@ public void start() { vector.getMutator().setNotNull(idx()); offsets.getMutator().setSafe(idx() + 1, nextOffset); writer.setPosition(nextOffset); + writer.start(); } @Override public void end() { // if (inMap) { + writer.end(); inMap = false; final int nextOffset = offsets.getAccessor().get(idx() + 1); offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 72125fa50fb..3014bbba9d5 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -72,7 +72,7 @@ public class UnionVector implements FieldVector { MapVector internalMap; UInt1Vector typeVector; - private MapVector mapVector; + private NullableMapVector mapVector; private ListVector listVector; private FieldReader reader; @@ -127,10 +127,10 @@ public List getFieldInnerVectors() { throw new UnsupportedOperationException(); } - public MapVector getMap() { + public NullableMapVector getMap() { if (mapVector == null) { int vectorCount = internalMap.size(); - mapVector = internalMap.addOrGet("map", MinorType.MAP, MapVector.class); + mapVector = internalMap.addOrGet("map", MinorType.MAP, NullableMapVector.class); if (internalMap.size() > vectorCount) { mapVector.allocateNew(); if (callBack != null) { diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java index 1137e2cb020..460ec1c0d95 100644 --- a/java/vector/src/main/codegen/templates/UnionWriter.java +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -74,7 +74,7 @@ public void endList() { private MapWriter getMapWriter() { if (mapWriter == null) { - mapWriter = new SingleMapWriter(data.getMap()); + mapWriter = new NullableMapWriter(data.getMap()); mapWriter.setPosition(idx()); writers.add(mapWriter); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java index 00c33fc2d6e..0212b3c0d7b 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/NullableVector.java @@ -17,7 +17,7 @@ */ package org.apache.arrow.vector; -public interface NullableVector extends ValueVector{ +public interface NullableVector extends ValueVector { ValueVector getValuesVector(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index e4d37bf47d1..3375a7d5c31 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -68,7 +68,9 @@ private void appendNodes(FieldVector vector, List nodes, List fieldBuffers = vector.getFieldBuffers(); List expectedBuffers = vector.getField().getTypeLayout().getVectorTypes(); if (fieldBuffers.size() != expectedBuffers.size()) { - throw new IllegalArgumentException("wrong number of buffers for field " + vector.getField() + ". found: " + fieldBuffers); + throw new IllegalArgumentException(String.format( + "wrong number of buffers for field %s in vector %s. found: %s", + vector.getField(), vector.getClass().getSimpleName(), fieldBuffers)); } buffers.addAll(fieldBuffers); for (FieldVector child : vector.getChildrenFromFields()) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index e3696588e60..1b8483a3d41 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -18,9 +18,7 @@ package org.apache.arrow.vector.complex; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -28,15 +26,12 @@ import javax.annotation.Nullable; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.vector.BaseDataValueVector; import org.apache.arrow.vector.BaseValueVector; -import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.holders.ComplexHolder; -import org.apache.arrow.vector.schema.ArrowFieldNode; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; @@ -49,26 +44,20 @@ import com.google.common.collect.Ordering; import com.google.common.primitives.Ints; -import io.netty.buffer.ArrowBuf; - -public class MapVector extends AbstractMapVector implements FieldVector { +public class MapVector extends AbstractMapVector { //private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(MapVector.class); - private final SingleMapReaderImpl reader = new SingleMapReaderImpl(MapVector.this); + private final SingleMapReaderImpl reader = new SingleMapReaderImpl(this); private final Accessor accessor = new Accessor(); private final Mutator mutator = new Mutator(); int valueCount; - // TODO: validity vector - private final List innerVectors = Collections.unmodifiableList(Arrays.asList()); - - public MapVector(String name, BufferAllocator allocator, CallBack callBack){ + public MapVector(String name, 
BufferAllocator allocator, CallBack callBack) { super(name, allocator, callBack); } @Override public FieldReader getReader() { - //return new SingleMapReaderImpl(MapVector.this); return reader; } @@ -124,18 +113,9 @@ public int getBufferSizeFor(final int valueCount) { return (int) bufferSize; } - @Override - public ArrowBuf[] getBuffers(boolean clear) { - int expectedSize = getBufferSize(); - int actualSize = super.getBufferSize(); - - Preconditions.checkArgument(expectedSize == actualSize, expectedSize + " != " + actualSize); - return super.getBuffers(clear); - } - @Override public TransferPair getTransferPair(BufferAllocator allocator) { - return new MapTransferPair(this, name, allocator); + return new MapTransferPair(this, new MapVector(name, allocator, callBack), false); } @Override @@ -145,7 +125,7 @@ public TransferPair makeTransferPair(ValueVector to) { @Override public TransferPair getTransferPair(String ref, BufferAllocator allocator) { - return new MapTransferPair(this, ref, allocator); + return new MapTransferPair(this, new MapVector(ref, allocator, callBack), false); } protected static class MapTransferPair implements TransferPair{ @@ -153,10 +133,6 @@ protected static class MapTransferPair implements TransferPair{ private final MapVector from; private final MapVector to; - public MapTransferPair(MapVector from, String name, BufferAllocator allocator) { - this(from, new MapVector(name, allocator, from.callBack), false); - } - public MapTransferPair(MapVector from, MapVector to) { this(from, to, true); } @@ -335,7 +311,6 @@ public void close() { super.close(); } - @Override public void initializeChildrenFromFields(List children) { for (Field field : children) { MinorType minorType = Types.getMinorTypeForArrowType(field.getType()); @@ -344,25 +319,9 @@ public void initializeChildrenFromFields(List children) { } } - @Override + public List getChildrenFromFields() { return getChildren(); } - @Override - public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); - // TODO: something with fieldNode? - } - - @Override - public List getFieldBuffers() { - return BaseDataValueVector.unload(getFieldInnerVectors()); - } - - @Override - public List getFieldInnerVectors() { - return innerVectors; - } - } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java new file mode 100644 index 00000000000..6b257c095d2 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -0,0 +1,260 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.complex; + +import static com.google.common.base.Preconditions.checkNotNull; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BaseDataValueVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullableVectorDefinitionSetter; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.complex.impl.NullableMapReaderImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.holders.ComplexHolder; +import org.apache.arrow.vector.schema.ArrowFieldNode; +import org.apache.arrow.vector.util.CallBack; +import org.apache.arrow.vector.util.TransferPair; + +import com.google.common.collect.ObjectArrays; + +import io.netty.buffer.ArrowBuf; + +public class NullableMapVector extends MapVector implements FieldVector { + + private final NullableMapReaderImpl reader = new NullableMapReaderImpl(this); + + protected final UInt1Vector bits; + + private final List innerVectors; + + private final Accessor accessor; + private final Mutator mutator; + + public NullableMapVector(String name, BufferAllocator allocator, CallBack callBack) { + super(name, checkNotNull(allocator), callBack); + this.bits = new UInt1Vector("$bits$", allocator); + this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits)); + this.accessor = new Accessor(); + this.mutator = new Mutator(); + } + + @Override + public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { + BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + this.valueCount = fieldNode.getLength(); + } + + @Override + public List getFieldBuffers() { + return BaseDataValueVector.unload(getFieldInnerVectors()); + } + + @Override + public List getFieldInnerVectors() { + return innerVectors; + } + + @Override + public FieldReader getReader() { + return reader; + } + + @Override + public TransferPair getTransferPair(BufferAllocator allocator) { + return new NullableMapTransferPair(this, new NullableMapVector(name, allocator, callBack), false); + } + + @Override + public TransferPair makeTransferPair(ValueVector to) { + return new NullableMapTransferPair(this, (NullableMapVector) to, true); + } + + @Override + public TransferPair getTransferPair(String ref, BufferAllocator allocator) { + return new NullableMapTransferPair(this, new NullableMapVector(ref, allocator, callBack), false); + } + + protected class NullableMapTransferPair extends MapTransferPair { + + private NullableMapVector target; + + protected NullableMapTransferPair(NullableMapVector from, NullableMapVector to, boolean allocate) { + super(from, to, allocate); + this.target = to; + } + + @Override + public void transfer() { + bits.transferTo(target.bits); + super.transfer(); + } + + @Override + public void copyValueSafe(int fromIndex, int toIndex) { + target.bits.copyFromSafe(fromIndex, toIndex, bits); + super.copyValueSafe(fromIndex, toIndex); + } + + @Override + public void splitAndTransfer(int startIndex, int length) { + bits.splitAndTransferTo(startIndex, length, target.bits); + super.splitAndTransfer(startIndex, length); + } + } + + @Override + public int getValueCapacity() { + return Math.min(bits.getValueCapacity(), super.getValueCapacity()); + } + + @Override + public ArrowBuf[] getBuffers(boolean clear) { + return 
ObjectArrays.concat(bits.getBuffers(clear), super.getBuffers(clear), ArrowBuf.class); + } + + @Override + public void close() { + bits.close(); + super.close(); + } + + @Override + public void clear() { + bits.clear(); + super.clear(); + } + + + @Override + public int getBufferSize(){ + return super.getBufferSize() + bits.getBufferSize(); + } + + @Override + public int getBufferSizeFor(final int valueCount) { + if (valueCount == 0) { + return 0; + } + return super.getBufferSizeFor(valueCount) + + bits.getBufferSizeFor(valueCount); + } + + @Override + public void setInitialCapacity(int numRecords) { + bits.setInitialCapacity(numRecords); + super.setInitialCapacity(numRecords); + } + + @Override + public boolean allocateNewSafe() { + /* Boolean to keep track if all the memory allocations were successful + * Used in the case of composite vectors when we need to allocate multiple + * buffers for multiple vectors. If one of the allocations failed we need to + * clear all the memory that we allocated + */ + boolean success = false; + try { + success = super.allocateNewSafe() && bits.allocateNewSafe(); + } finally { + if (!success) { + clear(); + } + } + bits.zeroVector(); + return success; + } + public final class Accessor extends MapVector.Accessor { + final UInt1Vector.Accessor bAccessor = bits.getAccessor(); + + @Override + public Object getObject(int index) { + if (isNull(index)) { + return null; + } else { + return super.getObject(index); + } + } + + @Override + public void get(int index, ComplexHolder holder) { + holder.isSet = isSet(index); + super.get(index, holder); + } + + @Override + public boolean isNull(int index) { + return isSet(index) == 0; + } + + public int isSet(int index){ + return bAccessor.get(index); + } + + } + + public final class Mutator extends MapVector.Mutator implements NullableVectorDefinitionSetter { + + private Mutator(){ + } + + @Override + public void setIndexDefined(int index){ + bits.getMutator().setSafe(index, 1); + } + + public void setNull(int index){ + bits.getMutator().setSafe(index, 0); + } + + @Override + public void setValueCount(int valueCount) { + assert valueCount >= 0; + super.setValueCount(valueCount); + bits.getMutator().setValueCount(valueCount); + } + + @Override + public void generateTestData(int valueCount){ + super.generateTestData(valueCount); + bits.getMutator().generateTestDataAlt(valueCount); + } + + @Override + public void reset(){ + bits.getMutator().setValueCount(0); + } + + } + + @Override + public Accessor getAccessor() { + return accessor; + } + + @Override + public Mutator getMutator() { + return mutator; + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java index 259a954233c..e7c3c8c7e4b 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/AbstractBaseReader.java @@ -19,15 +19,10 @@ import java.util.Iterator; -import com.google.flatbuffers.FlatBufferBuilder; -import org.apache.arrow.flatbuf.Type; -import org.apache.arrow.flatbuf.Union; -import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.holders.UnionHolder; -import org.apache.arrow.vector.types.pojo.Field; abstract 
class AbstractBaseReader implements FieldReader{ @@ -44,7 +39,7 @@ public void setPosition(int index){ this.index = index; } - int idx(){ + protected int idx(){ return index; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java index 89bfefc8f19..761b1b43c08 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/ComplexWriterImpl.java @@ -19,6 +19,7 @@ import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.StateTool; import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; import org.apache.arrow.vector.types.Types.MinorType; @@ -29,7 +30,7 @@ public class ComplexWriterImpl extends AbstractFieldWriter implements ComplexWriter { // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(ComplexWriterImpl.class); - private SingleMapWriter mapRoot; + private NullableMapWriter mapRoot; private UnionListWriter listRoot; private final MapVector container; @@ -121,8 +122,8 @@ public MapWriter directMap(){ switch(mode){ case INIT: - MapVector map = (MapVector) container; - mapRoot = new SingleMapWriter(map); + NullableMapVector map = (NullableMapVector) container; + mapRoot = new NullableMapWriter(map); mapRoot.setPosition(idx()); mode = Mode.MAP; break; @@ -142,8 +143,8 @@ public MapWriter rootAsMap() { switch(mode){ case INIT: - MapVector map = container.addOrGet(name, MinorType.MAP, MapVector.class); - mapRoot = new SingleMapWriter(map); + NullableMapVector map = container.addOrGet(name, MinorType.MAP, NullableMapVector.class); + mapRoot = new NullableMapWriter(map); mapRoot.setPosition(idx()); mode = Mode.MAP; break; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java new file mode 100644 index 00000000000..18b35c194a1 --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java @@ -0,0 +1,45 @@ +/******************************************************************************* + + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.complex.impl; + +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; + +public class NullableMapReaderImpl extends SingleMapReaderImpl { + + private NullableMapVector nullableMapVector; + + public NullableMapReaderImpl(MapVector vector) { + super((NullableMapVector)vector); + this.nullableMapVector = (NullableMapVector)vector; + } + + @Override + public void copyAsValue(MapWriter writer){ + NullableMapWriter impl = (NullableMapWriter) writer; + impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); + } + + @Override + public void copyAsField(String name, MapWriter writer){ + NullableMapWriter impl = (NullableMapWriter) writer.map(name); + impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java index 1c43240901c..ae17b4bbb10 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/SingleMapReaderImpl.java @@ -1,5 +1,3 @@ - - /******************************************************************************* * Licensed to the Apache Software Foundation (ASF) under one @@ -27,9 +25,9 @@ import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.types.Types.MinorType; import com.google.common.collect.Maps; -import org.apache.arrow.vector.types.Types.MinorType; @SuppressWarnings("unused") public class SingleMapReaderImpl extends AbstractFieldReader{ diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 15cd49865bd..9f1efd056cb 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -90,8 +90,7 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { @Override public TypeLayout visit(Tuple type) { List vectors = asList( - // TODO: add validity vector in Map -// validityVector() + validityVector() ); return new TypeLayout(vectors); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 4d0d9ee114a..5eef8a008a9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -47,7 +47,7 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.UnionVector; import org.apache.arrow.vector.complex.impl.BigIntWriterImpl; import org.apache.arrow.vector.complex.impl.BitWriterImpl; @@ -58,7 +58,7 @@ import org.apache.arrow.vector.complex.impl.IntWriterImpl; import org.apache.arrow.vector.complex.impl.IntervalDayWriterImpl; import 
org.apache.arrow.vector.complex.impl.IntervalYearWriterImpl; -import org.apache.arrow.vector.complex.impl.SingleMapWriter; +import org.apache.arrow.vector.complex.impl.NullableMapWriter; import org.apache.arrow.vector.complex.impl.SmallIntWriterImpl; import org.apache.arrow.vector.complex.impl.TimeStampWriterImpl; import org.apache.arrow.vector.complex.impl.TimeWriterImpl; @@ -139,12 +139,12 @@ public Field getField() { @Override public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack callBack, int... precisionScale) { - return new MapVector(name, allocator, callBack); + return new NullableMapVector(name, allocator, callBack); } @Override public FieldWriter getNewFieldWriter(ValueVector vector) { - return new SingleMapWriter((MapVector) vector); + return new NullableMapWriter((NullableMapVector) vector); } }, // an empty map column. Useful for conceptual setup. Children listed within here diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java index 85bb2cfc99f..7dcb8977c0d 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestVectorUnloadLoad.java @@ -22,6 +22,7 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; @@ -60,14 +61,14 @@ public void test() throws IOException { } writer.setValueCount(count); - VectorUnloader vectorUnloader = new VectorUnloader((MapVector)parent.getChild("root")); + VectorUnloader vectorUnloader = new VectorUnloader(parent.getChild("root")); schema = vectorUnloader.getSchema(); try ( ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); BufferAllocator finalVectorsAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); MapVector newParent = new MapVector("parent", finalVectorsAllocator, null)) { - MapVector root = newParent.addOrGet("root", MinorType.MAP, MapVector.class); + FieldVector root = newParent.addOrGet("root", MinorType.MAP, NullableMapVector.class); VectorLoader vectorLoader = new VectorLoader(schema, root); vectorLoader.load(recordBatch); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index 24f00f14df0..689c96fda92 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -25,8 +25,8 @@ import org.apache.arrow.vector.DirtyRootAllocator; import org.apache.arrow.vector.complex.AbstractMapVector; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.UnionVector; -import org.apache.arrow.vector.holders.UInt4Holder; import org.apache.arrow.vector.types.Types.MinorType; import org.junit.After; import org.junit.Before; @@ -51,7 +51,7 @@ public void terminate() throws Exception { public void testPromoteToUnion() throws Exception { try (final AbstractMapVector container = new 
MapVector(EMPTY_SCHEMA_PATH, allocator, null); - final MapVector v = container.addOrGet("test", MinorType.MAP, MapVector.class); + final NullableMapVector v = container.addOrGet("test", MinorType.MAP, NullableMapVector.class); final PromotableWriter writer = new PromotableWriter(v, container)) { container.allocateNew(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index bc17a2b2835..fa710dae5ee 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -17,7 +17,6 @@ */ package org.apache.arrow.vector.complex.writer; -import io.netty.buffer.ArrowBuf; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.ListVector; @@ -41,6 +40,8 @@ import org.junit.Assert; import org.junit.Test; +import io.netty.buffer.ArrowBuf; + public class TestComplexWriter { static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); @@ -71,6 +72,36 @@ public void simpleNestedTypes() { parent.close(); } + @Test + public void nullableMap() { + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + MapWriter mapWriter = rootWriter.map("map"); + BigIntWriter nested = mapWriter.bigInt("nested"); + for (int i = 0; i < COUNT; i++) { + if (i % 2 == 0) { + mapWriter.setPosition(i); + mapWriter.start(); + nested.writeBigInt(i); + mapWriter.end(); + } + } + writer.setValueCount(COUNT); + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < COUNT; i++) { + rootReader.setPosition(i); + if (i % 2 == 0) { + Assert.assertNotNull(rootReader.reader("map").readObject()); + Assert.assertEquals(i, rootReader.reader("map").reader("nested").readLong().longValue()); + } else { + Assert.assertNull(rootReader.reader("map").readObject()); + } + } + + parent.close(); + } + @Test public void listScalarType() { ListVector listVector = new ListVector("list", allocator, null); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index 11de0a2ef00..ad301689cd1 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -31,6 +31,7 @@ import org.apache.arrow.vector.VectorLoader; import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; @@ -47,10 +48,13 @@ import org.junit.Assert; import org.junit.Before; import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import io.netty.buffer.ArrowBuf; public class TestArrowFile { + private static final Logger LOGGER = LoggerFactory.getLogger(TestArrowFile.class); private static final int COUNT = 10; private BufferAllocator allocator; @@ -72,7 +76,7 @@ public void testWrite() throws IOException { BufferAllocator vectorAllocator = 
allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); MapVector parent = new MapVector("parent", vectorAllocator, null)) { writeData(count, parent); - write((MapVector)parent.getChild("root"), file); + write(parent.getChild("root"), file); } } @@ -82,10 +86,10 @@ public void testWriteComplex() throws IOException { int count = COUNT; try ( BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = new MapVector("parent", vectorAllocator, null)) { + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null)) { writeComplexData(count, parent); validateComplexContent(count, parent); - write((MapVector)parent.getChild("root"), file); + write(parent.getChild("root"), file); } } @@ -147,7 +151,7 @@ public void testWriteRead() throws IOException { BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); MapVector parent = new MapVector("parent", originalVectorAllocator, null)) { writeData(count, parent); - write((MapVector)parent.getChild("root"), file); + write(parent.getChild("root"), file); } // read @@ -160,11 +164,11 @@ public void testWriteRead() throws IOException { ) { ArrowFooter footer = arrowReader.readFooter(); Schema schema = footer.getSchema(); - System.out.println("reading schema: " + schema); + LOGGER.debug("reading schema: " + schema); // initialize vectors - MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); + NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); VectorLoader vectorLoader = new VectorLoader(schema, root); @@ -204,7 +208,7 @@ public void testWriteReadComplex() throws IOException { BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); MapVector parent = new MapVector("parent", originalVectorAllocator, null)) { writeComplexData(count, parent); - write((MapVector)parent.getChild("root"), file); + write(parent.getChild("root"), file); } // read @@ -213,16 +217,15 @@ public void testWriteReadComplex() throws IOException { FileInputStream fileInputStream = new FileInputStream(file); ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - MapVector parent = new MapVector("parent", vectorAllocator, null) + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null) ) { ArrowFooter footer = arrowReader.readFooter(); Schema schema = footer.getSchema(); - System.out.println("reading schema: " + schema); + LOGGER.debug("reading schema: " + schema); // initialize vectors - MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); - + NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); VectorLoader vectorLoader = new VectorLoader(schema, root); List recordBatches = footer.getRecordBatches(); @@ -237,16 +240,16 @@ public void testWriteReadComplex() throws IOException { public void printVectors(List vectors) { for (FieldVector vector : vectors) { - System.out.println(vector.getField().getName()); + LOGGER.debug(vector.getField().getName()); Accessor accessor = vector.getAccessor(); int valueCount = accessor.getValueCount(); for (int i = 0; i < valueCount; i++) { - System.out.println(accessor.getObject(i)); + LOGGER.debug(String.valueOf(accessor.getObject(i))); } } } - private void 
validateComplexContent(int count, MapVector parent) { + private void validateComplexContent(int count, NullableMapVector parent) { printVectors(parent.getChildrenFromFields()); MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); @@ -259,10 +262,10 @@ private void validateComplexContent(int count, MapVector parent) { } } - private void write(MapVector parent, File file) throws FileNotFoundException, IOException { + private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { VectorUnloader vectorUnloader = new VectorUnloader(parent); Schema schema = vectorUnloader.getSchema(); - System.out.println("writing schema: " + schema); + LOGGER.debug("writing schema: " + schema); try ( FileOutputStream fileOutputStream = new FileOutputStream(file); ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); @@ -308,8 +311,8 @@ public void testWriteReadMultipleRBs() throws IOException { ) { ArrowFooter footer = arrowReader.readFooter(); Schema schema = footer.getSchema(); - System.out.println("reading schema: " + schema); - MapVector root = parent.addOrGet("root", MinorType.MAP, MapVector.class); + LOGGER.debug("reading schema: " + schema); + NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); VectorLoader vectorLoader = new VectorLoader(schema, root); List recordBatches = footer.getRecordBatches(); Assert.assertEquals(2, recordBatches.size()); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index e557cc84f3b..61327f1970e 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -22,8 +22,6 @@ import static org.junit.Assert.assertEquals; import org.apache.arrow.flatbuf.UnionMode; -import static org.junit.Assert.assertEquals; - import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.List; From 637584becb2db88fc510824c22b87e6effb2232f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 6 Sep 2016 23:59:30 -0400 Subject: [PATCH 127/210] ARROW-284: Disable arrow_parquet module in Travis CI to triage builds Author: Wes McKinney Closes #132 from wesm/ARROW-284 and squashes the following commits: e3410cf [Wes McKinney] Install miniconda in $HOME to avoid long prefix issues in conda-build 2.0 9fd94f5 [Wes McKinney] Do not run death test when valgrind is enabled. 
Gracefully skip pyarrow.parquet when ARROW_PARQUET=off ccf56f8 [Wes McKinney] Disable arrow_parquet module in Travis CI --- ci/travis_before_script_cpp.sh | 4 +-- ci/travis_install_conda.sh | 4 ++- ci/travis_script_python.sh | 6 ++-- cpp/cmake_modules/FindParquet.cmake | 1 + cpp/src/arrow/util/memory-pool-test.cc | 6 ++++ python/CMakeLists.txt | 41 ++++++++++++++++---------- python/cmake_modules/FindArrow.cmake | 26 +++++++++------- python/pyarrow/tests/test_io.py | 1 + python/pyarrow/tests/test_parquet.py | 38 ++++++++++++++++-------- python/pyarrow/tests/test_table.py | 7 +---- python/setup.py | 27 ++++++++++------- 11 files changed, 101 insertions(+), 60 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 08551f3b009..2f02ef247af 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -25,8 +25,8 @@ echo $GTEST_HOME CMAKE_COMMON_FLAGS="\ -DARROW_BUILD_BENCHMARKS=ON \ --DARROW_PARQUET=ON \ --DARROW_HDFS=on \ +-DARROW_PARQUET=OFF \ +-DARROW_HDFS=ON \ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index 3a8f57bf8f1..e9225259e6d 100644 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -9,7 +9,9 @@ else fi wget -O miniconda.sh $MINICONDA_URL -export MINICONDA=$TRAVIS_BUILD_DIR/miniconda + +export MINICONDA=$HOME/miniconda + bash miniconda.sh -b -p $MINICONDA export PATH="$MINICONDA/bin:$PATH" conda update -y -q conda diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 4a377428ae4..61c8e444361 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -5,7 +5,7 @@ set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ -export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +export MINICONDA=$HOME/miniconda export PATH="$MINICONDA/bin:$PATH" export PARQUET_HOME=$MINICONDA @@ -31,7 +31,9 @@ python_version_tests() { # Expensive dependencies install from Continuum package repo conda install -y pip numpy pandas cython - conda install -y parquet-cpp arrow-cpp -c apache/channel/dev + # conda install -y parquet-cpp + + conda install -y arrow-cpp -c apache/channel/dev # Other stuff pip install pip install -r requirements.txt diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index e3350d6e13d..36f4828a999 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -72,6 +72,7 @@ else () endif () mark_as_advanced( + PARQUET_FOUND PARQUET_INCLUDE_DIR PARQUET_LIBS PARQUET_LIBRARIES diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index deb7ffd03ba..e767e955524 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -46,6 +46,10 @@ TEST(DefaultMemoryPool, OOM) { ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); } +// Death tests and valgrind are known to not play well 100% of the time. 
See +// googletest documentation +#ifndef ARROW_VALGRIND + TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { MemoryPool* pool = default_memory_pool(); @@ -60,4 +64,6 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { pool->Free(data, 100); } +#endif // ARROW_VALGRIND + } // namespace arrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fdbfce99656..522895808de 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -340,8 +340,10 @@ if (PYARROW_BUILD_TESTS) endif() ## Parquet -find_package(Parquet REQUIRED) -include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) +find_package(Parquet) +if(PARQUET_FOUND) + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) +endif() ## Arrow find_package(Arrow REQUIRED) @@ -350,8 +352,6 @@ ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) ADD_THIRDPARTY_LIB(arrow_io SHARED_LIB ${ARROW_IO_SHARED_LIB}) -ADD_THIRDPARTY_LIB(arrow_parquet - SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -418,6 +418,16 @@ endif() add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) +set(CYTHON_EXTENSIONS + array + config + error + io + scalar + schema + table +) + set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc @@ -431,9 +441,19 @@ set(PYARROW_SRCS set(LINK_LIBS arrow arrow_io - arrow_parquet ) +if(PARQUET_FOUND AND ARROW_PARQUET_FOUND) + ADD_THIRDPARTY_LIB(arrow_parquet + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) + set(LINK_LIBS + ${LINK_LIBS} + arrow_parquet) + set(CYTHON_EXTENSIONS + ${CYTHON_EXTENSIONS} + parquet) +endif() + SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_library(pyarrow SHARED @@ -448,17 +468,6 @@ endif() # Setup and build Cython modules ############################################################ -set(CYTHON_EXTENSIONS - array - config - error - io - parquet - scalar - schema - table -) - foreach(module ${CYTHON_EXTENSIONS}) string(REPLACE "." 
";" directories ${module}) list(GET directories -1 module_name) diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 6bd305615fc..5d5efc431a4 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -52,7 +52,7 @@ find_library(ARROW_IO_LIB_PATH NAMES arrow_io ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) set(ARROW_IO_LIB_NAME libarrow_io) @@ -64,18 +64,9 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a) set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) - - set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) - set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () - set(ARROW_FOUND FALSE) -endif () - -if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}") message(STATUS "Found the Arrow IO library: ${ARROW_IO_LIB_PATH}") - message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -88,8 +79,23 @@ else () message(STATUS "${ARROW_ERR_MSG}") endif (Arrow_FIND_REQUIRED) endif () + set(ARROW_FOUND FALSE) endif () +if(ARROW_PARQUET_LIB_PATH) + set(ARROW_PARQUET_FOUND TRUE) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") + endif () +else() + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Could not find Arrow Parquet library") + endif() + set(ARROW_PARQUET_FOUND FALSE) +endif() + mark_as_advanced( ARROW_INCLUDE_DIR ARROW_LIBS diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 328e923b941..eb92e8ea93a 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -46,6 +46,7 @@ def hdfs_test_client(): HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + @pytest.fixture(scope='session') def hdfs(request): fixture = hdfs_test_client() diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d89d947b7b6..8a2d8cab572 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -15,33 +15,45 @@ # specific language governing permissions and limitations # under the License. 
-from pyarrow.compat import unittest -import pyarrow as arrow -import pyarrow.parquet +import pytest -A = arrow +import pyarrow as A import numpy as np -import os.path import pandas as pd import pandas.util.testing as pdt +try: + import pyarrow.parquet as pq + HAVE_PARQUET = True +except ImportError: + HAVE_PARQUET = False +# XXX: Make Parquet tests opt-in rather than skip-if-not-build +parquet = pytest.mark.skipif(not HAVE_PARQUET, + reason='Parquet support not built') + + +@parquet def test_single_pylist_column_roundtrip(tmpdir): for dtype in [int, float]: - filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) + filename = tmpdir.join('single_{}_column.parquet' + .format(dtype.__name__)) data = [A.from_pylist(list(map(dtype, range(5))))] table = A.Table.from_arrays(('a', 'b'), data, 'table_name') A.parquet.write_table(table, filename.strpath) - table_read = pyarrow.parquet.read_table(filename.strpath) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + table_read = pq.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), + table_read.itercolumns()): assert col_written.name == col_read.name assert col_read.data.num_chunks == 1 data_written = col_written.data.chunk(0) data_read = col_read.data.chunk(0) assert data_written.equals(data_read) + +@parquet def test_pandas_parquet_2_0_rountrip(tmpdir): size = 10000 np.random.seed(0) @@ -58,17 +70,20 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, # Pandas only support ns resolution, Arrow at the moment only ms - 'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'), + 'datetime': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[ms]'), 'str': [str(x) for x in range(size)], 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None] }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True) A.parquet.write_table(arrow_table, filename.strpath, version="2.0") - table_read = pyarrow.parquet.read_table(filename.strpath) + table_read = pq.read_table(filename.strpath) df_read = table_read.to_pandas() pdt.assert_frame_equal(df, df_read) + +@parquet def test_pandas_parquet_1_0_rountrip(tmpdir): size = 10000 np.random.seed(0) @@ -88,11 +103,10 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df) A.parquet.write_table(arrow_table, filename.strpath, version="1.0") - table_read = pyarrow.parquet.read_table(filename.strpath) + table_read = pq.read_table(filename.strpath) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 df['uint32'] = df['uint32'].values.astype(np.int64) pdt.assert_frame_equal(df, df_read) - diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 83fcbb8faff..abf143199fe 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -16,11 +16,7 @@ # under the License. 
from pyarrow.compat import unittest -import pyarrow as arrow - -A = arrow - -import pandas as pd +import pyarrow as A class TestRowBatch(unittest.TestCase): @@ -76,4 +72,3 @@ def test_pandas(self): assert set(df.columns) == set(('a', 'b')) assert df.shape == (5, 2) assert df.ix[0, 'b'] == -10 - diff --git a/python/setup.py b/python/setup.py index 59410d75a61..a5db2b025e6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -97,6 +97,18 @@ def initialize_options(self): _build_ext.initialize_options(self) self.extra_cmake_args = '' + CYTHON_MODULE_NAMES = [ + 'array', + 'config', + 'error', + 'io', + 'parquet', + 'scalar', + 'schema', + 'table'] + + CYTHON_ALLOWED_FAILURES = ['parquet'] + def _run_cmake(self): # The directory containing this setup.py source = osp.dirname(osp.abspath(__file__)) @@ -172,10 +184,13 @@ def _run_cmake(self): # Move the built C-extension to the place expected by the Python build self._found_names = [] - for name in self.get_cmake_cython_names(): + for name in self.CYTHON_MODULE_NAMES: built_path = self.get_ext_built(name) if not os.path.exists(built_path): print(built_path) + if name in self.CYTHON_ALLOWED_FAILURES: + print('Cython module {0} failure permitted'.format(name)) + continue raise RuntimeError('libpyarrow C-extension failed to build:', os.path.abspath(built_path)) @@ -213,16 +228,6 @@ def get_ext_built(self, name): suffix = sysconfig.get_config_var('SO') return name + suffix - def get_cmake_cython_names(self): - return ['array', - 'config', - 'error', - 'io', - 'parquet', - 'scalar', - 'schema', - 'table'] - def get_names(self): return self._found_names From 214b861ae8f40f5fba544247d40c8995b93eca83 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 7 Sep 2016 00:20:51 -0400 Subject: [PATCH 128/210] ARROW-283: [C++] Account for upstream changes in parquet-cpp Author: Wes McKinney Closes #131 from wesm/ARROW-283 and squashes the following commits: 52dfb28 [Wes McKinney] Update arrow_parquet for API changes in parquet-cpp --- cpp/src/arrow/parquet/reader.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 9f6212570dc..440ec84e2c7 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -149,11 +149,13 @@ bool FileReader::Impl::CheckForFlatColumn(const ::parquet::ColumnDescriptor* des } Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr* out) { - if (!CheckForFlatColumn(reader_->descr()->Column(i))) { + const ::parquet::SchemaDescriptor* schema = reader_->metadata()->schema_descriptor(); + + if (!CheckForFlatColumn(schema->Column(i))) { return Status::Invalid("The requested column is not flat"); } std::unique_ptr impl( - new FlatColumnReader::Impl(pool_, reader_->descr()->Column(i), reader_.get(), i)); + new FlatColumnReader::Impl(pool_, schema->Column(i), reader_.get(), i)); *out = std::unique_ptr(new FlatColumnReader(std::move(impl))); return Status::OK(); } @@ -161,16 +163,20 @@ Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr* Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { std::unique_ptr flat_column_reader; RETURN_NOT_OK(GetFlatColumn(i, &flat_column_reader)); - return flat_column_reader->NextBatch(reader_->num_rows(), out); + return flat_column_reader->NextBatch(reader_->metadata()->num_rows(), out); } Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>
* table) { - const std::string& name = reader_->descr()->schema()->name(); + auto descr = reader_->metadata()->schema_descriptor(); + + const std::string& name = descr->schema()->name(); std::shared_ptr schema; - RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema)); + RETURN_NOT_OK(FromParquetSchema(descr, &schema)); + + int num_columns = reader_->metadata()->num_columns(); - std::vector> columns(reader_->num_columns()); - for (int i = 0; i < reader_->num_columns(); i++) { + std::vector> columns(num_columns); + for (int i = 0; i < num_columns; i++) { std::shared_ptr array; RETURN_NOT_OK(ReadFlatColumn(i, &array)); columns[i] = std::make_shared(schema->field(i), array); @@ -375,7 +381,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr* } void FlatColumnReader::Impl::NextRowGroup() { - if (next_row_group_ < reader_->num_row_groups()) { + if (next_row_group_ < reader_->metadata()->num_row_groups()) { column_reader_ = reader_->RowGroup(next_row_group_)->Column(column_index_); next_row_group_++; } else { From 270ab4e94dba3ec45cfd2297d4f901d51d4a053b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 7 Sep 2016 14:15:16 -0700 Subject: [PATCH 129/210] ARROW-278: [Format] Rename Tuple to Struct_ in flatbuffers IDL "Struct" is a reserved keyword in generated bindings for C++. We had used "Tuple" to sidestep this but we discussed and decided to mangle "Struct" instead in the Flatbuffers. Author: Wes McKinney Closes #130 from wesm/ARROW-278 and squashes the following commits: 841a721 [Wes McKinney] Rename Tuple to Struct_ in flatbuffers IDL --- cpp/src/arrow/ipc/metadata-internal.cc | 6 +++--- format/Message.fbs | 10 +++++----- java/vector/src/main/codegen/data/ArrowTypes.tdd | 2 +- .../org/apache/arrow/vector/complex/MapVector.java | 4 ++-- .../org/apache/arrow/vector/schema/TypeLayout.java | 6 +++--- .../main/java/org/apache/arrow/vector/types/Types.java | 4 ++-- .../java/org/apache/arrow/vector/pojo/TestConvert.java | 6 +++--- 7 files changed, 19 insertions(+), 19 deletions(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index c921e4d8e01..1c15218c0ba 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -115,7 +115,7 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, } *out = std::make_shared(children[0]); return Status::OK(); - case flatbuf::Type_Tuple: + case flatbuf::Type_Struct_: *out = std::make_shared(children); return Status::OK(); case flatbuf::Type_Union: @@ -153,7 +153,7 @@ static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type RETURN_NOT_OK(FieldToFlatbuffer(fbb, type->child(i), &field)); out_children->push_back(field); } - *offset = flatbuf::CreateTuple(fbb).Union(); + *offset = flatbuf::CreateStruct_(fbb).Union(); return Status::OK(); } @@ -197,7 +197,7 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, *out_type = flatbuf::Type_List; return ListToFlatbuffer(fbb, type, children, offset); case Type::STRUCT: - *out_type = flatbuf::Type_Tuple; + *out_type = flatbuf::Type_Struct_; return StructToFlatbuffer(fbb, type, children, offset); default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy diff --git a/format/Message.fbs b/format/Message.fbs index 9c957248977..78bdaeb35f5 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -8,10 +8,10 @@ namespace org.apache.arrow.flatbuf; table Null { } -/// A Tuple in the flatbuffer metadata is the same as an Arrow Struct -/// (according 
to the physical memory layout). We used Tuple here as Struct is -/// a reserved word in Flatbuffers -table Tuple { +/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct +/// (according to the physical memory layout). We used Struct_ here as +/// Struct is a reserved word in Flatbuffers +table Struct_ { } table List { @@ -87,7 +87,7 @@ union Type { IntervalDay, IntervalYear, List, - Tuple, + Struct_, Union, JSONScalar } diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 2ecad3d3140..5cb43bed2b6 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -21,7 +21,7 @@ fields: [] }, { - name: "Tuple", + name: "Struct_", fields: [] }, { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java index 1b8483a3d41..aaecb956434 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/MapVector.java @@ -34,7 +34,7 @@ import org.apache.arrow.vector.holders.ComplexHolder; import org.apache.arrow.vector.types.Types; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.Field; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.util.JsonStringHashMap; @@ -290,7 +290,7 @@ public Field getField() { for (ValueVector child : getChildren()) { children.add(child.getField()); } - return new Field(name, false, Tuple.INSTANCE, children); + return new Field(name, false, Struct_.INSTANCE, children); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 9f1efd056cb..885ac2ac3d7 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -45,7 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; -import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; @@ -54,7 +54,7 @@ /** * The layout of vectors for a given type * It defines its own vectors followed by the vectors for the children - * if it is a nested type (Tuple, List, Union) + * if it is a nested type (Struct_, List, Union) */ public class TypeLayout { @@ -88,7 +88,7 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { return new TypeLayout(vectors); } - @Override public TypeLayout visit(Tuple type) { + @Override public TypeLayout visit(Struct_ type) { List vectors = asList( validityVector() ); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 5eef8a008a9..66ef7562ced 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -84,7 +84,7 @@ import 
org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; -import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; @@ -131,7 +131,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return null; } }, - MAP(Tuple.INSTANCE) { + MAP(Struct_.INSTANCE) { @Override public Field getField() { throw new UnsupportedOperationException("Cannot get simple field for Map type"); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 61327f1970e..448117d84dc 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -26,7 +26,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.List; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; -import org.apache.arrow.vector.types.pojo.ArrowType.Tuple; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; @@ -53,7 +53,7 @@ public void complex() { childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); childrenBuilder.add(new Field("child2", true, new FloatingPoint(SINGLE), ImmutableList.of())); - Field initialField = new Field("a", true, Tuple.INSTANCE, childrenBuilder.build()); + Field initialField = new Field("a", true, Struct_.INSTANCE, childrenBuilder.build()); run(initialField); } @@ -71,7 +71,7 @@ public void nestedSchema() { ImmutableList.Builder childrenBuilder = ImmutableList.builder(); childrenBuilder.add(new Field("child1", true, Utf8.INSTANCE, null)); childrenBuilder.add(new Field("child2", true, new FloatingPoint(SINGLE), ImmutableList.of())); - childrenBuilder.add(new Field("child3", true, new Tuple(), ImmutableList.of( + childrenBuilder.add(new Field("child3", true, new Struct_(), ImmutableList.of( new Field("child3.1", true, Utf8.INSTANCE, null), new Field("child3.2", true, new FloatingPoint(DOUBLE), ImmutableList.of()) ))); From 52089d609dff3d8d2abe99c7b94f7af9fe4735bd Mon Sep 17 00:00:00 2001 From: Laurent Goujon Date: Thu, 8 Sep 2016 11:35:08 -0700 Subject: [PATCH 130/210] ARROW-285: Optional flatc download For platforms which don't have a flatc compiler artifact on maven central, allow to skip the download and manually provide a flatc compiler usage: mvn -Dflatc.download.skip -Dflatc.executable=/usr/local/bin/flatc Author: Laurent Goujon Closes #129 from laurentgo/laurent/optional-flatc-download and squashes the following commits: 229c6d5 [Laurent Goujon] Optional flatc download --- java/format/pom.xml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/java/format/pom.xml b/java/format/pom.xml index dc5897581b5..4cf68bbe057 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -25,6 +25,8 @@ 1.2.0-3f79e055 + false + ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe 3.3 2.10 1.5.0.Final @@ -71,6 +73,7 @@ ${project.build.directory} + ${flatc.download.skip} @@ -92,6 +95,7 
@@ +x ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe + ${flatc.download.skip} @@ -100,11 +104,11 @@ generate-sources - ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe + ${flatc.executable} -j -o - target/generated-sources/ + target/generated-sources/flatc ../../format/Message.fbs ../../format/File.fbs From a5f28617499a63ec44886bed35253f790e3674e1 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Thu, 8 Sep 2016 22:58:37 -0400 Subject: [PATCH 131/210] ARROW-286: Build thirdparty dependencies in parallel Author: Uwe L. Korn Closes #133 from xhochy/ARROW-286 and squashes the following commits: cb5a990 [Uwe L. Korn] ARROW-286: Build thirdparty dependencies in parallel --- cpp/thirdparty/build_thirdparty.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index f1738ff7482..6cc776d0904 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -62,7 +62,7 @@ if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then CXXFLAGS=-fPIC cmake . || { echo "cmake $GOOGLETEST_ERROR"; exit 1; } fi - make VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } + make -j$PARALLEL VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } fi # build google benchmark @@ -76,7 +76,7 @@ if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then fi cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_CXX_FLAGS="-fPIC $CMAKE_CXX_FLAGS" . || { echo "cmake $GBENCHMARK_ERROR" ; exit 1; } - make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } + make -j$PARALLEL VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } fi FLATBUFFERS_ERROR="failed for flatbuffers" From 077c72bc6adf07c5311785596cb03088ae11ae5e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 9 Sep 2016 00:02:35 -0400 Subject: [PATCH 132/210] ARROW-256: [Format] Add a version number to the IPC/RPC metadata See "Schema evolution examples" in https://google.github.io/flatbuffers/flatbuffers_guide_writing_schema.html. In the future, if we need to add some other message types (like `RecordBatchV2`), then this should permit this without too much trouble. 
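Concretely, a reader built against these definitions can gate on the new field before trusting the rest of a message. A minimal sketch against the flatc-generated C++ bindings for the Message.fbs change in the diff below; the helper itself is illustrative, not code from this patch:

// version() is the accessor flatc generates for the new "version" field;
// CheckMessageVersion is a hypothetical helper, not part of the patch.
Status CheckMessageVersion(const flatbuf::Message* message) {
  if (message->version() != flatbuf::MetadataVersion_V1_SNAPSHOT) {
    return Status::Invalid("Unsupported IPC metadata version");
  }
  return Status::OK();
}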
Author: Wes McKinney Closes #125 from wesm/ARROW-256 and squashes the following commits: 60ee5c0 [Wes McKinney] Rename current version to V1_SNAPSHOT to reflect changing nature bab2749 [Wes McKinney] Add a version number / enum to the Message and File metadata --- cpp/src/arrow/ipc/metadata-internal.cc | 3 ++- cpp/src/arrow/ipc/metadata-internal.h | 3 +++ format/File.fbs | 1 + format/Message.fbs | 5 +++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 1c15218c0ba..8cc902c2967 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -295,7 +295,8 @@ Status WriteDataHeader(int32_t length, int64_t body_length, } Status MessageBuilder::Finish() { - auto message = flatbuf::CreateMessage(fbb_, header_type_, header_, body_length_); + auto message = flatbuf::CreateMessage(fbb_, kMetadataVersion, + header_type_, header_, body_length_); fbb_.Finish(message); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index 5faa8c947b5..db9a83f6a8d 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -37,6 +37,9 @@ class Status; namespace ipc { +static constexpr flatbuf::MetadataVersion kMetadataVersion = + flatbuf::MetadataVersion_V1_SNAPSHOT; + Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* out); class MessageBuilder { diff --git a/format/File.fbs b/format/File.fbs index f7ad1e1594a..a29bbc694bc 100644 --- a/format/File.fbs +++ b/format/File.fbs @@ -7,6 +7,7 @@ namespace org.apache.arrow.flatbuf; /// table Footer { + version: org.apache.arrow.flatbuf.MetadataVersion; schema: org.apache.arrow.flatbuf.Schema; diff --git a/format/Message.fbs b/format/Message.fbs index 78bdaeb35f5..657904a7032 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -1,5 +1,9 @@ namespace org.apache.arrow.flatbuf; +enum MetadataVersion:short { + V1_SNAPSHOT +} + /// ---------------------------------------------------------------------- /// Logical types and their metadata (if any) /// @@ -237,6 +241,7 @@ union MessageHeader { } table Message { + version: org.apache.arrow.flatbuf.MetadataVersion; header: MessageHeader; bodyLength: long; } From 6b8abb4402ff1f39fc5944a7df6e3b4755691d87 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 12 Sep 2016 23:15:10 -0400 Subject: [PATCH 133/210] ARROW-289: Install test-util.h Author: Uwe L. Korn Closes #135 from xhochy/arrow-289 and squashes the following commits: 5e4aadf [Uwe L. 
Korn] ARROW-289: Install test-util.h --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/test-util.h | 12 ++++++------ cpp/src/arrow/util/CMakeLists.txt | 1 + cpp/src/arrow/util/bit-util.h | 4 +++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 2d42edcfbd4..a9b2feca28c 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -24,6 +24,7 @@ install(FILES schema.h table.h type.h + test-util.h DESTINATION include/arrow) ####################################### diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 055dac74444..e632ffb1d89 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -40,22 +40,22 @@ #define ASSERT_RAISES(ENUM, expr) \ do { \ - Status s = (expr); \ + ::arrow::Status s = (expr); \ if (!s.Is##ENUM()) { FAIL() << s.ToString(); } \ } while (0) #define ASSERT_OK(expr) \ do { \ - Status s = (expr); \ + ::arrow::Status s = (expr); \ if (!s.ok()) { FAIL() << s.ToString(); } \ } while (0) #define ASSERT_OK_NO_THROW(expr) ASSERT_NO_THROW(ASSERT_OK(expr)) -#define EXPECT_OK(expr) \ - do { \ - Status s = (expr); \ - EXPECT_TRUE(s.ok()); \ +#define EXPECT_OK(expr) \ + do { \ + ::arrow::Status s = (expr); \ + EXPECT_TRUE(s.ok()); \ } while (0) namespace arrow { diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 13c0d7514fe..fd23c1aa3b8 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -26,6 +26,7 @@ install(FILES logging.h macros.h memory-pool.h + random.h status.h visibility.h DESTINATION include/arrow/util) diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index a6c8dd904d8..873a1959865 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -22,6 +22,8 @@ #include #include +#include "arrow/util/visibility.h" + namespace arrow { class Buffer; @@ -76,7 +78,7 @@ static inline bool is_multiple_of_64(int64_t n) { } void bytes_to_bits(const std::vector& bytes, uint8_t* bits); -Status bytes_to_bits(const std::vector&, std::shared_ptr*); +ARROW_EXPORT Status bytes_to_bits(const std::vector&, std::shared_ptr*); } // namespace util From 6f99156c3bb01329e33f74a57d9aaff1ed8304bc Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 13 Sep 2016 11:13:54 -0700 Subject: [PATCH 134/210] ARROW-287: Make nullable vectors use a BitVecor instead of UInt1Vector for bits Author: Julien Le Dem Closes #134 from julienledem/bits and squashes the following commits: d4e5084 [Julien Le Dem] add nullable vector test that verifies Bit based buffers 15fde9d [Julien Le Dem] ARROW-287: Make nullable vectors use a BitVecor instead of UInt1Vector for bits --- .../templates/NullableValueVectors.java | 6 +- .../org/apache/arrow/vector/BitVector.java | 16 ++++- .../arrow/vector/complex/ListVector.java | 6 +- .../vector/complex/NullableMapVector.java | 8 +-- .../apache/arrow/vector/TestValueVector.java | 67 +++++++++++++++++-- 5 files changed, 87 insertions(+), 16 deletions(-) diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index bb2c0012160..486cfeefc7a 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -53,7 +53,7 @@ public final class ${className} extends BaseDataValueVector implements <#if type private final String valuesField = "$values$"; private 
final Field field; - final UInt1Vector bits = new UInt1Vector(bitsField, allocator); + final BitVector bits = new BitVector(bitsField, allocator); final ${valuesName} values; private final Mutator mutator; @@ -446,7 +446,7 @@ public void copyFromSafe(int fromIndex, int thisIndex, Nullable${minor.class}Vec } public final class Accessor extends BaseDataValueVector.BaseAccessor <#if type.major = "VarLen">implements VariableWidthVector.VariableWidthAccessor { - final UInt1Vector.Accessor bAccessor = bits.getAccessor(); + final BitVector.Accessor bAccessor = bits.getAccessor(); final ${valuesName}.Accessor vAccessor = values.getAccessor(); /** @@ -545,7 +545,7 @@ public void setIndexDefined(int index){ public void set(int index, <#if type.major == "VarLen">byte[]<#elseif (type.width < 4)>int<#else>${minor.javaType!type.javaType} value) { setCount++; final ${valuesName}.Mutator valuesMutator = values.getMutator(); - final UInt1Vector.Mutator bitsMutator = bits.getMutator(); + final BitVector.Mutator bitsMutator = bits.getMutator(); <#if type.major == "VarLen"> for (int i = lastSet + 1; i < index; i++) { valuesMutator.set(i, emptyByteArray); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java index fee6e9cdef7..c12db5045c2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BitVector.java @@ -17,8 +17,6 @@ */ package org.apache.arrow.vector; -import io.netty.buffer.ArrowBuf; - import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.complex.reader.FieldReader; @@ -29,6 +27,8 @@ import org.apache.arrow.vector.util.OversizedAllocationException; import org.apache.arrow.vector.util.TransferPair; +import io.netty.buffer.ArrowBuf; + /** * Bit implements a vector of bit-width values. Elements in the vector are accessed by position from the logical start * of the vector. The width of each element is 1 bit. 
The equivalent Java primitive is an int containing the value '0' @@ -435,6 +435,18 @@ public final void generateTestData(int values) { setValueCount(values); } + public void generateTestDataAlt(int size) { + setValueCount(size); + boolean even = true; + final int valueCount = getAccessor().getValueCount(); + for(int i = 0; i < valueCount; i++, even = !even) { + if(even){ + set(i, (byte) 1); + }else{ + set(i, (byte) 0); + } + } + } } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java index 2984c362514..dd99c734f7f 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/ListVector.java @@ -28,9 +28,9 @@ import org.apache.arrow.memory.OutOfMemoryException; import org.apache.arrow.vector.AddOrGetResult; import org.apache.arrow.vector.BaseDataValueVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.UInt4Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.ZeroVector; @@ -55,7 +55,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector { final UInt4Vector offsets; - final UInt1Vector bits; + final BitVector bits; private final List innerVectors; private Mutator mutator = new Mutator(); private Accessor accessor = new Accessor(); @@ -65,7 +65,7 @@ public class ListVector extends BaseRepeatedValueVector implements FieldVector { public ListVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, allocator); - this.bits = new UInt1Vector("$bits$", allocator); + this.bits = new BitVector("$bits$", allocator); this.offsets = getOffsetVector(); this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits, offsets)); this.writer = new UnionListWriter(this); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java index 6b257c095d2..8e1bbfabdc9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/NullableMapVector.java @@ -25,10 +25,10 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.BaseDataValueVector; +import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.BufferBacked; import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.NullableVectorDefinitionSetter; -import org.apache.arrow.vector.UInt1Vector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.complex.impl.NullableMapReaderImpl; import org.apache.arrow.vector.complex.reader.FieldReader; @@ -45,7 +45,7 @@ public class NullableMapVector extends MapVector implements FieldVector { private final NullableMapReaderImpl reader = new NullableMapReaderImpl(this); - protected final UInt1Vector bits; + protected final BitVector bits; private final List innerVectors; @@ -54,7 +54,7 @@ public class NullableMapVector extends MapVector implements FieldVector { public NullableMapVector(String name, BufferAllocator allocator, CallBack callBack) { super(name, checkNotNull(allocator), callBack); - this.bits = new UInt1Vector("$bits$", allocator); + this.bits = new BitVector("$bits$", 
allocator); this.innerVectors = Collections.unmodifiableList(Arrays.asList(bits)); this.accessor = new Accessor(); this.mutator = new Mutator(); @@ -186,7 +186,7 @@ public boolean allocateNewSafe() { return success; } public final class Accessor extends MapVector.Accessor { - final UInt1Vector.Accessor bAccessor = bits.getAccessor(); + final BitVector.Accessor bAccessor = bits.getAccessor(); @Override public Object getObject(int index) { diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java index 21cdc4f4d8d..124452e96ee 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java @@ -17,17 +17,23 @@ */ package org.apache.arrow.vector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.nio.charset.Charset; +import java.util.List; + import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.schema.TypeLayout; import org.apache.arrow.vector.types.Types.MinorType; -import org.apache.arrow.vector.util.OversizedAllocationException; +import org.apache.arrow.vector.types.pojo.Field; import org.junit.After; import org.junit.Before; import org.junit.Test; -import java.nio.charset.Charset; - -import static org.junit.Assert.*; +import io.netty.buffer.ArrowBuf; public class TestValueVector { @@ -223,6 +229,59 @@ public void testNullableFloat() { } } + @Test + public void testNullableInt() { + // Create a new value vector for 1024 integers + try (final NullableIntVector vector = (NullableIntVector) MinorType.INT.getNewVector(EMPTY_SCHEMA_PATH, allocator, null)) { + final NullableIntVector.Mutator m = vector.getMutator(); + vector.allocateNew(1024); + + // Put and set a few values. + m.set(0, 1); + m.set(1, 2); + m.set(100, 3); + m.set(1022, 4); + m.set(1023, 5); + + m.setValueCount(1024); + + final NullableIntVector.Accessor accessor = vector.getAccessor(); + assertEquals(1, accessor.get(0)); + assertEquals(2, accessor.get(1)); + assertEquals(3, accessor.get(100)); + assertEquals(4, accessor.get(1022)); + assertEquals(5, accessor.get(1023)); + + // Ensure null values. 
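The assertions that follow are pure bit arithmetic: with one validity bit per value, bit i lands in byte i / 8 under mask 1 << (i % 8). A compile-time restatement of the expected values, as a C++ sketch rather than part of the patch:

// Validity-bitmap arithmetic behind the assertions below.
constexpr int ByteIndex(int bit) { return bit / 8; }       // byte holding the bit
constexpr int BitMask(int bit) { return 1 << (bit % 8); }  // mask within that byte
static_assert((BitMask(0) | BitMask(1)) == 3, "bits 0 and 1 set -> byte 0 reads 3");
static_assert(ByteIndex(100) == 12 && BitMask(100) == 16, "bit 100 -> byte 12, mask 16");
static_assert(BitMask(1022) + BitMask(1023) == 192, "bits 1022,1023 -> 0xC0, i.e. -64 signed");
static_assert((1024 + 7) / 8 == 128, "1024 validity bits pack into 128 bytes");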
+ assertTrue(vector.getAccessor().isNull(3)); + + Field field = vector.getField(); + TypeLayout typeLayout = field.getTypeLayout(); + + List buffers = vector.getFieldBuffers(); + + assertEquals(2, typeLayout.getVectors().size()); + assertEquals(2, buffers.size()); + + ArrowBuf validityVectorBuf = buffers.get(0); + assertEquals(128, validityVectorBuf.readableBytes()); + assertEquals(3, validityVectorBuf.getByte(0)); // 1st and second bit defined + for (int i = 1; i < 12; i++) { + assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined until 100 + } + assertEquals(16, validityVectorBuf.getByte(12)); // 100th bit is defined (12 * 8 + 4) + for (int i = 13; i < 127; i++) { + assertEquals(0, validityVectorBuf.getByte(i)); // nothing defined between 100th and 1022nd + } + assertEquals(-64, validityVectorBuf.getByte(127)); // 1022nd and 1023rd bit defined + + vector.allocateNew(2048); + // vector has been erased + assertTrue(vector.getAccessor().isNull(0)); + } + } + + @Test public void testBitVector() { // Create a new value vector for 1024 integers From 3487c2f0cdc2297a80ba3525c192745313b3da48 Mon Sep 17 00:00:00 2001 From: adeneche Date: Wed, 14 Sep 2016 14:46:27 -0700 Subject: [PATCH 135/210] ARROW-292: [Java] Upgrade Netty to 4.0.41 this closes #137 --- .../main/java/io/netty/buffer/ArrowBuf.java | 2 +- .../netty/buffer/PooledByteBufAllocatorL.java | 2 +- .../buffer/UnsafeDirectLittleEndian.java | 30 +++++++++++++++---- java/pom.xml | 2 +- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index d10f00247e6..b7a268a0070 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -452,7 +452,7 @@ public String toString(int index, int length, Charset charset) { return ""; } - return ByteBufUtil.decodeString(nioBuffer(index, length), charset); + return ByteBufUtil.decodeString(this, index, length, charset); } @Override diff --git a/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java index 0b6e3f7f839..f6feb65cccd 100644 --- a/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java +++ b/java/memory/src/main/java/io/netty/buffer/PooledByteBufAllocatorL.java @@ -145,7 +145,7 @@ public boolean matches(String name, Metric metric) { } private UnsafeDirectLittleEndian newDirectBufferL(int initialCapacity, int maxCapacity) { - PoolThreadCache cache = threadCache.get(); + PoolThreadCache cache = threadCache(); PoolArena directArena = cache.directArena; if (directArena != null) { diff --git a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java index a94c6d19883..dc93602100e 100644 --- a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java +++ b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java @@ -20,6 +20,9 @@ import io.netty.util.internal.PlatformDependent; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; import java.nio.ByteOrder; import java.util.concurrent.atomic.AtomicLong; @@ -93,11 +96,6 @@ public ByteBuf slice(int index, int length) { return new SlicedByteBuf(this, index, length); } - @Override - public ByteOrder order() { - return ByteOrder.LITTLE_ENDIAN; - } - @Override public ByteBuf order(ByteOrder endianness) 
{ return this; @@ -254,6 +252,28 @@ public boolean release(int decrement) { return released; } + @Override + public int setBytes(int index, InputStream in, int length) throws IOException { + wrapped.checkIndex(index, length); + byte[] tmp = new byte[length]; + int readBytes = in.read(tmp); + if (readBytes > 0) { + PlatformDependent.copyMemory(tmp, 0, addr(index), readBytes); + } + return readBytes; + } + + @Override + public ByteBuf getBytes(int index, OutputStream out, int length) throws IOException { + wrapped.checkIndex(index, length); + if (length != 0) { + byte[] tmp = new byte[length]; + PlatformDependent.copyMemory(addr(index), tmp, 0, length); + out.write(tmp); + } + return this; + } + @Override public int hashCode() { return System.identityHashCode(this); diff --git a/java/pom.xml b/java/pom.xml index 8eb25af7545..a8e24ed054c 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -395,7 +395,7 @@ io.netty netty-handler - 4.0.27.Final + 4.0.41.Final From 17e90e1d88266ea224244647831f49d5bd1dac72 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 16 Sep 2016 16:01:17 -0700 Subject: [PATCH 136/210] ARROW-290: Specialize alloc() in ArrowBuf Author: Julien Le Dem Closes #136 from julienledem/alloc and squashes the following commits: a19d16f [Julien Le Dem] ARROW-290: Specialize alloc() in ArrowBuf --- .../src/main/java/io/netty/buffer/ArrowBuf.java | 9 +++++---- .../io/netty/buffer/UnsafeDirectLittleEndian.java | 2 ++ .../apache/arrow/memory/ArrowByteBufAllocator.java | 4 ++++ .../java/org/apache/arrow/memory/BaseAllocator.java | 11 +++++------ 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index b7a268a0070..a5989c1518d 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicLong; import org.apache.arrow.memory.AllocationManager.BufferLedger; +import org.apache.arrow.memory.ArrowByteBufAllocator; import org.apache.arrow.memory.BaseAllocator; import org.apache.arrow.memory.BaseAllocator.Verbosity; import org.apache.arrow.memory.BoundsChecking; @@ -52,7 +53,7 @@ public final class ArrowBuf extends AbstractByteBuf implements AutoCloseable { private final int offset; private final BufferLedger ledger; private final BufferManager bufManager; - private final ByteBufAllocator alloc; + private final ArrowByteBufAllocator alloc; private final boolean isEmpty; private volatile int length; private final HistoricalLog historicalLog = BaseAllocator.DEBUG ? 
@@ -63,7 +64,7 @@ public ArrowBuf( final BufferLedger ledger, final UnsafeDirectLittleEndian byteBuf, final BufferManager manager, - final ByteBufAllocator alloc, + final ArrowByteBufAllocator alloc, final int offset, final int length, boolean isEmpty) { @@ -297,8 +298,8 @@ public synchronized ArrowBuf capacity(int newCapacity) { } @Override - public ByteBufAllocator alloc() { - return udle.alloc(); + public ArrowByteBufAllocator alloc() { + return alloc; } @Override diff --git a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java index dc93602100e..023a6a2892b 100644 --- a/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java +++ b/java/memory/src/main/java/io/netty/buffer/UnsafeDirectLittleEndian.java @@ -26,6 +26,8 @@ import java.nio.ByteOrder; import java.util.concurrent.atomic.AtomicLong; +import io.netty.util.internal.PlatformDependent; + /** * The underlying class we use for little-endian access to memory. Is used underneath ArrowBufs to abstract away the * Netty classes and underlying Netty memory management. diff --git a/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java index f3f72fa57c3..5dc5ac397bd 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/ArrowByteBufAllocator.java @@ -39,6 +39,10 @@ public ArrowByteBufAllocator(BufferAllocator allocator) { this.allocator = allocator; } + public BufferAllocator unwrap() { + return allocator; + } + @Override public ByteBuf buffer() { return buffer(DEFAULT_BUFFER_SIZE); diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index f1503c902d0..dbb0705045c 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -17,10 +17,6 @@ */ package org.apache.arrow.memory; -import io.netty.buffer.ArrowBuf; -import io.netty.buffer.ByteBufAllocator; -import io.netty.buffer.UnsafeDirectLittleEndian; - import java.util.Arrays; import java.util.IdentityHashMap; import java.util.Set; @@ -33,6 +29,9 @@ import com.google.common.base.Preconditions; +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.UnsafeDirectLittleEndian; + public abstract class BaseAllocator extends Accountant implements BufferAllocator { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(BaseAllocator.class); @@ -47,7 +46,7 @@ public abstract class BaseAllocator extends Accountant implements BufferAllocato private final Object DEBUG_LOCK = DEBUG ? 
new Object() : null; private final BaseAllocator parentAllocator; - private final ByteBufAllocator thisAsByteBufAllocator; + private final ArrowByteBufAllocator thisAsByteBufAllocator; private final IdentityHashMap childAllocators; private final ArrowBuf empty; @@ -247,7 +246,7 @@ private ArrowBuf bufferWithoutReservation(final int size, BufferManager bufferMa } @Override - public ByteBufAllocator getAsByteBufAllocator() { + public ArrowByteBufAllocator getAsByteBufAllocator() { return thisAsByteBufAllocator; } From 559b865226ec0f5d78e87957c2ff0f7711bec9a8 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 18 Sep 2016 16:01:58 -0400 Subject: [PATCH 137/210] ARROW-280: [C++] Refactor IPC / memory map IO to use common arrow_io interfaces. Create arrow_ipc leaf library Several things here * Clean up IO interface class structure to be able to indicate precise characteristics of an implementation * Make the IPC reader/writer use more generic interfaces -- writing only needs an output stream, reading only needs a random access reader. This will unblock ARROW-267 * Create a separate arrow_ipc shared library Author: Wes McKinney Closes #138 from wesm/ARROW-280 and squashes the following commits: 6a59eb6 [Wes McKinney] * Restructure IO interfaces to accommodate more configurations. * Refactor memory mapped IO interfaces to be in line with other arrow::io classes. * Split arrow_ipc into a leaf library * Refactor pyarrow and arrow_parquet to suit. Move BufferReader to arrow_io. Pyarrow parquet tests currently segfault --- cpp/CMakeLists.txt | 6 - cpp/src/arrow/io/CMakeLists.txt | 11 +- cpp/src/arrow/io/hdfs.cc | 35 ++- cpp/src/arrow/io/hdfs.h | 29 +- cpp/src/arrow/io/interfaces.h | 71 ++++- .../io/{hdfs-io-test.cc => io-hdfs-test.cc} | 2 +- .../io-memory-test.cc} | 50 ++-- cpp/src/arrow/io/libhdfs_shim.cc | 3 +- cpp/src/arrow/io/memory.cc | 262 ++++++++++++++++++ cpp/src/arrow/io/memory.h | 130 +++++++++ cpp/src/arrow/io/test-common.h | 63 +++++ cpp/src/arrow/ipc/CMakeLists.txt | 58 +++- cpp/src/arrow/ipc/adapter.cc | 61 ++-- cpp/src/arrow/ipc/adapter.h | 39 +-- cpp/src/arrow/ipc/ipc-adapter-test.cc | 33 ++- cpp/src/arrow/ipc/memory.cc | 182 ------------ cpp/src/arrow/ipc/memory.h | 150 ---------- cpp/src/arrow/ipc/metadata-internal.cc | 9 +- cpp/src/arrow/ipc/metadata-internal.h | 2 +- cpp/src/arrow/ipc/metadata.h | 11 +- cpp/src/arrow/ipc/symbols.map | 18 ++ cpp/src/arrow/ipc/test-common.h | 25 -- cpp/src/arrow/ipc/util.h | 56 ++++ cpp/src/arrow/parquet/CMakeLists.txt | 1 + cpp/src/arrow/parquet/io.cc | 4 +- cpp/src/arrow/parquet/io.h | 4 +- cpp/src/arrow/parquet/parquet-io-test.cc | 51 +--- cpp/src/arrow/parquet/parquet-schema-test.cc | 3 +- cpp/src/arrow/parquet/reader.cc | 8 +- cpp/src/arrow/parquet/reader.h | 2 +- cpp/src/arrow/parquet/schema.cc | 2 +- cpp/src/arrow/parquet/writer.cc | 2 +- cpp/src/arrow/type.h | 4 +- cpp/src/arrow/util/memory-pool-test.cc | 2 +- python/pyarrow/includes/libarrow_io.pxd | 42 ++- python/pyarrow/includes/parquet.pxd | 18 +- python/pyarrow/io.pxd | 7 +- python/pyarrow/io.pyx | 14 +- python/pyarrow/parquet.pyx | 6 +- 39 files changed, 873 insertions(+), 603 deletions(-) rename cpp/src/arrow/io/{hdfs-io-test.cc => io-hdfs-test.cc} (99%) rename cpp/src/arrow/{ipc/ipc-memory-test.cc => io/io-memory-test.cc} (66%) create mode 100644 cpp/src/arrow/io/memory.cc create mode 100644 cpp/src/arrow/io/memory.h create mode 100644 cpp/src/arrow/io/test-common.h delete mode 100644 cpp/src/arrow/ipc/memory.cc delete mode 100644 cpp/src/arrow/ipc/memory.h create mode 100644 
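The bullet list above is the crux of the patch: seeking, reading, and writing become small capability interfaces (Seekable, Readable, Writeable) that concrete files compose, and consumers ask only for the weakest capability they need. A sketch against the interfaces.h rewrite shown further below; the function names are illustrative and assume the arrow namespace:

// Writing an IPC stream needs only sequential output...
Status WritePayload(io::OutputStream* sink, const uint8_t* data, int64_t nbytes) {
  return sink->Write(data, nbytes);
}

// ...while reading a file footer needs random access, hence
// ReadableFileInterface with GetSize and positional ReadAt.
Status ReadTail(io::ReadableFileInterface* source, int64_t nbytes,
    std::shared_ptr<Buffer>* out) {
  int64_t size = 0;
  Status s = source->GetSize(&size);
  if (!s.ok()) { return s; }
  return source->ReadAt(size - nbytes, nbytes, out);
}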
cpp/src/arrow/ipc/symbols.map create mode 100644 cpp/src/arrow/ipc/util.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a39a7521231..be95dabf318 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -626,12 +626,6 @@ set(ARROW_SRCS src/arrow/table.cc src/arrow/type.cc - # IPC / Shared memory library; to be turned into an optional component - src/arrow/ipc/adapter.cc - src/arrow/ipc/memory.cc - src/arrow/ipc/metadata.cc - src/arrow/ipc/metadata-internal.cc - src/arrow/types/construct.cc src/arrow/types/decimal.cc src/arrow/types/json.cc diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index b8c0e138afb..87e227ef80d 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -20,6 +20,7 @@ set(ARROW_IO_LINK_LIBS arrow_shared + dl ) if (ARROW_BOOST_USE_SHARED) @@ -37,6 +38,7 @@ set(ARROW_IO_TEST_LINK_LIBS ${ARROW_IO_PRIVATE_LINK_LIBS}) set(ARROW_IO_SRCS + memory.cc ) if(ARROW_HDFS) @@ -71,8 +73,8 @@ if(ARROW_HDFS) ${ARROW_HDFS_SRCS} ${ARROW_IO_SRCS}) - ADD_ARROW_TEST(hdfs-io-test) - ARROW_TEST_LINK_LIBRARIES(hdfs-io-test + ADD_ARROW_TEST(io-hdfs-test) + ARROW_TEST_LINK_LIBRARIES(io-hdfs-test ${ARROW_IO_TEST_LINK_LIBS}) endif() @@ -101,10 +103,15 @@ if (APPLE) INSTALL_NAME_DIR "@rpath") endif() +ADD_ARROW_TEST(io-memory-test) +ARROW_TEST_LINK_LIBRARIES(io-memory-test + ${ARROW_IO_TEST_LINK_LIBS}) + # Headers: top level install(FILES hdfs.h interfaces.h + memory.h DESTINATION include/arrow/io) install(TARGETS arrow_io diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 800c3edf4f3..a6b4b2f3846 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -142,6 +142,15 @@ Status HdfsReadableFile::ReadAt( return impl_->ReadAt(position, nbytes, bytes_read, buffer); } +Status HdfsReadableFile::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + return Status::NotImplemented("Not yet implemented"); +} + +bool HdfsReadableFile::supports_zero_copy() const { + return false; +} + Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { return impl_->Read(nbytes, bytes_read, buffer); } @@ -162,9 +171,9 @@ Status HdfsReadableFile::Tell(int64_t* position) { // File writing // Private implementation for writeable-only files -class HdfsWriteableFile::HdfsWriteableFileImpl : public HdfsAnyFileImpl { +class HdfsOutputStream::HdfsOutputStreamImpl : public HdfsAnyFileImpl { public: - HdfsWriteableFileImpl() {} + HdfsOutputStreamImpl() {} Status Close() { if (is_open_) { @@ -185,29 +194,29 @@ class HdfsWriteableFile::HdfsWriteableFileImpl : public HdfsAnyFileImpl { } }; -HdfsWriteableFile::HdfsWriteableFile() { - impl_.reset(new HdfsWriteableFileImpl()); +HdfsOutputStream::HdfsOutputStream() { + impl_.reset(new HdfsOutputStreamImpl()); } -HdfsWriteableFile::~HdfsWriteableFile() { +HdfsOutputStream::~HdfsOutputStream() { impl_->Close(); } -Status HdfsWriteableFile::Close() { +Status HdfsOutputStream::Close() { return impl_->Close(); } -Status HdfsWriteableFile::Write( +Status HdfsOutputStream::Write( const uint8_t* buffer, int64_t nbytes, int64_t* bytes_read) { return impl_->Write(buffer, nbytes, bytes_read); } -Status HdfsWriteableFile::Write(const uint8_t* buffer, int64_t nbytes) { +Status HdfsOutputStream::Write(const uint8_t* buffer, int64_t nbytes) { int64_t bytes_written_dummy = 0; return Write(buffer, nbytes, &bytes_written_dummy); } -Status HdfsWriteableFile::Tell(int64_t* position) { +Status HdfsOutputStream::Tell(int64_t* position) { return 
impl_->Tell(position); } @@ -347,7 +356,7 @@ class HdfsClient::HdfsClientImpl { Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { + std::shared_ptr* file) { int flags = O_WRONLY; if (append) flags |= O_APPEND; @@ -362,7 +371,7 @@ class HdfsClient::HdfsClientImpl { } // std::make_shared does not work with private ctors - *file = std::shared_ptr(new HdfsWriteableFile()); + *file = std::shared_ptr(new HdfsOutputStream()); (*file)->impl_->set_members(path, fs_, handle); return Status::OK(); @@ -440,13 +449,13 @@ Status HdfsClient::OpenReadable( Status HdfsClient::OpenWriteable(const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file) { + std::shared_ptr* file) { return impl_->OpenWriteable( path, append, buffer_size, replication, default_block_size, file); } Status HdfsClient::OpenWriteable( - const std::string& path, bool append, std::shared_ptr* file) { + const std::string& path, bool append, std::shared_ptr* file) { return OpenWriteable(path, append, 0, 0, 0, file); } diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index b6449fcb88a..39720cc17e4 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -29,13 +29,14 @@ namespace arrow { +class Buffer; class Status; namespace io { class HdfsClient; class HdfsReadableFile; -class HdfsWriteableFile; +class HdfsOutputStream; struct HdfsPathInfo { ObjectType::type kind; @@ -139,14 +140,14 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient { // @param default_block_size, 0 for default Status OpenWriteable(const std::string& path, bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - std::shared_ptr* file); + std::shared_ptr* file); Status OpenWriteable( - const std::string& path, bool append, std::shared_ptr* file); + const std::string& path, bool append, std::shared_ptr* file); private: friend class HdfsReadableFile; - friend class HdfsWriteableFile; + friend class HdfsOutputStream; class ARROW_NO_EXPORT HdfsClientImpl; std::unique_ptr impl_; @@ -155,7 +156,7 @@ class ARROW_EXPORT HdfsClient : public FileSystemClient { DISALLOW_COPY_AND_ASSIGN(HdfsClient); }; -class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { +class ARROW_EXPORT HdfsReadableFile : public ReadableFileInterface { public: ~HdfsReadableFile(); @@ -166,6 +167,10 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { Status ReadAt( int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + bool supports_zero_copy() const override; + Status Seek(int64_t position) override; Status Tell(int64_t* position) override; @@ -183,9 +188,11 @@ class ARROW_EXPORT HdfsReadableFile : public RandomAccessFile { DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); }; -class ARROW_EXPORT HdfsWriteableFile : public WriteableFile { +// Naming this file OutputStream because it does not support seeking (like the +// WriteableFile interface) +class ARROW_EXPORT HdfsOutputStream : public OutputStream { public: - ~HdfsWriteableFile(); + ~HdfsOutputStream(); Status Close() override; @@ -196,14 +203,14 @@ class ARROW_EXPORT HdfsWriteableFile : public WriteableFile { Status Tell(int64_t* position) override; private: - class ARROW_NO_EXPORT HdfsWriteableFileImpl; - std::unique_ptr impl_; + class ARROW_NO_EXPORT HdfsOutputStreamImpl; + 
std::unique_ptr impl_; friend class HdfsClient::HdfsClientImpl; - HdfsWriteableFile(); + HdfsOutputStream(); - DISALLOW_COPY_AND_ASSIGN(HdfsWriteableFile); + DISALLOW_COPY_AND_ASSIGN(HdfsOutputStream); }; Status ARROW_EXPORT ConnectLibHdfs(); diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index c2128525371..fa34b43b2c9 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -21,8 +21,11 @@ #include #include +#include "arrow/util/macros.h" + namespace arrow { +class Buffer; class Status; namespace io { @@ -40,30 +43,78 @@ class FileSystemClient { virtual ~FileSystemClient() {} }; -class FileBase { +class FileInterface { public: + virtual ~FileInterface() {} virtual Status Close() = 0; virtual Status Tell(int64_t* position) = 0; + + FileMode::type mode() const { return mode_; } + + protected: + FileInterface() {} + FileMode::type mode_; + + void set_mode(FileMode::type mode) { mode_ = mode; } + + private: + DISALLOW_COPY_AND_ASSIGN(FileInterface); }; -class ReadableFile : public FileBase { +class Seekable { public: - virtual Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) = 0; + virtual Status Seek(int64_t position) = 0; +}; - virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) = 0; +class Writeable { + public: + virtual Status Write(const uint8_t* data, int64_t nbytes) = 0; +}; - virtual Status GetSize(int64_t* size) = 0; +class Readable { + public: + virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; +}; + +class OutputStream : public FileInterface, public Writeable { + protected: + OutputStream() {} }; -class RandomAccessFile : public ReadableFile { +class InputStream : public FileInterface, public Readable { + protected: + InputStream() {} +}; + +class ReadableFileInterface : public InputStream, public Seekable { public: - virtual Status Seek(int64_t position) = 0; + virtual Status ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; + + virtual Status GetSize(int64_t* size) = 0; + + // Does not copy if not necessary + virtual Status ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) = 0; + + virtual bool supports_zero_copy() const = 0; + + protected: + ReadableFileInterface() { set_mode(FileMode::READ); } }; -class WriteableFile : public FileBase { +class WriteableFileInterface : public OutputStream, public Seekable { public: - virtual Status Write(const uint8_t* buffer, int64_t nbytes) = 0; + virtual Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) = 0; + + protected: + WriteableFileInterface() { set_mode(FileMode::READ); } +}; + +class ReadWriteFileInterface : public ReadableFileInterface, + public WriteableFileInterface { + protected: + ReadWriteFileInterface() { ReadableFileInterface::set_mode(FileMode::READWRITE); } }; } // namespace io diff --git a/cpp/src/arrow/io/hdfs-io-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc similarity index 99% rename from cpp/src/arrow/io/hdfs-io-test.cc rename to cpp/src/arrow/io/io-hdfs-test.cc index e48a28142fa..7901932dee6 100644 --- a/cpp/src/arrow/io/hdfs-io-test.cc +++ b/cpp/src/arrow/io/io-hdfs-test.cc @@ -49,7 +49,7 @@ class TestHdfsClient : public ::testing::Test { Status WriteDummyFile(const std::string& path, const uint8_t* buffer, int64_t size, bool append = false, int buffer_size = 0, int replication = 0, int default_block_size = 0) { - std::shared_ptr file; + std::shared_ptr file; RETURN_NOT_OK(client_->OpenWriteable( path, append, 
buffer_size, replication, default_block_size, &file)); diff --git a/cpp/src/arrow/ipc/ipc-memory-test.cc b/cpp/src/arrow/io/io-memory-test.cc similarity index 66% rename from cpp/src/arrow/ipc/ipc-memory-test.cc rename to cpp/src/arrow/io/io-memory-test.cc index a2dbd35728c..6de35dab59b 100644 --- a/cpp/src/arrow/ipc/ipc-memory-test.cc +++ b/cpp/src/arrow/io/io-memory-test.cc @@ -24,20 +24,20 @@ #include "gtest/gtest.h" -#include "arrow/ipc/memory.h" -#include "arrow/ipc/test-common.h" +#include "arrow/io/memory.h" +#include "arrow/io/test-common.h" namespace arrow { -namespace ipc { +namespace io { -class TestMemoryMappedSource : public ::testing::Test, public MemoryMapFixture { +class TestMemoryMappedFile : public ::testing::Test, public MemoryMapFixture { public: void TearDown() { MemoryMapFixture::TearDown(); } }; -TEST_F(TestMemoryMappedSource, InvalidUsages) {} +TEST_F(TestMemoryMappedFile, InvalidUsages) {} -TEST_F(TestMemoryMappedSource, WriteRead) { +TEST_F(TestMemoryMappedFile, WriteRead) { const int64_t buffer_size = 1024; std::vector buffer(buffer_size); @@ -48,14 +48,13 @@ TEST_F(TestMemoryMappedSource, WriteRead) { std::string path = "ipc-write-read-test"; CreateFile(path, reps * buffer_size); - std::shared_ptr result; - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &result)); + std::shared_ptr result; + ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READWRITE, &result)); int64_t position = 0; - std::shared_ptr out_buffer; for (int i = 0; i < reps; ++i) { - ASSERT_OK(result->Write(position, buffer.data(), buffer_size)); + ASSERT_OK(result->Write(buffer.data(), buffer_size)); ASSERT_OK(result->ReadAt(position, buffer_size, &out_buffer)); ASSERT_EQ(0, memcmp(out_buffer->data(), buffer.data(), buffer_size)); @@ -64,7 +63,7 @@ TEST_F(TestMemoryMappedSource, WriteRead) { } } -TEST_F(TestMemoryMappedSource, ReadOnly) { +TEST_F(TestMemoryMappedFile, ReadOnly) { const int64_t buffer_size = 1024; std::vector buffer(buffer_size); @@ -75,19 +74,18 @@ TEST_F(TestMemoryMappedSource, ReadOnly) { std::string path = "ipc-read-only-test"; CreateFile(path, reps * buffer_size); - std::shared_ptr rwmmap; - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_WRITE, &rwmmap)); + std::shared_ptr rwmmap; + ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READWRITE, &rwmmap)); int64_t position = 0; for (int i = 0; i < reps; ++i) { - ASSERT_OK(rwmmap->Write(position, buffer.data(), buffer_size)); - + ASSERT_OK(rwmmap->Write(buffer.data(), buffer_size)); position += buffer_size; } rwmmap->Close(); - std::shared_ptr rommap; - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_ONLY, &rommap)); + std::shared_ptr rommap; + ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READ, &rommap)); position = 0; std::shared_ptr out_buffer; @@ -100,7 +98,7 @@ TEST_F(TestMemoryMappedSource, ReadOnly) { rommap->Close(); } -TEST_F(TestMemoryMappedSource, InvalidMode) { +TEST_F(TestMemoryMappedFile, InvalidMode) { const int64_t buffer_size = 1024; std::vector buffer(buffer_size); @@ -109,19 +107,19 @@ TEST_F(TestMemoryMappedSource, InvalidMode) { std::string path = "ipc-invalid-mode-test"; CreateFile(path, buffer_size); - std::shared_ptr rommap; - ASSERT_OK(MemoryMappedSource::Open(path, MemorySource::READ_ONLY, &rommap)); + std::shared_ptr rommap; + ASSERT_OK(MemoryMappedFile::Open(path, FileMode::READ, &rommap)); - ASSERT_RAISES(IOError, rommap->Write(0, buffer.data(), buffer_size)); + ASSERT_RAISES(IOError, rommap->Write(buffer.data(), buffer_size)); } 
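Read together, the renamed memory-map tests above double as usage documentation for the new class. A condensed sketch of the round trip they exercise, assuming the arrow namespace; the path and payload are illustrative:

// Open read-write, write through the mapping, then read the bytes back.
Status MemoryMapRoundTrip(const std::string& path,
    const uint8_t* data, int64_t nbytes) {
  std::shared_ptr<io::MemoryMappedFile> file;
  Status s = io::MemoryMappedFile::Open(path, io::FileMode::READWRITE, &file);
  if (!s.ok()) { return s; }
  s = file->Write(data, nbytes);  // sequential write advances the position
  if (!s.ok()) { return s; }
  std::shared_ptr<Buffer> read_back;
  return file->ReadAt(0, nbytes, &read_back);  // positional read from the start
}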
-TEST_F(TestMemoryMappedSource, InvalidFile) { +TEST_F(TestMemoryMappedFile, InvalidFile) { std::string non_existent_path = "invalid-file-name-asfd"; - std::shared_ptr result; - ASSERT_RAISES(IOError, - MemoryMappedSource::Open(non_existent_path, MemorySource::READ_ONLY, &result)); + std::shared_ptr result; + ASSERT_RAISES( + IOError, MemoryMappedFile::Open(non_existent_path, FileMode::READ, &result)); } -} // namespace ipc +} // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index 003570d4fde..0b805abf94c 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -51,8 +51,7 @@ extern "C" { #include #include -#include // NOLINT -#include // NOLINT +#include // NOLINT #include "arrow/util/status.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc new file mode 100644 index 00000000000..1dd6c3a0230 --- /dev/null +++ b/cpp/src/arrow/io/memory.cc @@ -0,0 +1,262 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
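The implementation that follows is, at its heart, an ordinary POSIX mmap(2) of the whole file; here is a minimal sketch of just that step, with the mode handling and bookkeeping the real class adds stripped away (assumes a POSIX platform, as the code below does):

// Sketch: map an already-opened FILE* shared into the address space.
#include <sys/mman.h>
#include <cstdint>
#include <cstdio>

uint8_t* MapWholeFile(FILE* file, int64_t size, bool writable) {
  int prot = PROT_READ | (writable ? PROT_WRITE : 0);
  void* addr = mmap(nullptr, size, prot, MAP_SHARED, fileno(file), 0);
  // mmap signals failure with MAP_FAILED, not nullptr.
  return addr == MAP_FAILED ? nullptr : static_cast<uint8_t*>(addr);
}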
+ +#include "arrow/io/memory.h" + +#include // For memory-mapping + +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/io/interfaces.h" + +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace io { + +// Implement MemoryMappedFile + +class MemoryMappedFile::MemoryMappedFileImpl { + public: + MemoryMappedFileImpl() + : file_(nullptr), is_open_(false), is_writable_(false), data_(nullptr) {} + + ~MemoryMappedFileImpl() { + if (is_open_) { + munmap(data_, size_); + fclose(file_); + } + } + + Status Open(const std::string& path, FileMode::type mode) { + if (is_open_) { return Status::IOError("A file is already open"); } + + int prot_flags = PROT_READ; + + if (mode == FileMode::READWRITE) { + file_ = fopen(path.c_str(), "r+b"); + prot_flags |= PROT_WRITE; + is_writable_ = true; + } else { + file_ = fopen(path.c_str(), "rb"); + } + if (file_ == nullptr) { + std::stringstream ss; + ss << "Unable to open file, errno: " << errno; + return Status::IOError(ss.str()); + } + + fseek(file_, 0L, SEEK_END); + if (ferror(file_)) { return Status::IOError("Unable to seek to end of file"); } + size_ = ftell(file_); + + fseek(file_, 0L, SEEK_SET); + is_open_ = true; + position_ = 0; + + void* result = mmap(nullptr, size_, prot_flags, MAP_SHARED, fileno(file_), 0); + if (result == MAP_FAILED) { + std::stringstream ss; + ss << "Memory mapping file failed, errno: " << errno; + return Status::IOError(ss.str()); + } + data_ = reinterpret_cast(result); + + return Status::OK(); + } + + int64_t size() const { return size_; } + + Status Seek(int64_t position) { + if (position < 0 || position >= size_) { + return Status::Invalid("position is out of bounds"); + } + position_ = position; + return Status::OK(); + } + + int64_t position() { return position_; } + + void advance(int64_t nbytes) { position_ = std::min(size_, position_ + nbytes); } + + uint8_t* data() { return data_; } + + uint8_t* head() { return data_ + position_; } + + bool writable() { return is_writable_; } + + bool opened() { return is_open_; } + + private: + FILE* file_; + int64_t position_; + int64_t size_; + bool is_open_; + bool is_writable_; + + // The memory map + uint8_t* data_; +}; + +MemoryMappedFile::MemoryMappedFile(FileMode::type mode) { + ReadableFileInterface::set_mode(mode); +} + +Status MemoryMappedFile::Open(const std::string& path, FileMode::type mode, + std::shared_ptr* out) { + std::shared_ptr result(new MemoryMappedFile(mode)); + + result->impl_.reset(new MemoryMappedFileImpl()); + RETURN_NOT_OK(result->impl_->Open(path, mode)); + + *out = result; + return Status::OK(); +} + +Status MemoryMappedFile::GetSize(int64_t* size) { + *size = impl_->size(); + return Status::OK(); +} + +Status MemoryMappedFile::Tell(int64_t* position) { + *position = impl_->position(); + return Status::OK(); +} + +Status MemoryMappedFile::Seek(int64_t position) { + return impl_->Seek(position); +} + +Status MemoryMappedFile::Close() { + // munmap handled in pimpl dtor + return Status::OK(); +} + +Status MemoryMappedFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + nbytes = std::min(nbytes, impl_->size() - impl_->position()); + std::memcpy(out, impl_->head(), nbytes); + *bytes_read = nbytes; + impl_->advance(nbytes); + return Status::OK(); +} + +Status MemoryMappedFile::ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + RETURN_NOT_OK(impl_->Seek(position)); + return Read(nbytes, bytes_read, out); +} + +Status MemoryMappedFile::ReadAt( + 
int64_t position, int64_t nbytes, std::shared_ptr* out) { + nbytes = std::min(nbytes, impl_->size() - position); + RETURN_NOT_OK(impl_->Seek(position)); + *out = std::make_shared(impl_->head(), nbytes); + impl_->advance(nbytes); + return Status::OK(); +} + +bool MemoryMappedFile::supports_zero_copy() const { + return true; +} + +Status MemoryMappedFile::WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) { + if (!impl_->opened() || !impl_->writable()) { + return Status::IOError("Unable to write"); + } + + RETURN_NOT_OK(impl_->Seek(position)); + return WriteInternal(data, nbytes); +} + +Status MemoryMappedFile::Write(const uint8_t* data, int64_t nbytes) { + if (!impl_->opened() || !impl_->writable()) { + return Status::IOError("Unable to write"); + } + if (nbytes + impl_->position() > impl_->size()) { + return Status::Invalid("Cannot write past end of memory map"); + } + + return WriteInternal(data, nbytes); +} + +Status MemoryMappedFile::WriteInternal(const uint8_t* data, int64_t nbytes) { + memcpy(impl_->head(), data, nbytes); + impl_->advance(nbytes); + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// In-memory buffer reader + +Status BufferReader::Close() { + // no-op + return Status::OK(); +} + +Status BufferReader::Tell(int64_t* position) { + *position = position_; + return Status::OK(); +} + +Status BufferReader::ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, bytes_read, buffer); +} + +Status BufferReader::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + int64_t size = std::min(nbytes, buffer_size_ - position_); + *out = std::make_shared(buffer_ + position, size); + position_ += nbytes; + return Status::OK(); +} + +bool BufferReader::supports_zero_copy() const { + return true; +} + +Status BufferReader::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { + memcpy(buffer, buffer_ + position_, nbytes); + *bytes_read = std::min(nbytes, buffer_size_ - position_); + position_ += *bytes_read; + return Status::OK(); +} + +Status BufferReader::GetSize(int64_t* size) { + *size = buffer_size_; + return Status::OK(); +} + +Status BufferReader::Seek(int64_t position) { + if (position < 0 || position >= buffer_size_) { + return Status::IOError("position out of bounds"); + } + + position_ = position; + return Status::OK(); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h new file mode 100644 index 00000000000..6fe47c3b515 --- /dev/null +++ b/cpp/src/arrow/io/memory.h @@ -0,0 +1,130 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
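The BufferReader just implemented is the smallest possible ReadableFileInterface: a raw pointer plus a position. A usage sketch, with hypothetical data and again not taken from the patch itself, showing the zero-copy ReadAt overload:

// Sketch: zero-copy slicing of an in-memory buffer through BufferReader.
#include <memory>
#include <string>
#include "arrow/io/memory.h"
#include "arrow/util/buffer.h"
#include "arrow/util/status.h"

arrow::Status SliceDemo() {
  std::string data = "0123456789";
  arrow::io::BufferReader reader(
      reinterpret_cast<const uint8_t*>(data.data()), data.size());

  // The Buffer-returning ReadAt wraps the underlying memory directly, so
  // `slice` stays valid only as long as `data` does.
  std::shared_ptr<arrow::Buffer> slice;
  RETURN_NOT_OK(reader.ReadAt(3, 4, &slice));  // bytes "3456"
  return reader.Close();
}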
+ +// Public API for different memory sharing / IO mechanisms + +#ifndef ARROW_IO_MEMORY_H +#define ARROW_IO_MEMORY_H + +#include +#include +#include + +#include "arrow/io/interfaces.h" + +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MutableBuffer; +class Status; + +namespace io { + +// An output stream that writes to a MutableBuffer, such as one obtained from a +// memory map +// +// TODO(wesm): Implement this class +class ARROW_EXPORT BufferOutputStream : public OutputStream { + public: + explicit BufferOutputStream(const std::shared_ptr& buffer) + : buffer_(buffer) {} + + // Implement the OutputStream interface + Status Close() override; + Status Tell(int64_t* position) override; + Status Write(const uint8_t* data, int64_t length) override; + + // Returns the number of bytes remaining in the buffer + int64_t bytes_remaining() const; + + private: + std::shared_ptr buffer_; + int64_t capacity_; + int64_t position_; +}; + +// A memory source that uses memory-mapped files for memory interactions +class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { + public: + static Status Open(const std::string& path, FileMode::type mode, + std::shared_ptr* out); + + Status Close() override; + + Status Tell(int64_t* position) override; + + Status Seek(int64_t position) override; + + // Required by ReadableFileInterface, copies memory into out + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + + Status ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + + // Read into a buffer, zero copy if possible + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + bool supports_zero_copy() const override; + + Status Write(const uint8_t* data, int64_t nbytes) override; + + Status WriteAt(int64_t position, const uint8_t* data, int64_t nbytes) override; + + // @return: the size in bytes of the memory source + Status GetSize(int64_t* size) override; + + private: + explicit MemoryMappedFile(FileMode::type mode); + + Status WriteInternal(const uint8_t* data, int64_t nbytes); + + // Hide the internal details of this class for now + class MemoryMappedFileImpl; + std::unique_ptr impl_; +}; + +class ARROW_EXPORT BufferReader : public ReadableFileInterface { + public: + BufferReader(const uint8_t* buffer, int buffer_size) + : buffer_(buffer), buffer_size_(buffer_size), position_(0) {} + + Status Close() override; + Status Tell(int64_t* position) override; + + Status ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status GetSize(int64_t* size) override; + Status Seek(int64_t position) override; + + bool supports_zero_copy() const override; + + private: + const uint8_t* buffer_; + int buffer_size_; + int64_t position_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_MEMORY_H diff --git a/cpp/src/arrow/io/test-common.h b/cpp/src/arrow/io/test-common.h new file mode 100644 index 00000000000..1954d479e39 --- /dev/null +++ b/cpp/src/arrow/io/test-common.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IO_TEST_COMMON_H +#define ARROW_IO_TEST_COMMON_H + +#include +#include +#include +#include +#include + +#include "arrow/io/memory.h" +#include "arrow/test-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" + +namespace arrow { +namespace io { + +class MemoryMapFixture { + public: + void TearDown() { + for (auto path : tmp_files_) { + std::remove(path.c_str()); + } + } + + void CreateFile(const std::string path, int64_t size) { + FILE* file = fopen(path.c_str(), "w"); + if (file != nullptr) { tmp_files_.push_back(path); } + ftruncate(fileno(file), size); + fclose(file); + } + + Status InitMemoryMap( + int64_t size, const std::string& path, std::shared_ptr* mmap) { + CreateFile(path, size); + return MemoryMappedFile::Open(path, FileMode::READWRITE, mmap); + } + + private: + std::vector tmp_files_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_TEST_COMMON_H diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 82634169ed9..e5553a63581 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -19,16 +19,50 @@ # arrow_ipc ####################################### -# Headers: top level -install(FILES - adapter.h - metadata.h - memory.h - DESTINATION include/arrow/ipc) +set(ARROW_IPC_LINK_LIBS + arrow_io + arrow_shared +) + +set(ARROW_IPC_PRIVATE_LINK_LIBS + ) + +set(ARROW_IPC_TEST_LINK_LIBS + arrow_ipc + ${ARROW_IPC_PRIVATE_LINK_LIBS}) + +set(ARROW_IPC_SRCS + adapter.cc + metadata.cc + metadata-internal.cc +) + +# TODO(wesm): SHARED and STATIC targets +add_library(arrow_ipc SHARED + ${ARROW_IPC_SRCS} +) +target_link_libraries(arrow_ipc + LINK_PUBLIC ${ARROW_IPC_LINK_LIBS} + LINK_PRIVATE ${ARROW_IPC_PRIVATE_LINK_LIBS}) + +if(NOT APPLE) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. 
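Stepping back from the build file for a moment: the MemoryMapFixture moved into arrow/io above is what lets both the io and ipc test suites create throwaway maps in one call. A sketch of a test written against it follows; the test and path names are hypothetical:

// Sketch: a gtest case built on io::MemoryMapFixture.
#include <memory>
#include "gtest/gtest.h"
#include "arrow/io/test-common.h"

class ScratchMmapTest : public ::testing::Test,
                        public arrow::io::MemoryMapFixture {
 public:
  // TearDown() unlinks every file created through the fixture.
  void TearDown() { MemoryMapFixture::TearDown(); }
};

TEST_F(ScratchMmapTest, OpensReadWrite) {
  std::shared_ptr<arrow::io::MemoryMappedFile> mmap;
  // InitMemoryMap() creates and truncates the file, then opens it READWRITE.
  ASSERT_OK(InitMemoryMap(1 << 12, "scratch-mmap-test", &mmap));
  ASSERT_TRUE(mmap->supports_zero_copy());
}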
+ set(ARROW_IPC_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/symbols.map") +endif() + +SET_TARGET_PROPERTIES(arrow_ipc PROPERTIES + LINKER_LANGUAGE CXX + LINK_FLAGS "${ARROW_IPC_LINK_FLAGS}") ADD_ARROW_TEST(ipc-adapter-test) -ADD_ARROW_TEST(ipc-memory-test) +ARROW_TEST_LINK_LIBRARIES(ipc-adapter-test + ${ARROW_IPC_TEST_LINK_LIBS}) + ADD_ARROW_TEST(ipc-metadata-test) +ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test + ${ARROW_IPC_TEST_LINK_LIBS}) # make clean will delete the generated file set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) @@ -49,3 +83,13 @@ add_custom_command( add_custom_target(metadata_fbs DEPENDS ${FBS_OUTPUT_FILES}) add_dependencies(arrow_objlib metadata_fbs) + +# Headers: top level +install(FILES + adapter.h + metadata.h + DESTINATION include/arrow/ipc) + +install(TARGETS arrow_ipc + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 40d372bbd35..0e101c89303 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -24,9 +24,11 @@ #include "arrow/array.h" #include "arrow/ipc/Message_generated.h" -#include "arrow/ipc/memory.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/ipc/metadata.h" +#include "arrow/ipc/util.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" #include "arrow/schema.h" #include "arrow/table.h" #include "arrow/type.h" @@ -144,10 +146,15 @@ class RowBatchWriter { return Status::OK(); } - Status Write(MemorySource* dst, int64_t position, int64_t* data_header_offset) { + Status Write(io::OutputStream* dst, int64_t* data_header_offset) { // Write out all the buffers contiguously and compute the total size of the // memory payload int64_t offset = 0; + + // Get the starting position + int64_t position; + RETURN_NOT_OK(dst->Tell(&position)); + for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); int64_t size = 0; @@ -171,7 +178,7 @@ class RowBatchWriter { buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size)); if (size > 0) { - RETURN_NOT_OK(dst->Write(position + offset, buffer->data(), size)); + RETURN_NOT_OK(dst->Write(buffer->data(), size)); offset += size; } } @@ -180,7 +187,7 @@ class RowBatchWriter { // memory, the data header can be converted to a flatbuffer and written out // // Note: The memory written here is prefixed by the size of the flatbuffer - // itself as an int32_t. On reading from a MemorySource, you will have to + // itself as an int32_t. 
On reading from a input, you will have to // determine the data header size then request a buffer such that you can // construct the flatbuffer data accessor object (see arrow::ipc::Message) std::shared_ptr data_header; @@ -188,8 +195,7 @@ class RowBatchWriter { batch_->num_rows(), offset, field_nodes_, buffer_meta_, &data_header)); // Write the data header at the end - RETURN_NOT_OK( - dst->Write(position + offset, data_header->data(), data_header->size())); + RETURN_NOT_OK(dst->Write(data_header->data(), data_header->size())); *data_header_offset = position + offset; return Status::OK(); @@ -199,9 +205,9 @@ class RowBatchWriter { Status GetTotalSize(int64_t* size) { // emulates the behavior of Write without actually writing int64_t data_header_offset; - MockMemorySource source(0); - RETURN_NOT_OK(Write(&source, 0, &data_header_offset)); - *size = source.GetExtentBytesWritten(); + MockOutputStream dst; + RETURN_NOT_OK(Write(&dst, &data_header_offset)); + *size = dst.GetExtentBytesWritten(); return Status::OK(); } @@ -214,12 +220,12 @@ class RowBatchWriter { int max_recursion_depth_; }; -Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, - int64_t* header_offset, int max_recursion_depth) { +Status WriteRowBatch(io::OutputStream* dst, const RowBatch* batch, int64_t* header_offset, + int max_recursion_depth) { DCHECK_GT(max_recursion_depth, 0); RowBatchWriter serializer(batch, max_recursion_depth); RETURN_NOT_OK(serializer.AssemblePayload()); - return serializer.Write(dst, position, header_offset); + return serializer.Write(dst, header_offset); } Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { @@ -234,11 +240,11 @@ Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { static constexpr int64_t INIT_METADATA_SIZE = 4096; -class RowBatchReader::Impl { +class RowBatchReader::RowBatchReaderImpl { public: - Impl(MemorySource* source, const std::shared_ptr& metadata, - int max_recursion_depth) - : source_(source), metadata_(metadata), max_recursion_depth_(max_recursion_depth) { + RowBatchReaderImpl(io::ReadableFileInterface* file, + const std::shared_ptr& metadata, int max_recursion_depth) + : file_(file), metadata_(metadata), max_recursion_depth_(max_recursion_depth) { num_buffers_ = metadata->num_buffers(); num_flattened_fields_ = metadata->num_fields(); } @@ -339,10 +345,11 @@ class RowBatchReader::Impl { Status GetBuffer(int buffer_index, std::shared_ptr* out) { BufferMetadata metadata = metadata_->buffer(buffer_index); RETURN_NOT_OK(CheckMultipleOf64(metadata.length)); - return source_->ReadAt(metadata.offset, metadata.length, out); + return file_->ReadAt(metadata.offset, metadata.length, out); } - MemorySource* source_; + private: + io::ReadableFileInterface* file_; std::shared_ptr metadata_; int field_index_; @@ -352,22 +359,22 @@ class RowBatchReader::Impl { int num_flattened_fields_; }; -Status RowBatchReader::Open( - MemorySource* source, int64_t position, std::shared_ptr* out) { - return Open(source, position, kMaxIpcRecursionDepth, out); +Status RowBatchReader::Open(io::ReadableFileInterface* file, int64_t position, + std::shared_ptr* out) { + return Open(file, position, kMaxIpcRecursionDepth, out); } -Status RowBatchReader::Open(MemorySource* source, int64_t position, +Status RowBatchReader::Open(io::ReadableFileInterface* file, int64_t position, int max_recursion_depth, std::shared_ptr* out) { std::shared_ptr metadata; - RETURN_NOT_OK(source->ReadAt(position, INIT_METADATA_SIZE, &metadata)); + RETURN_NOT_OK(file->ReadAt(position, 
INIT_METADATA_SIZE, &metadata)); int32_t metadata_size = *reinterpret_cast(metadata->data()); - // We may not need to call source->ReadAt again + // We may not need to call ReadAt again if (metadata_size > static_cast(INIT_METADATA_SIZE - sizeof(int32_t))) { // We don't have enough data, read the indicated metadata size. - RETURN_NOT_OK(source->ReadAt(position + sizeof(int32_t), metadata_size, &metadata)); + RETURN_NOT_OK(file->ReadAt(position + sizeof(int32_t), metadata_size, &metadata)); } // TODO(wesm): buffer slicing here would be better in case ReadAt returns @@ -383,14 +390,14 @@ Status RowBatchReader::Open(MemorySource* source, int64_t position, std::shared_ptr batch_meta = message->GetRecordBatch(); std::shared_ptr result(new RowBatchReader()); - result->impl_.reset(new Impl(source, batch_meta, max_recursion_depth)); + result->impl_.reset(new RowBatchReaderImpl(file, batch_meta, max_recursion_depth)); *out = result; return Status::OK(); } // Here the explicit destructor is required for compilers to be aware of -// the complete information of RowBatchReader::Impl class +// the complete information of RowBatchReader::RowBatchReaderImpl class RowBatchReader::~RowBatchReader() {} Status RowBatchReader::GetRowBatch( diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index 6231af66aa1..215b46f8f65 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -33,9 +33,15 @@ class RowBatch; class Schema; class Status; +namespace io { + +class ReadableFileInterface; +class OutputStream; + +} // namespace io + namespace ipc { -class MemorySource; class RecordBatchMessage; // ---------------------------------------------------------------------- @@ -43,22 +49,21 @@ class RecordBatchMessage; // We have trouble decoding flatbuffers if the size i > 70, so 64 is a nice round number // TODO(emkornfield) investigate this more constexpr int kMaxIpcRecursionDepth = 64; -// Write the RowBatch (collection of equal-length Arrow arrays) to the memory -// source at the indicated position + +// Write the RowBatch (collection of equal-length Arrow arrays) to the output +// stream // -// First, each of the memory buffers are written out end-to-end in starting at -// the indicated position. 
+// First, each of the memory buffers are written out end-to-end // // Then, this function writes the batch metadata as a flatbuffer (see // format/Message.fbs -- the RecordBatch message type) like so: // // // -// Finally, the memory offset to the start of the metadata / data header is -// returned in an out-variable -ARROW_EXPORT Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, - int64_t position, int64_t* header_offset, - int max_recursion_depth = kMaxIpcRecursionDepth); +// Finally, the absolute offset (relative to the start of the output stream) to +// the start of the metadata / data header is returned in an out-variable +ARROW_EXPORT Status WriteRowBatch(io::OutputStream* dst, const RowBatch* batch, + int64_t* header_offset, int max_recursion_depth = kMaxIpcRecursionDepth); // int64_t GetRowBatchMetadata(const RowBatch* batch); @@ -68,16 +73,16 @@ ARROW_EXPORT Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, ARROW_EXPORT Status GetRowBatchSize(const RowBatch* batch, int64_t* size); // ---------------------------------------------------------------------- -// "Read" path; does not copy data if the MemorySource does not +// "Read" path; does not copy data if the input supports zero copy reads class ARROW_EXPORT RowBatchReader { public: - static Status Open( - MemorySource* source, int64_t position, std::shared_ptr* out); - - static Status Open(MemorySource* source, int64_t position, int max_recursion_depth, + static Status Open(io::ReadableFileInterface* file, int64_t position, std::shared_ptr* out); + static Status Open(io::ReadableFileInterface* file, int64_t position, + int max_recursion_depth, std::shared_ptr* out); + virtual ~RowBatchReader(); // Reassemble the row batch. A Schema is required to be able to construct the @@ -86,8 +91,8 @@ class ARROW_EXPORT RowBatchReader { const std::shared_ptr& schema, std::shared_ptr* out); private: - class Impl; - std::unique_ptr impl_; + class RowBatchReaderImpl; + std::unique_ptr impl_; }; } // namespace ipc diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 6740e0fc5ac..ca4d0152b90 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -24,9 +24,11 @@ #include "gtest/gtest.h" +#include "arrow/io/memory.h" +#include "arrow/io/test-common.h" #include "arrow/ipc/adapter.h" -#include "arrow/ipc/memory.h" #include "arrow/ipc/test-common.h" +#include "arrow/ipc/util.h" #include "arrow/test-util.h" #include "arrow/types/list.h" @@ -49,17 +51,18 @@ const auto LIST_LIST_INT32 = std::make_shared(LIST_INT32); typedef Status MakeRowBatch(std::shared_ptr* out); class TestWriteRowBatch : public ::testing::TestWithParam, - public MemoryMapFixture { + public io::MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } - void TearDown() { MemoryMapFixture::TearDown(); } + void TearDown() { io::MemoryMapFixture::TearDown(); } Status RoundTripHelper(const RowBatch& batch, int memory_map_size, std::shared_ptr* batch_result) { std::string path = "test-write-row-batch"; - MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); + io::MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); int64_t header_location; - RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); + + RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, &header_location)); std::shared_ptr reader; RETURN_NOT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); @@ -69,7 +72,7 @@ class TestWriteRowBatch : public 
::testing::TestWithParam, } protected: - std::shared_ptr mmap_; + std::shared_ptr mmap_; MemoryPool* pool_; }; @@ -276,12 +279,12 @@ INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, &MakeStringTypesRowBatch, &MakeStruct)); void TestGetRowBatchSize(std::shared_ptr batch) { - MockMemorySource mock_source(1 << 16); + ipc::MockOutputStream mock; int64_t mock_header_location = -1; int64_t size = -1; - ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location)); + ASSERT_OK(WriteRowBatch(&mock, batch.get(), &mock_header_location)); ASSERT_OK(GetRowBatchSize(batch.get(), &size)); - ASSERT_EQ(mock_source.GetExtentBytesWritten(), size); + ASSERT_EQ(mock.GetExtentBytesWritten(), size); } TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { @@ -303,10 +306,10 @@ TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { TestGetRowBatchSize(batch); } -class RecursionLimits : public ::testing::Test, public MemoryMapFixture { +class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } - void TearDown() { MemoryMapFixture::TearDown(); } + void TearDown() { io::MemoryMapFixture::TearDown(); } Status WriteToMmap(int recursion_level, bool override_level, int64_t* header_out = nullptr, std::shared_ptr* schema_out = nullptr) { @@ -329,19 +332,19 @@ class RecursionLimits : public ::testing::Test, public MemoryMapFixture { std::string path = "test-write-past-max-recursion"; const int memory_map_size = 1 << 16; - MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); + io::MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); int64_t header_location; int64_t* header_out_param = header_out == nullptr ? &header_location : header_out; if (override_level) { return WriteRowBatch( - mmap_.get(), batch.get(), 0, header_out_param, recursion_level + 1); + mmap_.get(), batch.get(), header_out_param, recursion_level + 1); } else { - return WriteRowBatch(mmap_.get(), batch.get(), 0, header_out_param); + return WriteRowBatch(mmap_.get(), batch.get(), header_out_param); } } protected: - std::shared_ptr mmap_; + std::shared_ptr mmap_; MemoryPool* pool_; }; diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc deleted file mode 100644 index a6c56d64f4a..00000000000 --- a/cpp/src/arrow/ipc/memory.cc +++ /dev/null @@ -1,182 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
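The cycle RoundTripHelper performs above, spelled out against the new interfaces — a sketch under the assumption that `batch` and a large-enough `file` already exist; a MemoryMappedFile satisfies both sides because ReadWriteFileInterface inherits from OutputStream and ReadableFileInterface:

// Sketch: write a RowBatch to a stream, then reassemble it by offset.
#include <memory>
#include "arrow/io/memory.h"
#include "arrow/ipc/adapter.h"
#include "arrow/table.h"
#include "arrow/util/status.h"

arrow::Status RoundTripBatch(const arrow::RowBatch& batch,
                             arrow::io::MemoryMappedFile* file,
                             std::shared_ptr<arrow::RowBatch>* result) {
  // WriteRowBatch() now calls Tell() itself; header_location comes back
  // as an absolute offset within the stream.
  int64_t header_location;
  RETURN_NOT_OK(arrow::ipc::WriteRowBatch(file, &batch, &header_location));

  std::shared_ptr<arrow::ipc::RowBatchReader> reader;
  RETURN_NOT_OK(
      arrow::ipc::RowBatchReader::Open(file, header_location, &reader));
  // The schema is not serialized with the batch, so it must be supplied.
  return reader->GetRowBatch(batch.schema(), result);
}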
- -#include "arrow/ipc/memory.h" - -#include // For memory-mapping - -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/util/buffer.h" -#include "arrow/util/status.h" - -namespace arrow { -namespace ipc { - -MemorySource::MemorySource(AccessMode access_mode) : access_mode_(access_mode) {} - -MemorySource::~MemorySource() {} - -// Implement MemoryMappedSource - -class MemoryMappedSource::Impl { - public: - Impl() : file_(nullptr), is_open_(false), is_writable_(false), data_(nullptr) {} - - ~Impl() { - if (is_open_) { - munmap(data_, size_); - fclose(file_); - } - } - - Status Open(const std::string& path, MemorySource::AccessMode mode) { - if (is_open_) { return Status::IOError("A file is already open"); } - - int prot_flags = PROT_READ; - - if (mode == MemorySource::READ_WRITE) { - file_ = fopen(path.c_str(), "r+b"); - prot_flags |= PROT_WRITE; - is_writable_ = true; - } else { - file_ = fopen(path.c_str(), "rb"); - } - if (file_ == nullptr) { - std::stringstream ss; - ss << "Unable to open file, errno: " << errno; - return Status::IOError(ss.str()); - } - - fseek(file_, 0L, SEEK_END); - if (ferror(file_)) { return Status::IOError("Unable to seek to end of file"); } - size_ = ftell(file_); - - fseek(file_, 0L, SEEK_SET); - is_open_ = true; - - void* result = mmap(nullptr, size_, prot_flags, MAP_SHARED, fileno(file_), 0); - if (result == MAP_FAILED) { - std::stringstream ss; - ss << "Memory mapping file failed, errno: " << errno; - return Status::IOError(ss.str()); - } - data_ = reinterpret_cast(result); - - return Status::OK(); - } - - int64_t size() const { return size_; } - - uint8_t* data() { return data_; } - - bool writable() { return is_writable_; } - - bool opened() { return is_open_; } - - private: - FILE* file_; - int64_t size_; - bool is_open_; - bool is_writable_; - - // The memory map - uint8_t* data_; -}; - -MemoryMappedSource::MemoryMappedSource(AccessMode access_mode) - : MemorySource(access_mode) {} - -Status MemoryMappedSource::Open(const std::string& path, AccessMode access_mode, - std::shared_ptr* out) { - std::shared_ptr result(new MemoryMappedSource(access_mode)); - - result->impl_.reset(new Impl()); - RETURN_NOT_OK(result->impl_->Open(path, access_mode)); - - *out = result; - return Status::OK(); -} - -int64_t MemoryMappedSource::Size() const { - return impl_->size(); -} - -Status MemoryMappedSource::Close() { - // munmap handled in ::Impl dtor - return Status::OK(); -} - -Status MemoryMappedSource::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { - if (position < 0 || position >= impl_->size()) { - return Status::Invalid("position is out of bounds"); - } - - nbytes = std::min(nbytes, impl_->size() - position); - *out = std::make_shared(impl_->data() + position, nbytes); - return Status::OK(); -} - -Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { - if (!impl_->opened() || !impl_->writable()) { - return Status::IOError("Unable to write"); - } - if (position < 0 || position >= impl_->size()) { - return Status::Invalid("position is out of bounds"); - } - - // TODO(wesm): verify we are not writing past the end of the buffer - uint8_t* dst = impl_->data() + position; - memcpy(dst, data, nbytes); - - return Status::OK(); -} - -MockMemorySource::MockMemorySource(int64_t size) - : size_(size), extent_bytes_written_(0) {} - -Status MockMemorySource::Close() { - return Status::OK(); -} - -Status MockMemorySource::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { 
- return Status::OK(); -} - -Status MockMemorySource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { - extent_bytes_written_ = std::max(extent_bytes_written_, position + nbytes); - return Status::OK(); -} - -int64_t MockMemorySource::Size() const { - return size_; -} - -int64_t MockMemorySource::GetExtentBytesWritten() const { - return extent_bytes_written_; -} - -} // namespace ipc -} // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h deleted file mode 100644 index 377401d85c0..00000000000 --- a/cpp/src/arrow/ipc/memory.h +++ /dev/null @@ -1,150 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Public API for different interprocess memory sharing mechanisms - -#ifndef ARROW_IPC_MEMORY_H -#define ARROW_IPC_MEMORY_H - -#include -#include -#include - -#include "arrow/util/macros.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Buffer; -class MutableBuffer; -class Status; - -namespace ipc { - -// Abstract output stream -class OutputStream { - public: - virtual ~OutputStream() {} - // Close the output stream - virtual Status Close() = 0; - - // The current position in the output stream - virtual int64_t Tell() const = 0; - - // Write bytes to the stream - virtual Status Write(const uint8_t* data, int64_t length) = 0; -}; - -// An output stream that writes to a MutableBuffer, such as one obtained from a -// memory map -class BufferOutputStream : public OutputStream { - public: - explicit BufferOutputStream(const std::shared_ptr& buffer) - : buffer_(buffer) {} - - // Implement the OutputStream interface - Status Close() override; - int64_t Tell() const override; - Status Write(const uint8_t* data, int64_t length) override; - - // Returns the number of bytes remaining in the buffer - int64_t bytes_remaining() const; - - private: - std::shared_ptr buffer_; - int64_t capacity_; - int64_t position_; -}; - -class ARROW_EXPORT MemorySource { - public: - // Indicates the access permissions of the memory source - enum AccessMode { READ_ONLY, READ_WRITE }; - - virtual ~MemorySource(); - - // Retrieve a buffer of memory from the source of the indicates size and at - // the indicated location - // @returns: arrow::Status indicating success / failure. 
The buffer is set - // into the *out argument - virtual Status ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) = 0; - - virtual Status Close() = 0; - - virtual Status Write(int64_t position, const uint8_t* data, int64_t nbytes) = 0; - - // @return: the size in bytes of the memory source - virtual int64_t Size() const = 0; - - protected: - explicit MemorySource(AccessMode access_mode = AccessMode::READ_WRITE); - - AccessMode access_mode_; - - private: - DISALLOW_COPY_AND_ASSIGN(MemorySource); -}; - -// A memory source that uses memory-mapped files for memory interactions -class ARROW_EXPORT MemoryMappedSource : public MemorySource { - public: - static Status Open(const std::string& path, AccessMode access_mode, - std::shared_ptr* out); - - Status Close() override; - - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; - - // @return: the size in bytes of the memory source - int64_t Size() const override; - - private: - explicit MemoryMappedSource(AccessMode access_mode); - // Hide the internal details of this class for now - class Impl; - std::unique_ptr impl_; -}; - -// A MemorySource that tracks the size of allocations from a memory source -class MockMemorySource : public MemorySource { - public: - explicit MockMemorySource(int64_t size); - - Status Close() override; - - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - - Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; - - int64_t Size() const override; - - // @return: the smallest number of bytes containing the modified region of the - // MockMemorySource - int64_t GetExtentBytesWritten() const; - - private: - int64_t size_; - int64_t extent_bytes_written_; -}; - -} // namespace ipc -} // namespace arrow - -#endif // ARROW_IPC_MEMORY_H diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 8cc902c2967..05e9c7ad4d3 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -220,9 +220,8 @@ static Status FieldToFlatbuffer( auto fb_children = fbb.CreateVector(children); // TODO: produce the list of VectorTypes - *offset = flatbuf::CreateField( - fbb, fb_name, field->nullable, type_enum, type_data, field->dictionary, - fb_children); + *offset = flatbuf::CreateField(fbb, fb_name, field->nullable, type_enum, type_data, + field->dictionary, fb_children); return Status::OK(); } @@ -295,8 +294,8 @@ Status WriteDataHeader(int32_t length, int64_t body_length, } Status MessageBuilder::Finish() { - auto message = flatbuf::CreateMessage(fbb_, kMetadataVersion, - header_type_, header_, body_length_); + auto message = + flatbuf::CreateMessage(fbb_, kMetadataVersion, header_type_, header_, body_length_); fbb_.Finish(message); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index db9a83f6a8d..d38df840ba0 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -38,7 +38,7 @@ class Status; namespace ipc { static constexpr flatbuf::MetadataVersion kMetadataVersion = - flatbuf::MetadataVersion_V1_SNAPSHOT; + flatbuf::MetadataVersion_V1_SNAPSHOT; Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* out); diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h index 838a4a676ea..d5ec53317e6 100644 --- a/cpp/src/arrow/ipc/metadata.h +++ 
b/cpp/src/arrow/ipc/metadata.h @@ -23,6 +23,8 @@ #include #include +#include "arrow/util/visibility.h" + namespace arrow { class Buffer; @@ -36,6 +38,7 @@ namespace ipc { // Message read/write APIs // Serialize arrow::Schema as a Flatbuffer +ARROW_EXPORT Status WriteSchema(const Schema* schema, std::shared_ptr* out); //---------------------------------------------------------------------- @@ -47,7 +50,7 @@ Status WriteSchema(const Schema* schema, std::shared_ptr* out); class Message; // Container for serialized Schema metadata contained in an IPC message -class SchemaMessage { +class ARROW_EXPORT SchemaMessage { public: // Accepts an opaque flatbuffer pointer SchemaMessage(const std::shared_ptr& message, const void* schema); @@ -82,7 +85,7 @@ struct BufferMetadata { }; // Container for serialized record batch metadata contained in an IPC message -class RecordBatchMessage { +class ARROW_EXPORT RecordBatchMessage { public: // Accepts an opaque flatbuffer pointer RecordBatchMessage(const std::shared_ptr& message, const void* batch_meta); @@ -102,13 +105,13 @@ class RecordBatchMessage { std::unique_ptr impl_; }; -class DictionaryBatchMessage { +class ARROW_EXPORT DictionaryBatchMessage { public: int64_t id() const; std::unique_ptr data() const; }; -class Message : public std::enable_shared_from_this { +class ARROW_EXPORT Message : public std::enable_shared_from_this { public: enum Type { NONE, SCHEMA, DICTIONARY_BATCH, RECORD_BATCH }; diff --git a/cpp/src/arrow/ipc/symbols.map b/cpp/src/arrow/ipc/symbols.map new file mode 100644 index 00000000000..b4ad98cd7f2 --- /dev/null +++ b/cpp/src/arrow/ipc/symbols.map @@ -0,0 +1,18 @@ +{ + # Symbols marked as 'local' are not exported by the DSO and thus may not + # be used by client applications. + local: + # devtoolset / static-libstdc++ symbols + __cxa_*; + + extern "C++" { + # boost + boost::*; + + # devtoolset or -static-libstdc++ - the Red Hat devtoolset statically + # links c++11 symbols into binaries so that the result may be executed on + # a system with an older libstdc++ which doesn't include the necessary + # c++11 symbols. + std::*; + }; +}; diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index e7dbb84d790..f6582fc883b 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -34,31 +34,6 @@ namespace arrow { namespace ipc { -class MemoryMapFixture { - public: - void TearDown() { - for (auto path : tmp_files_) { - std::remove(path.c_str()); - } - } - - void CreateFile(const std::string path, int64_t size) { - FILE* file = fopen(path.c_str(), "w"); - if (file != nullptr) { tmp_files_.push_back(path); } - ftruncate(fileno(file), size); - fclose(file); - } - - Status InitMemoryMap( - int64_t size, const std::string& path, std::shared_ptr* mmap) { - CreateFile(path, size); - return MemoryMappedSource::Open(path, MemorySource::READ_WRITE, mmap); - } - - private: - std::vector tmp_files_; -}; - Status MakeRandomInt32Array( int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { std::shared_ptr data; diff --git a/cpp/src/arrow/ipc/util.h b/cpp/src/arrow/ipc/util.h new file mode 100644 index 00000000000..3f4001b21a9 --- /dev/null +++ b/cpp/src/arrow/ipc/util.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_IPC_UTIL_H +#define ARROW_IPC_UTIL_H + +#include + +#include "arrow/array.h" +#include "arrow/io/interfaces.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +// A helper class to tracks the size of allocations +class MockOutputStream : public io::OutputStream { + public: + MockOutputStream() : extent_bytes_written_(0) {} + + Status Close() override { return Status::OK(); } + + Status Write(const uint8_t* data, int64_t nbytes) override { + extent_bytes_written_ += nbytes; + return Status::OK(); + } + + Status Tell(int64_t* position) override { + *position = extent_bytes_written_; + return Status::OK(); + } + + int64_t GetExtentBytesWritten() const { return extent_bytes_written_; } + + private: + int64_t extent_bytes_written_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_UTIL_H diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index f2a90b71a49..c400e14ea47 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -27,6 +27,7 @@ set(PARQUET_SRCS set(PARQUET_LIBS arrow_shared + arrow_io parquet_shared ) diff --git a/cpp/src/arrow/parquet/io.cc b/cpp/src/arrow/parquet/io.cc index b6fdd67d15b..a50d753f305 100644 --- a/cpp/src/arrow/parquet/io.cc +++ b/cpp/src/arrow/parquet/io.cc @@ -27,7 +27,7 @@ #include "arrow/util/status.h" // To assist with readability -using ArrowROFile = arrow::io::RandomAccessFile; +using ArrowROFile = arrow::io::ReadableFileInterface; namespace arrow { namespace parquet { @@ -58,7 +58,7 @@ void ParquetAllocator::Free(uint8_t* buffer, int64_t size) { ParquetReadSource::ParquetReadSource(ParquetAllocator* allocator) : file_(nullptr), allocator_(allocator) {} -Status ParquetReadSource::Open(const std::shared_ptr& file) { +Status ParquetReadSource::Open(const std::shared_ptr& file) { int64_t file_size; RETURN_NOT_OK(file->GetSize(&file_size)); diff --git a/cpp/src/arrow/parquet/io.h b/cpp/src/arrow/parquet/io.h index 1c59695c6c1..1734863acf1 100644 --- a/cpp/src/arrow/parquet/io.h +++ b/cpp/src/arrow/parquet/io.h @@ -62,7 +62,7 @@ class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { explicit ParquetReadSource(ParquetAllocator* allocator); // We need to ask for the file size on opening the file, and this can fail - Status Open(const std::shared_ptr& file); + Status Open(const std::shared_ptr& file); void Close() override; int64_t Tell() const override; @@ -72,7 +72,7 @@ class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { private: // An Arrow readable file of some kind - std::shared_ptr file_; + std::shared_ptr file_; // The allocator is required for creating managed buffers ParquetAllocator* allocator_; diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 6615457c483..208b3e867d3 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ 
b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" +#include "arrow/io/memory.h" #include "arrow/parquet/io.h" #include "arrow/test-util.h" #include "arrow/util/memory-pool.h" @@ -96,61 +97,13 @@ TEST(TestParquetAllocator, CustomPool) { // ---------------------------------------------------------------------- // Read source tests -class BufferReader : public io::RandomAccessFile { - public: - BufferReader(const uint8_t* buffer, int buffer_size) - : buffer_(buffer), buffer_size_(buffer_size), position_(0) {} - - Status Close() override { - // no-op - return Status::OK(); - } - - Status Tell(int64_t* position) override { - *position = position_; - return Status::OK(); - } - - Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override { - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, bytes_read, buffer); - } - - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override { - memcpy(buffer, buffer_ + position_, nbytes); - *bytes_read = std::min(nbytes, buffer_size_ - position_); - position_ += *bytes_read; - return Status::OK(); - } - - Status GetSize(int64_t* size) override { - *size = buffer_size_; - return Status::OK(); - } - - Status Seek(int64_t position) override { - if (position < 0 || position >= buffer_size_) { - return Status::IOError("position out of bounds"); - } - - position_ = position; - return Status::OK(); - } - - private: - const uint8_t* buffer_; - int buffer_size_; - int64_t position_; -}; - TEST(TestParquetReadSource, Basics) { std::string data = "this is the data"; auto data_buffer = reinterpret_cast(data.c_str()); ParquetAllocator allocator(default_memory_pool()); - auto file = std::make_shared(data_buffer, data.size()); + auto file = std::make_shared(data_buffer, data.size()); auto source = std::make_shared(&allocator); ASSERT_OK(source->Open(file)); diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc index a2bcd3e05c3..63ad8fba465 100644 --- a/cpp/src/arrow/parquet/parquet-schema-test.cc +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -178,8 +178,7 @@ class TestConvertArrowSchema : public ::testing::Test { NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes); const GroupNode* expected_schema_node = static_cast(schema_node.get()); - const GroupNode* result_schema_node = - static_cast(result_schema_->schema().get()); + const GroupNode* result_schema_node = result_schema_->group_node(); ASSERT_EQ(expected_schema_node->field_count(), result_schema_node->field_count()); diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 440ec84e2c7..0c2fc6e8fc7 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -149,7 +149,7 @@ bool FileReader::Impl::CheckForFlatColumn(const ::parquet::ColumnDescriptor* des } Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr* out) { - const ::parquet::SchemaDescriptor* schema = reader_->metadata()->schema_descriptor(); + const ::parquet::SchemaDescriptor* schema = reader_->metadata()->schema(); if (!CheckForFlatColumn(schema->Column(i))) { return Status::Invalid("The requested column is not flat"); @@ -167,9 +167,9 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadFlatTable(std::shared_ptr
* table) { - auto descr = reader_->metadata()->schema_descriptor(); + auto descr = reader_->metadata()->schema(); - const std::string& name = descr->schema()->name(); + const std::string& name = descr->name(); std::shared_ptr schema; RETURN_NOT_OK(FromParquetSchema(descr, &schema)); @@ -193,7 +193,7 @@ FileReader::FileReader( FileReader::~FileReader() {} // Static ctor -Status OpenFile(const std::shared_ptr& file, +Status OpenFile(const std::shared_ptr& file, ParquetAllocator* allocator, std::unique_ptr* reader) { std::unique_ptr source(new ParquetReadSource(allocator)); RETURN_NOT_OK(source->Open(file)); diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index f1492f64521..a9c64eca997 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -137,7 +137,7 @@ class ARROW_EXPORT FlatColumnReader { // Helper function to create a file reader from an implementation of an Arrow // readable file ARROW_EXPORT -Status OpenFile(const std::shared_ptr& file, +Status OpenFile(const std::shared_ptr& file, ParquetAllocator* allocator, std::unique_ptr* reader); } // namespace parquet diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc index cd91df32271..ff32e51bacd 100644 --- a/cpp/src/arrow/parquet/schema.cc +++ b/cpp/src/arrow/parquet/schema.cc @@ -202,7 +202,7 @@ Status FromParquetSchema( // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes // from the root Parquet node const GroupNode* schema_node = - static_cast(parquet_schema->schema().get()); + static_cast(parquet_schema->group_node()); std::vector> fields(schema_node->field_count()); for (int i = 0; i < schema_node->field_count(); i++) { diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index ddee573fa1e..2b47f1461c9 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -334,7 +334,7 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; RETURN_NOT_OK( ToParquetSchema(table->schema().get(), *properties.get(), &parquet_schema)); - auto schema_node = std::static_pointer_cast(parquet_schema->schema()); + auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); std::unique_ptr parquet_writer = ParquetFileWriter::Open(sink, schema_node, properties); FileWriter writer(pool, std::move(parquet_writer)); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 02677d5e18b..b4c3721a728 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -149,7 +149,7 @@ struct ARROW_EXPORT Field { int64_t dictionary; Field(const std::string& name, const TypePtr& type, bool nullable = true, - int64_t dictionary = 0) + int64_t dictionary = 0) : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -159,7 +159,7 @@ struct ARROW_EXPORT Field { bool Equals(const Field& other) const { return (this == &other) || (this->name == other.name && this->nullable == other.nullable && - this->dictionary == dictionary && this->type->Equals(other.type.get())); + this->dictionary == dictionary && this->type->Equals(other.type.get())); } bool Equals(const std::shared_ptr& other) const { return Equals(*other.get()); } diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index e767e955524..5d60376f794 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ 
-64,6 +64,6 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { pool->Free(data, 100); } -#endif // ARROW_VALGRIND +#endif // ARROW_VALGRIND } // namespace arrow diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index 734ace6c923..f338a436814 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -29,25 +29,41 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: ObjectType_FILE" arrow::io::ObjectType::FILE" ObjectType_DIRECTORY" arrow::io::ObjectType::DIRECTORY" - cdef cppclass FileBase: + cdef cppclass FileInterface: CStatus Close() CStatus Tell(int64_t* position) + FileMode mode() - cdef cppclass ReadableFile(FileBase): + cdef cppclass Readable: + CStatus Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) + + cdef cppclass Seekable: + CStatus Seek(int64_t position) + + cdef cppclass Writeable: + CStatus Write(const uint8_t* data, int64_t nbytes) + + cdef cppclass OutputStream(FileInterface, Writeable): + pass + + cdef cppclass InputStream(FileInterface, Readable): + pass + + cdef cppclass ReadableFileInterface(InputStream, Seekable): CStatus GetSize(int64_t* size) - CStatus Read(int64_t nbytes, int64_t* bytes_read, - uint8_t* buffer) CStatus ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) + CStatus ReadAt(int64_t position, int64_t nbytes, + int64_t* bytes_read, shared_ptr[Buffer]* out) - cdef cppclass RandomAccessFile(ReadableFile): - CStatus Seek(int64_t position) + cdef cppclass WriteableFileInterface(OutputStream, Seekable): + CStatus WriteAt(int64_t position, const uint8_t* data, + int64_t nbytes) - cdef cppclass WriteableFile(FileBase): - CStatus Write(const uint8_t* buffer, int64_t nbytes) - # CStatus Write(const uint8_t* buffer, int64_t nbytes, - # int64_t* bytes_written) + cdef cppclass ReadWriteFileInterface(ReadableFileInterface, + WriteableFileInterface): + pass cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: @@ -70,10 +86,10 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: int64_t block_size int16_t permissions - cdef cppclass HdfsReadableFile(RandomAccessFile): + cdef cppclass HdfsReadableFile(ReadableFileInterface): pass - cdef cppclass HdfsWriteableFile(WriteableFile): + cdef cppclass HdfsOutputStream(OutputStream): pass cdef cppclass CHdfsClient" arrow::io::HdfsClient": @@ -103,4 +119,4 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: CStatus OpenWriteable(const c_string& path, c_bool append, int32_t buffer_size, int16_t replication, int64_t default_block_size, - shared_ptr[HdfsWriteableFile]* handle) + shared_ptr[HdfsOutputStream]* handle) diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index fe24f593e32..f932a931493 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -19,7 +19,7 @@ from pyarrow.includes.common cimport * from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool -from pyarrow.includes.libarrow_io cimport RandomAccessFile +from pyarrow.includes.libarrow_io cimport ReadableFileInterface cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: @@ -78,10 +78,10 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: unique_ptr[ParquetFileReader] OpenFile(const c_string& path) cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: - cdef cppclass OutputStream: + cdef cppclass ParquetOutputStream" parquet::OutputStream": 
pass

-    cdef cppclass LocalFileOutputStream(OutputStream):
+    cdef cppclass LocalFileOutputStream(ParquetOutputStream):
         LocalFileOutputStream(const c_string& path)
         void Close()

@@ -100,11 +100,11 @@ cdef extern from "arrow/parquet/io.h" namespace "arrow::parquet" nogil:
     cdef cppclass ParquetReadSource:
         ParquetReadSource(ParquetAllocator* allocator)
-        Open(const shared_ptr[RandomAccessFile]& file)
+        Open(const shared_ptr[ReadableFileInterface]& file)

 cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil:

-    CStatus OpenFile(const shared_ptr[RandomAccessFile]& file,
+    CStatus OpenFile(const shared_ptr[ReadableFileInterface]& file,
                      ParquetAllocator* allocator,
                      unique_ptr[FileReader]* reader)

@@ -121,6 +121,8 @@ cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil:

 cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil:
-    cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool,
-            const shared_ptr[OutputStream]& sink, int64_t chunk_size,
-            const shared_ptr[WriterProperties]& properties)
+    cdef CStatus WriteFlatTable(
+        const CTable* table, MemoryPool* pool,
+        const shared_ptr[ParquetOutputStream]& sink,
+        int64_t chunk_size,
+        const shared_ptr[WriterProperties]& properties)
diff --git a/python/pyarrow/io.pxd b/python/pyarrow/io.pxd
index b92af72704a..f55fc0ab53a 100644
--- a/python/pyarrow/io.pxd
+++ b/python/pyarrow/io.pxd
@@ -19,7 +19,8 @@
 from pyarrow.includes.common cimport *
 from pyarrow.includes.libarrow cimport *
-from pyarrow.includes.libarrow_io cimport RandomAccessFile, WriteableFile
+from pyarrow.includes.libarrow_io cimport (ReadableFileInterface,
+                                           OutputStream)

 cdef class NativeFileInterface:
@@ -28,5 +29,5 @@ cdef class NativeFileInterface:
     # extension classes are technically virtual in the C++ sense) we can
     # expose the arrow::io abstract file interfaces to other components
     # throughout the suite of Arrow C++ libraries
-    cdef read_handle(self, shared_ptr[RandomAccessFile]* file)
-    cdef write_handle(self, shared_ptr[WriteableFile]* file)
+    cdef read_handle(self, shared_ptr[ReadableFileInterface]* file)
+    cdef write_handle(self, shared_ptr[OutputStream]* file)
diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx
index b8bf8835620..f2eee260c33 100644
--- a/python/pyarrow/io.pyx
+++ b/python/pyarrow/io.pyx
@@ -316,16 +316,16 @@ cdef class HdfsClient:

 cdef class NativeFileInterface:

-    cdef read_handle(self, shared_ptr[RandomAccessFile]* file):
+    cdef read_handle(self, shared_ptr[ReadableFileInterface]* file):
         raise NotImplementedError

-    cdef write_handle(self, shared_ptr[WriteableFile]* file):
+    cdef write_handle(self, shared_ptr[OutputStream]* file):
         raise NotImplementedError

 cdef class HdfsFile(NativeFileInterface):
     cdef:
         shared_ptr[HdfsReadableFile] rd_file
-        shared_ptr[HdfsWriteableFile] wr_file
+        shared_ptr[HdfsOutputStream] wr_file
         bint is_readonly
         bint is_open
         object parent
@@ -364,13 +364,13 @@ cdef class HdfsFile(NativeFileInterface):
         if self.is_readonly:
             raise IOError("only valid on writeonly files")

-    cdef read_handle(self, shared_ptr[RandomAccessFile]* file):
+    cdef read_handle(self, shared_ptr[ReadableFileInterface]* file):
         self._assert_readable()
-        file[0] = self.rd_file
+        file[0] = self.rd_file

-    cdef write_handle(self, shared_ptr[WriteableFile]* file):
+    cdef write_handle(self, shared_ptr[OutputStream]* file):
         self._assert_writeable()
-        file[0] = self.wr_file
+        file[0] = self.wr_file

     def size(self):
         cdef int64_t size
diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx
index
ebba1a17ac7..fb36b2967c0 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -21,7 +21,7 @@ from pyarrow.includes.libarrow cimport * from pyarrow.includes.parquet cimport * -from pyarrow.includes.libarrow_io cimport RandomAccessFile, WriteableFile +from pyarrow.includes.libarrow_io cimport ReadableFileInterface cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.compat import tobytes @@ -55,7 +55,7 @@ cdef class ParquetReader: ParquetFileReader.OpenFile(path))) cdef open_native_file(self, NativeFileInterface file): - cdef shared_ptr[RandomAccessFile] cpp_handle + cdef shared_ptr[ReadableFileInterface] cpp_handle file.read_handle(&cpp_handle) check_cstatus(OpenFile(cpp_handle, &self.allocator, &self.reader)) @@ -105,7 +105,7 @@ def write_table(table, filename, chunk_size=None, version=None): """ cdef Table table_ = table cdef CTable* ctable_ = table_.table - cdef shared_ptr[OutputStream] sink + cdef shared_ptr[ParquetOutputStream] sink cdef WriterProperties.Builder properties_builder cdef int64_t chunk_size_ = 0 if chunk_size is None: From 5f1556c011446a9fc524e91042c859365ed7afc1 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Mon, 19 Sep 2016 14:08:32 -0700 Subject: [PATCH 138/210] ARROW-297: Fix Arrow pom for release Author: Julien Le Dem Closes #140 from julienledem/fix_pom_for_release and squashes the following commits: 9618eaf [Julien Le Dem] ARROW-297: Fix Arrow pom for release --- java/format/pom.xml | 19 ++++++++++--------- java/pom.xml | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/java/format/pom.xml b/java/format/pom.xml index 4cf68bbe057..78300047862 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -27,6 +27,7 @@ 1.2.0-3f79e055 false ${project.build.directory}/flatc-${os.detected.classifier}-${fbs.version}.exe + ${project.build.directory}/generated-sources/flatc 3.3 2.10 1.5.0.Final @@ -51,7 +52,7 @@ - + org.apache.maven.plugins maven-dependency-plugin ${maven-dependency-plugin.version} @@ -83,7 +84,7 @@ exec-maven-plugin 1.4.0 - + script-chmod exec @@ -98,7 +99,7 @@ ${flatc.download.skip} - + exec @@ -108,7 +109,7 @@ -j -o - target/generated-sources/flatc + ${flatc.generated.files} ../../format/Message.fbs ../../format/File.fbs @@ -116,7 +117,7 @@ - + com.mycila license-maven-plugin 2.3 @@ -135,26 +136,26 @@ - + org.codehaus.mojo build-helper-maven-plugin 1.9.1 - add-sources-as-resources + add-generated-sources-to-classpath generate-sources add-source - ${project.build.directory}/generated-sources + ${flatc.generated.files} - + org.apache.maven.plugins maven-checkstyle-plugin diff --git a/java/pom.xml b/java/pom.xml index a8e24ed054c..fc2c18d0e51 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -16,7 +16,7 @@ org.apache apache - 14 + 18 org.apache.arrow From 53583281b2af3e4ecedd3b130cef588680a44c4f Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Wed, 21 Sep 2016 13:38:52 -0700 Subject: [PATCH 139/210] ARROW-298: create release scripts Author: Julien Le Dem Closes #141 from julienledem/release and squashes the following commits: 1a5114d [Julien Le Dem] ARROW-298: create release scripts --- dev/release/00-prepare.sh | 46 ++++++++++++++++++++++ dev/release/01-perform.sh | 27 +++++++++++++ dev/release/02-source.sh | 80 +++++++++++++++++++++++++++++++++++++++ dev/release/README | 15 ++++++++ java/README.md | 14 +++++++ 5 files changed, 182 insertions(+) create mode 100644 dev/release/00-prepare.sh create mode 100644 dev/release/01-perform.sh create mode 100644 dev/release/02-source.sh create mode 100644 
dev/release/README
 create mode 100644 java/README.md

diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh
new file mode 100644
index 00000000000..3c1fb9a0938
--- /dev/null
+++ b/dev/release/00-prepare.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+if [ -z "$1" ]; then
+  echo "Usage: $0 <version> <next-version>"
+  exit
+fi
+
+if [ -z "$2" ]; then
+  echo "Usage: $0 <version> <next-version>"
+  exit
+fi
+
+version=$1
+
+tag=apache-arrow-${version}
+
+nextVersion=$2
+
+cd "${SOURCE_DIR}/../../java"
+
+mvn release:clean
+mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmodules -DdevelopmentVersion=${nextVersion}-SNAPSHOT
+
+cd -
+
+echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh"
diff --git a/dev/release/01-perform.sh b/dev/release/01-perform.sh
new file mode 100644
index 00000000000..d7140f6cba1
--- /dev/null
+++ b/dev/release/01-perform.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
+cd "${SOURCE_DIR}/../../java"
+
+mvn release:perform
+
+cd -
diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh
new file mode 100644
index 00000000000..f44692d5e9d
--- /dev/null
+++ b/dev/release/02-source.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+if [ -z "$1" ]; then
+  echo "Usage: $0 <version> <rc-number>"
+  exit
+fi
+
+if [ -z "$2" ]; then
+  echo "Usage: $0 <version> <rc-number>"
+  exit
+fi
+
+version=$1
+rc=$2
+
+if [ -d tmp/ ]; then
+  echo "Cannot run: tmp/ exists"
+  exit
+fi
+
+tag=apache-arrow-$version
+tagrc=${tag}-rc${rc}
+
+echo "Preparing source for $tagrc"
+
+release_hash=`git rev-list $tag 2> /dev/null | head -n 1 `
+
+if [ -z "$release_hash" ]; then
+  echo "Cannot continue: unknown git tag: $tag"
+  exit
+fi
+
+echo "Using commit $release_hash"
+
+tarball=$tag.tar.gz
+
+# be conservative and use the release hash, even though git produces the same
+# archive (identical hashes) using the scm tag
+git archive $release_hash --prefix $tag/ -o $tarball
+
+# sign the archive
+gpg --armor --output ${tarball}.asc --detach-sig $tarball
+gpg --print-md MD5 $tarball > ${tarball}.md5
+shasum $tarball > ${tarball}.sha
+
+# check out the arrow RC folder
+svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow tmp
+
+# add the release candidate for the tag
+mkdir -p tmp/$tagrc
+cp ${tarball}* tmp/$tagrc
+svn add tmp/$tagrc
+svn ci -m "Apache Arrow $version RC${rc}" tmp/$tagrc
+
+# clean up
+rm -rf tmp
+
+echo "Success! The release candidate is available here:"
+echo "  https://dist.apache.org/repos/dist/dev/arrow/$tagrc"
+echo ""
+echo "Commit SHA1: $release_hash"
+
diff --git a/dev/release/README b/dev/release/README
new file mode 100644
index 00000000000..4fcc5d9728c
--- /dev/null
+++ b/dev/release/README
@@ -0,0 +1,15 @@
+requirements:
+- being a committer to be able to push to dist and maven repository
+- a gpg key to sign the artifacts
+
+to release, run the following (replace 0.1.0 with version to release):
+# prepare release v 0.1.0 (run tests, sign artifacts). Next version will be 0.1.1-SNAPSHOT
+dev/release/00-prepare.sh 0.1.0 0.1.1
+# tag and push to maven repo (repo will have to be finalized separately)
+dev/release/01-perform.sh
+# create the source release
+dev/release/02-source.sh 0.1.0 0
+
+useful commands:
+to set the mvn version in the poms
+mvn versions:set -DnewVersion=0.1-SNAPSHOT
diff --git a/java/README.md b/java/README.md
new file mode 100644
index 00000000000..5e1d30d9fd2
--- /dev/null
+++ b/java/README.md
@@ -0,0 +1,14 @@
+# Arrow Java
+
+## Setup Build Environment
+
+install:
+ - java 7 or later
+ - maven 3.3 or later
+
+## Building and running tests
+
+```
+cd java
+mvn install
+```
From 430bd9576ceb14456cd6853f6d75ca19b333efc2 Mon Sep 17 00:00:00 2001
From: "Uwe L. Korn"
Date: Wed, 21 Sep 2016 18:14:00 -0400
Subject: [PATCH 140/210] ARROW-299: Use absolute namespace in macros

Author: Uwe L. Korn

Closes #142 from xhochy/arrow-299 and squashes the following commits:

b7967fa [Uwe L.
Korn] ARROW-299: Use absolute namespace in macros --- cpp/src/arrow/util/logging.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 54f67593bec..d320d6adb7c 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -35,7 +35,7 @@ namespace arrow { #define ARROW_ERROR 2 #define ARROW_FATAL 3 -#define ARROW_LOG_INTERNAL(level) arrow::internal::CerrLog(level) +#define ARROW_LOG_INTERNAL(level) ::arrow::internal::CerrLog(level) #define ARROW_LOG(level) ARROW_LOG_INTERNAL(ARROW_##level) #define ARROW_CHECK(condition) \ @@ -47,25 +47,25 @@ namespace arrow { #define DCHECK(condition) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_EQ(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_NE(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_LE(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_LT(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_GE(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #define DCHECK_GT(val1, val2) \ while (false) \ - arrow::internal::NullLog() + ::arrow::internal::NullLog() #else #define ARROW_DFATAL ARROW_FATAL From 7e39747eec05379710e1a42ecbaf1d9795bc3cf0 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 21 Sep 2016 18:15:58 -0400 Subject: [PATCH 141/210] ARROW-267: [C++] Implement file format layout for IPC/RPC Standing up the PR to get some feedback. I still have to implement the read path for record batches and then add a test suite. I'd also like to add some documentation about the structure of the file format and some of the implicit assumptions (e.g. word alignment) -- I put a placeholder `IPC.md` document here for this. I also conformed the language re: record batches (had been using "row batch" in the C++ code) to make things more sane. Note we are not yet able to write OS files here, see ARROW-293. Will tackle that in a follow up PR, and then we should be in a position to integration test. Author: Wes McKinney Closes #139 from wesm/ARROW-267 and squashes the following commits: 9bdbbd4 [Wes McKinney] Get test suite passing, add missing metadata adapters for string, binary 4d3cc1d [Wes McKinney] cpplint 2ec1aad [Wes McKinney] Draft failing file roundtrip test 358309b [Wes McKinney] Move record batch test fixtures into test-common.h b88bce0 [Wes McKinney] Finish draft of FileReader::GetRecordBatch. Add body end offset to ipc adapter edf36e7 [Wes McKinney] Start drafting FileReader IPC implementation. Change record batch data header to write metadata size int32_t as suffix rather than prefix 95157f2 [Wes McKinney] Make record batch writes aligned on word boundaries 7c50251 [Wes McKinney] Make the interface for WriteRecordBatch more flexible (not require constructing a RecordBatch object) ab4056f [Wes McKinney] Drafting file reader/writer API. 
Implement BufferOutputStream and write file footers to an OutputStream 113ac7b [Wes McKinney] Draft file footer metadata write/read path with simple unit test --- NOTICE.txt | 6 + cpp/src/arrow/io/memory.cc | 37 ++++ cpp/src/arrow/io/memory.h | 18 +- cpp/src/arrow/ipc/CMakeLists.txt | 18 +- cpp/src/arrow/ipc/adapter.cc | 126 ++++++----- cpp/src/arrow/ipc/adapter.h | 47 ++-- cpp/src/arrow/ipc/file.cc | 210 ++++++++++++++++++ cpp/src/arrow/ipc/file.h | 146 +++++++++++++ cpp/src/arrow/ipc/ipc-adapter-test.cc | 284 +++++-------------------- cpp/src/arrow/ipc/ipc-file-test.cc | 125 +++++++++++ cpp/src/arrow/ipc/ipc-metadata-test.cc | 77 ++++++- cpp/src/arrow/ipc/metadata-internal.cc | 46 ++-- cpp/src/arrow/ipc/metadata-internal.h | 9 + cpp/src/arrow/ipc/metadata.cc | 171 ++++++++++++--- cpp/src/arrow/ipc/metadata.h | 64 +++++- cpp/src/arrow/ipc/test-common.h | 193 ++++++++++++++++- cpp/src/arrow/ipc/util.h | 8 + cpp/src/arrow/parquet/reader.h | 2 +- cpp/src/arrow/parquet/writer.h | 2 +- cpp/src/arrow/table.cc | 4 +- cpp/src/arrow/table.h | 16 +- format/IPC.md | 3 + format/README.md | 1 + 23 files changed, 1231 insertions(+), 382 deletions(-) create mode 100644 cpp/src/arrow/ipc/file.cc create mode 100644 cpp/src/arrow/ipc/file.h create mode 100644 cpp/src/arrow/ipc/ipc-file-test.cc create mode 100644 format/IPC.md diff --git a/NOTICE.txt b/NOTICE.txt index a85101617ce..ce6e567dcb5 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -12,3 +12,9 @@ This product includes software from the Numpy project (BSD-new) https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910 * Copyright (c) 1995, 1996, 1997 Jim Hugunin, hugunin@mit.edu * Copyright (c) 2005 Travis E. Oliphant oliphant@ee.byu.edu Brigham Young University + +This product includes software from the Feather project (Apache 2.0) +https://github.com/wesm/feather + +This product includes software from the DyND project (BSD 2-clause) +https://github.com/libdynd diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 1dd6c3a0230..c168c91c5f8 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -206,6 +206,43 @@ Status MemoryMappedFile::WriteInternal(const uint8_t* data, int64_t nbytes) { return Status::OK(); } +// ---------------------------------------------------------------------- +// OutputStream that writes to resizable buffer + +static constexpr int64_t kBufferMinimumSize = 256; + +BufferOutputStream::BufferOutputStream(const std::shared_ptr& buffer) + : buffer_(buffer), + capacity_(buffer->size()), + position_(0), + mutable_data_(buffer->mutable_data()) {} + +Status BufferOutputStream::Close() { + return Status::OK(); +} + +Status BufferOutputStream::Tell(int64_t* position) { + *position = position_; + return Status::OK(); +} + +Status BufferOutputStream::Write(const uint8_t* data, int64_t nbytes) { + RETURN_NOT_OK(Reserve(nbytes)); + std::memcpy(mutable_data_ + position_, data, nbytes); + position_ += nbytes; + return Status::OK(); +} + +Status BufferOutputStream::Reserve(int64_t nbytes) { + while (position_ + nbytes > capacity_) { + int64_t new_capacity = std::max(kBufferMinimumSize, capacity_ * 2); + RETURN_NOT_OK(buffer_->Resize(new_capacity)); + capacity_ = new_capacity; + } + mutable_data_ = buffer_->mutable_data(); + return Status::OK(); +} + // ---------------------------------------------------------------------- // In-memory buffer reader diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 
6fe47c3b515..51601a0a626 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -32,32 +32,30 @@ namespace arrow { class Buffer; -class MutableBuffer; +class ResizableBuffer; class Status; namespace io { // An output stream that writes to a MutableBuffer, such as one obtained from a // memory map -// -// TODO(wesm): Implement this class class ARROW_EXPORT BufferOutputStream : public OutputStream { public: - explicit BufferOutputStream(const std::shared_ptr& buffer) - : buffer_(buffer) {} + explicit BufferOutputStream(const std::shared_ptr& buffer); // Implement the OutputStream interface Status Close() override; Status Tell(int64_t* position) override; - Status Write(const uint8_t* data, int64_t length) override; - - // Returns the number of bytes remaining in the buffer - int64_t bytes_remaining() const; + Status Write(const uint8_t* data, int64_t nbytes) override; private: - std::shared_ptr buffer_; + // Ensures there is sufficient space available to write nbytes + Status Reserve(int64_t nbytes); + + std::shared_ptr buffer_; int64_t capacity_; int64_t position_; + uint8_t* mutable_data_; }; // A memory source that uses memory-mapped files for memory interactions diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index e5553a63581..bde8c5bf738 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -33,6 +33,7 @@ set(ARROW_IPC_TEST_LINK_LIBS set(ARROW_IPC_SRCS adapter.cc + file.cc metadata.cc metadata-internal.cc ) @@ -60,6 +61,10 @@ ADD_ARROW_TEST(ipc-adapter-test) ARROW_TEST_LINK_LIBRARIES(ipc-adapter-test ${ARROW_IPC_TEST_LINK_LIBS}) +ADD_ARROW_TEST(ipc-file-test) +ARROW_TEST_LINK_LIBRARIES(ipc-file-test + ${ARROW_IPC_TEST_LINK_LIBS}) + ADD_ARROW_TEST(ipc-metadata-test) ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test ${ARROW_IPC_TEST_LINK_LIBS}) @@ -70,14 +75,20 @@ set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) set(OUTPUT_DIR ${CMAKE_SOURCE_DIR}/src/arrow/ipc) set(FBS_OUTPUT_FILES "${OUTPUT_DIR}/Message_generated.h") -set(FBS_SRC ${CMAKE_SOURCE_DIR}/../format/Message.fbs) -get_filename_component(ABS_FBS_SRC ${FBS_SRC} ABSOLUTE) +set(FBS_SRC + ${CMAKE_SOURCE_DIR}/../format/Message.fbs + ${CMAKE_SOURCE_DIR}/../format/File.fbs) + +foreach(FIL ${FBS_SRC}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + list(APPEND ABS_FBS_SRC ${ABS_FIL}) +endforeach() add_custom_command( OUTPUT ${FBS_OUTPUT_FILES} COMMAND ${FLATBUFFERS_COMPILER} -c -o ${OUTPUT_DIR} ${ABS_FBS_SRC} DEPENDS ${ABS_FBS_SRC} - COMMENT "Running flatc compiler on ${FBS_SRC}" + COMMENT "Running flatc compiler on ${ABS_FBS_SRC}" VERBATIM ) @@ -87,6 +98,7 @@ add_dependencies(arrow_objlib metadata_fbs) # Headers: top level install(FILES adapter.h + file.h metadata.h DESTINATION include/arrow/ipc) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 0e101c89303..89b7fb987c6 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -95,7 +95,7 @@ static bool IsListType(const DataType* type) { } // ---------------------------------------------------------------------- -// Row batch write path +// Record batch write path Status VisitArray(const Array* arr, std::vector* field_nodes, std::vector>* buffers, int max_recursion_depth) { @@ -132,28 +132,32 @@ Status VisitArray(const Array* arr, std::vector* field_nodes return Status::OK(); } -class RowBatchWriter { +class RecordBatchWriter { public: - RowBatchWriter(const RowBatch* batch, int max_recursion_depth) - : batch_(batch), 
max_recursion_depth_(max_recursion_depth) {} + RecordBatchWriter(const std::vector>& columns, int32_t num_rows, + int max_recursion_depth) + : columns_(&columns), + num_rows_(num_rows), + max_recursion_depth_(max_recursion_depth) {} Status AssemblePayload() { // Perform depth-first traversal of the row-batch - for (int i = 0; i < batch_->num_columns(); ++i) { - const Array* arr = batch_->column(i).get(); + for (size_t i = 0; i < columns_->size(); ++i) { + const Array* arr = (*columns_)[i].get(); RETURN_NOT_OK(VisitArray(arr, &field_nodes_, &buffers_, max_recursion_depth_)); } return Status::OK(); } - Status Write(io::OutputStream* dst, int64_t* data_header_offset) { - // Write out all the buffers contiguously and compute the total size of the - // memory payload - int64_t offset = 0; - + Status Write( + io::OutputStream* dst, int64_t* body_end_offset, int64_t* header_end_offset) { // Get the starting position - int64_t position; - RETURN_NOT_OK(dst->Tell(&position)); + int64_t start_position; + RETURN_NOT_OK(dst->Tell(&start_position)); + + // Keep track of the current position so we can determine the size of the + // message body + int64_t position = start_position; for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); @@ -175,14 +179,16 @@ class RowBatchWriter { // are using from any OS-level shared memory. The thought is that systems // may (in the future) associate integer page id's with physical memory // pages (according to whatever is the desired shared memory mechanism) - buffer_meta_.push_back(flatbuf::Buffer(0, position + offset, size)); + buffer_meta_.push_back(flatbuf::Buffer(0, position, size)); if (size > 0) { RETURN_NOT_OK(dst->Write(buffer->data(), size)); - offset += size; + position += size; } } + *body_end_offset = position; + // Now that we have computed the locations of all of the buffers in shared // memory, the data header can be converted to a flatbuffer and written out // @@ -192,27 +198,43 @@ class RowBatchWriter { // construct the flatbuffer data accessor object (see arrow::ipc::Message) std::shared_ptr data_header; RETURN_NOT_OK(WriteDataHeader( - batch_->num_rows(), offset, field_nodes_, buffer_meta_, &data_header)); + num_rows_, position - start_position, field_nodes_, buffer_meta_, &data_header)); // Write the data header at the end RETURN_NOT_OK(dst->Write(data_header->data(), data_header->size())); - *data_header_offset = position + offset; + position += data_header->size(); + *header_end_offset = position; + + return Align(dst, &position); + } + + Status Align(io::OutputStream* dst, int64_t* position) { + // Write all buffers here on word boundaries + // TODO(wesm): Is there benefit to 64-byte padding in IPC? + int64_t remainder = PaddedLength(*position) - *position; + if (remainder > 0) { + RETURN_NOT_OK(dst->Write(kPaddingBytes, remainder)); + *position += remainder; + } return Status::OK(); } // This must be called after invoking AssemblePayload Status GetTotalSize(int64_t* size) { // emulates the behavior of Write without actually writing + int64_t body_offset; int64_t data_header_offset; MockOutputStream dst; - RETURN_NOT_OK(Write(&dst, &data_header_offset)); + RETURN_NOT_OK(Write(&dst, &body_offset, &data_header_offset)); *size = dst.GetExtentBytesWritten(); return Status::OK(); } private: - const RowBatch* batch_; + // Do not copy this vector. 
Ownership must be retained elsewhere + const std::vector>* columns_; + int32_t num_rows_; std::vector field_nodes_; std::vector buffer_meta_; @@ -220,29 +242,29 @@ class RowBatchWriter { int max_recursion_depth_; }; -Status WriteRowBatch(io::OutputStream* dst, const RowBatch* batch, int64_t* header_offset, - int max_recursion_depth) { +Status WriteRecordBatch(const std::vector>& columns, + int32_t num_rows, io::OutputStream* dst, int64_t* body_end_offset, + int64_t* header_end_offset, int max_recursion_depth) { DCHECK_GT(max_recursion_depth, 0); - RowBatchWriter serializer(batch, max_recursion_depth); + RecordBatchWriter serializer(columns, num_rows, max_recursion_depth); RETURN_NOT_OK(serializer.AssemblePayload()); - return serializer.Write(dst, header_offset); + return serializer.Write(dst, body_end_offset, header_end_offset); } -Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { - RowBatchWriter serializer(batch, kMaxIpcRecursionDepth); +Status GetRecordBatchSize(const RecordBatch* batch, int64_t* size) { + RecordBatchWriter serializer( + batch->columns(), batch->num_rows(), kMaxIpcRecursionDepth); RETURN_NOT_OK(serializer.AssemblePayload()); RETURN_NOT_OK(serializer.GetTotalSize(size)); return Status::OK(); } // ---------------------------------------------------------------------- -// Row batch read path +// Record batch read path -static constexpr int64_t INIT_METADATA_SIZE = 4096; - -class RowBatchReader::RowBatchReaderImpl { +class RecordBatchReader::RecordBatchReaderImpl { public: - RowBatchReaderImpl(io::ReadableFileInterface* file, + RecordBatchReaderImpl(io::ReadableFileInterface* file, const std::shared_ptr& metadata, int max_recursion_depth) : file_(file), metadata_(metadata), max_recursion_depth_(max_recursion_depth) { num_buffers_ = metadata->num_buffers(); @@ -250,7 +272,7 @@ class RowBatchReader::RowBatchReaderImpl { } Status AssembleBatch( - const std::shared_ptr& schema, std::shared_ptr* out) { + const std::shared_ptr& schema, std::shared_ptr* out) { std::vector> arrays(schema->num_fields()); // The field_index and buffer_index are incremented in NextArray based on @@ -263,7 +285,7 @@ class RowBatchReader::RowBatchReaderImpl { RETURN_NOT_OK(NextArray(field, max_recursion_depth_, &arrays[i])); } - *out = std::make_shared(schema, metadata_->length(), arrays); + *out = std::make_shared(schema, metadata_->length(), arrays); return Status::OK(); } @@ -359,29 +381,31 @@ class RowBatchReader::RowBatchReaderImpl { int num_flattened_fields_; }; -Status RowBatchReader::Open(io::ReadableFileInterface* file, int64_t position, - std::shared_ptr* out) { - return Open(file, position, kMaxIpcRecursionDepth, out); +Status RecordBatchReader::Open(io::ReadableFileInterface* file, int64_t offset, + std::shared_ptr* out) { + return Open(file, offset, kMaxIpcRecursionDepth, out); } -Status RowBatchReader::Open(io::ReadableFileInterface* file, int64_t position, - int max_recursion_depth, std::shared_ptr* out) { - std::shared_ptr metadata; - RETURN_NOT_OK(file->ReadAt(position, INIT_METADATA_SIZE, &metadata)); +Status RecordBatchReader::Open(io::ReadableFileInterface* file, int64_t offset, + int max_recursion_depth, std::shared_ptr* out) { + std::shared_ptr buffer; + RETURN_NOT_OK(file->ReadAt(offset - sizeof(int32_t), sizeof(int32_t), &buffer)); - int32_t metadata_size = *reinterpret_cast(metadata->data()); + int32_t metadata_size = *reinterpret_cast(buffer->data()); - // We may not need to call ReadAt again - if (metadata_size > static_cast(INIT_METADATA_SIZE - sizeof(int32_t))) 
{ - // We don't have enough data, read the indicated metadata size. - RETURN_NOT_OK(file->ReadAt(position + sizeof(int32_t), metadata_size, &metadata)); + if (metadata_size + static_cast(sizeof(int32_t)) > offset) { + return Status::Invalid("metadata size invalid"); } + // Read the metadata + RETURN_NOT_OK( + file->ReadAt(offset - metadata_size - sizeof(int32_t), metadata_size, &buffer)); + // TODO(wesm): buffer slicing here would be better in case ReadAt returns // allocated memory std::shared_ptr message; - RETURN_NOT_OK(Message::Open(metadata, &message)); + RETURN_NOT_OK(Message::Open(buffer, &message)); if (message->type() != Message::RECORD_BATCH) { return Status::Invalid("Metadata message is not a record batch"); @@ -389,19 +413,19 @@ Status RowBatchReader::Open(io::ReadableFileInterface* file, int64_t position, std::shared_ptr batch_meta = message->GetRecordBatch(); - std::shared_ptr result(new RowBatchReader()); - result->impl_.reset(new RowBatchReaderImpl(file, batch_meta, max_recursion_depth)); + std::shared_ptr result(new RecordBatchReader()); + result->impl_.reset(new RecordBatchReaderImpl(file, batch_meta, max_recursion_depth)); *out = result; return Status::OK(); } // Here the explicit destructor is required for compilers to be aware of -// the complete information of RowBatchReader::RowBatchReaderImpl class -RowBatchReader::~RowBatchReader() {} +// the complete information of RecordBatchReader::RecordBatchReaderImpl class +RecordBatchReader::~RecordBatchReader() {} -Status RowBatchReader::GetRowBatch( - const std::shared_ptr& schema, std::shared_ptr* out) { +Status RecordBatchReader::GetRecordBatch( + const std::shared_ptr& schema, std::shared_ptr* out) { return impl_->AssembleBatch(schema, out); } diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index 215b46f8f65..3fde18dde83 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -23,13 +23,14 @@ #include #include +#include #include "arrow/util/visibility.h" namespace arrow { class Array; -class RowBatch; +class RecordBatch; class Schema; class Status; @@ -50,7 +51,7 @@ class RecordBatchMessage; // TODO(emkornfield) investigate this more constexpr int kMaxIpcRecursionDepth = 64; -// Write the RowBatch (collection of equal-length Arrow arrays) to the output +// Write the RecordBatch (collection of equal-length Arrow arrays) to the output // stream // // First, each of the memory buffers are written out end-to-end @@ -60,39 +61,43 @@ constexpr int kMaxIpcRecursionDepth = 64; // // // -// Finally, the absolute offset (relative to the start of the output stream) to -// the start of the metadata / data header is returned in an out-variable -ARROW_EXPORT Status WriteRowBatch(io::OutputStream* dst, const RowBatch* batch, - int64_t* header_offset, int max_recursion_depth = kMaxIpcRecursionDepth); +// Finally, the absolute offsets (relative to the start of the output stream) +// to the end of the body and end of the metadata / data header (suffixed by +// the header size) is returned in out-variables +ARROW_EXPORT Status WriteRecordBatch(const std::vector>& columns, + int32_t num_rows, io::OutputStream* dst, int64_t* body_end_offset, + int64_t* header_end_offset, int max_recursion_depth = kMaxIpcRecursionDepth); -// int64_t GetRowBatchMetadata(const RowBatch* batch); +// int64_t GetRecordBatchMetadata(const RecordBatch* batch); // Compute the precise number of bytes needed in a contiguous memory segment to -// write the row batch. 
This involves generating the complete serialized
+// write the record batch. This involves generating the complete serialized
 // Flatbuffers metadata.
-ARROW_EXPORT Status GetRowBatchSize(const RowBatch* batch, int64_t* size);
+ARROW_EXPORT Status GetRecordBatchSize(const RecordBatch* batch, int64_t* size);

 // ----------------------------------------------------------------------
 // "Read" path; does not copy data if the input supports zero copy reads

-class ARROW_EXPORT RowBatchReader {
+class ARROW_EXPORT RecordBatchReader {
  public:
-  static Status Open(io::ReadableFileInterface* file, int64_t position,
-      std::shared_ptr<RowBatchReader>* out);
+  // The offset is the absolute position to the *end* of the record batch data
+  // header
+  static Status Open(io::ReadableFileInterface* file, int64_t offset,
+      std::shared_ptr<RecordBatchReader>* out);

-  static Status Open(io::ReadableFileInterface* file, int64_t position,
-      int max_recursion_depth, std::shared_ptr<RowBatchReader>* out);
+  static Status Open(io::ReadableFileInterface* file, int64_t offset,
+      int max_recursion_depth, std::shared_ptr<RecordBatchReader>* out);

-  virtual ~RowBatchReader();
+  virtual ~RecordBatchReader();

-  // Reassemble the row batch. A Schema is required to be able to construct the
-  // right array containers
-  Status GetRowBatch(
-      const std::shared_ptr<Schema>& schema, std::shared_ptr<RowBatch>* out);
+  // Reassemble the record batch. A Schema is required to be able to construct
+  // the right array containers
+  Status GetRecordBatch(
+      const std::shared_ptr<Schema>& schema, std::shared_ptr<RecordBatch>* out);

  private:
-  class RowBatchReaderImpl;
-  std::unique_ptr<RowBatchReaderImpl> impl_;
+  class RecordBatchReaderImpl;
+  std::unique_ptr<RecordBatchReaderImpl> impl_;
 };

 }  // namespace ipc
diff --git a/cpp/src/arrow/ipc/file.cc b/cpp/src/arrow/ipc/file.cc
new file mode 100644
index 00000000000..2bf10dde266
--- /dev/null
+++ b/cpp/src/arrow/ipc/file.cc
@@ -0,0 +1,210 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
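An aside on the layout before the implementation that follows: the writer brackets the stream with "ARROW1" magic and records the footer size as a trailing int32. As a minimal, self-contained sketch of what those invariants mean for a reader, here is an illustrative helper that is not part of the patch; it mirrors the bounds and trailing-magic checks of FileReader::ReadFooter further down, plus the leading magic written by Start():

#include <cstdint>
#include <cstring>

// Illustrative only: does a byte range plausibly contain an Arrow file?
static bool LooksLikeArrowFile(const uint8_t* data, int64_t size) {
  static const char kMagic[] = "ARROW1";
  const int magic_size = static_cast<int>(strlen(kMagic));
  // Must hold both magic strings plus the 4-byte footer length
  if (size <= magic_size * 2 + 4) { return false; }
  if (memcmp(data, kMagic, magic_size) != 0) { return false; }
  if (memcmp(data + size - magic_size, kMagic, magic_size) != 0) { return false; }
  int32_t footer_length;
  memcpy(&footer_length, data + size - magic_size - sizeof(int32_t),
      sizeof(int32_t));
  // The footer must fit between the two magic strings
  return footer_length > 0 && footer_length + magic_size * 2 + 4 <= size;
}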
+
+#include "arrow/ipc/file.h"
+
+#include <cstdint>
+#include <cstring>
+#include <sstream>
+#include <vector>
+
+#include "arrow/ipc/adapter.h"
+#include "arrow/ipc/metadata.h"
+#include "arrow/ipc/util.h"
+#include "arrow/io/interfaces.h"
+#include "arrow/util/buffer.h"
+#include "arrow/util/logging.h"
+#include "arrow/util/status.h"
+
+namespace arrow {
+namespace ipc {
+
+static constexpr const char* kArrowMagicBytes = "ARROW1";
+
+// ----------------------------------------------------------------------
+// Writer implementation
+
+FileWriter::FileWriter(io::OutputStream* sink, const std::shared_ptr<Schema>& schema)
+    : sink_(sink), schema_(schema), position_(-1), started_(false) {}
+
+Status FileWriter::UpdatePosition() {
+  return sink_->Tell(&position_);
+}
+
+Status FileWriter::Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+    std::shared_ptr<FileWriter>* out) {
+  *out = std::shared_ptr<FileWriter>(new FileWriter(sink, schema));  // ctor is private
+  RETURN_NOT_OK((*out)->UpdatePosition());
+  return Status::OK();
+}
+
+Status FileWriter::Write(const uint8_t* data, int64_t nbytes) {
+  RETURN_NOT_OK(sink_->Write(data, nbytes));
+  position_ += nbytes;
+  return Status::OK();
+}
+
+Status FileWriter::Align() {
+  int64_t remainder = PaddedLength(position_) - position_;
+  if (remainder > 0) { return Write(kPaddingBytes, remainder); }
+  return Status::OK();
+}
+
+Status FileWriter::WriteAligned(const uint8_t* data, int64_t nbytes) {
+  RETURN_NOT_OK(Write(data, nbytes));
+  return Align();
+}
+
+Status FileWriter::Start() {
+  RETURN_NOT_OK(WriteAligned(
+      reinterpret_cast<const uint8_t*>(kArrowMagicBytes), strlen(kArrowMagicBytes)));
+  started_ = true;
+  return Status::OK();
+}
+
+Status FileWriter::CheckStarted() {
+  if (!started_) { return Start(); }
+  return Status::OK();
+}
+
+Status FileWriter::WriteRecordBatch(
+    const std::vector<std::shared_ptr<Array>>& columns, int32_t num_rows) {
+  RETURN_NOT_OK(CheckStarted());
+
+  int64_t offset = position_;
+
+  int64_t body_end_offset;
+  int64_t header_end_offset;
+  RETURN_NOT_OK(arrow::ipc::WriteRecordBatch(
+      columns, num_rows, sink_, &body_end_offset, &header_end_offset));
+  RETURN_NOT_OK(UpdatePosition());
+
+  DCHECK(position_ % 8 == 0) << "ipc::WriteRecordBatch did not perform aligned writes";
+
+  // There may be padding after the end of the metadata, so we cannot rely on
+  // position_
+  int32_t metadata_length = header_end_offset - body_end_offset;
+  int32_t body_length = body_end_offset - offset;
+
+  // Append metadata, to be written in the footer later
+  record_batches_.emplace_back(offset, metadata_length, body_length);
+
+  return Status::OK();
+}
+
+Status FileWriter::Close() {
+  // Write metadata
+  int64_t initial_position = position_;
+  RETURN_NOT_OK(WriteFileFooter(schema_.get(), dictionaries_, record_batches_, sink_));
+  RETURN_NOT_OK(UpdatePosition());
+
+  // Write footer length
+  int32_t footer_length = position_ - initial_position;
+
+  if (footer_length <= 0) { return Status::Invalid("Invalid file footer"); }
+
+  RETURN_NOT_OK(Write(reinterpret_cast<const uint8_t*>(&footer_length), sizeof(int32_t)));
+
+  // Write magic bytes to end file
+  return Write(
+      reinterpret_cast<const uint8_t*>(kArrowMagicBytes), strlen(kArrowMagicBytes));
+}
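FileWriter::Align above, like the record batch writer in adapter.cc, relies on PaddedLength and kPaddingBytes from arrow/ipc/util.h, which this patch uses but does not reproduce. A small sketch of the alignment arithmetic being assumed (the real definition may differ):

#include <cstdint>

// Hypothetical stand-in for PaddedLength in arrow/ipc/util.h: round nbytes
// up to the next multiple of 8. Align() then writes
// PaddedLength(position) - position bytes of padding.
constexpr int64_t kArrowAlignment = 8;

constexpr int64_t PaddedLength(int64_t nbytes) {
  return ((nbytes + kArrowAlignment - 1) / kArrowAlignment) * kArrowAlignment;
}

static_assert(PaddedLength(5) == 8, "rounds up");
static_assert(PaddedLength(16) == 16, "aligned input unchanged");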
+// ----------------------------------------------------------------------
+// Reader implementation
+
+FileReader::FileReader(
+    const std::shared_ptr<io::ReadableFileInterface>& file, int64_t footer_offset)
+    : file_(file), footer_offset_(footer_offset) {}
+
+FileReader::~FileReader() {}
+
+Status FileReader::Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+    std::shared_ptr<FileReader>* reader) {
+  int64_t footer_offset;
+  RETURN_NOT_OK(file->GetSize(&footer_offset));
+  return Open(file, footer_offset, reader);
+}
+
+Status FileReader::Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+    int64_t footer_offset, std::shared_ptr<FileReader>* reader) {
+  *reader = std::shared_ptr<FileReader>(new FileReader(file, footer_offset));
+  return (*reader)->ReadFooter();
+}
+
+Status FileReader::ReadFooter() {
+  int magic_size = static_cast<int>(strlen(kArrowMagicBytes));
+
+  if (footer_offset_ <= magic_size * 2 + 4) {
+    std::stringstream ss;
+    ss << "File is too small: " << footer_offset_;
+    return Status::Invalid(ss.str());
+  }
+
+  std::shared_ptr<Buffer> buffer;
+  int file_end_size = magic_size + sizeof(int32_t);
+  RETURN_NOT_OK(file_->ReadAt(footer_offset_ - file_end_size, file_end_size, &buffer));
+
+  if (memcmp(buffer->data() + sizeof(int32_t), kArrowMagicBytes, magic_size)) {
+    return Status::Invalid("Not an Arrow file");
+  }
+
+  int32_t footer_length = *reinterpret_cast<const int32_t*>(buffer->data());
+
+  if (footer_length <= 0 || footer_length + magic_size * 2 + 4 > footer_offset_) {
+    return Status::Invalid("File is smaller than indicated metadata size");
+  }
+
+  // Now read the footer
+  RETURN_NOT_OK(file_->ReadAt(
+      footer_offset_ - footer_length - file_end_size, footer_length, &buffer));
+  RETURN_NOT_OK(FileFooter::Open(buffer, &footer_));
+
+  // Get the schema
+  return footer_->GetSchema(&schema_);
+}
+
+const std::shared_ptr<Schema>& FileReader::schema() const {
+  return schema_;
+}
+
+int FileReader::num_dictionaries() const {
+  return footer_->num_dictionaries();
+}
+
+int FileReader::num_record_batches() const {
+  return footer_->num_record_batches();
+}
+
+MetadataVersion::type FileReader::version() const {
+  return footer_->version();
+}
+
+Status FileReader::GetRecordBatch(int i, std::shared_ptr<RecordBatch>* batch) {
+  DCHECK_GE(i, 0);
+  DCHECK_LT(i, num_record_batches());
+  FileBlock block = footer_->record_batch(i);
+  int64_t metadata_end_offset = block.offset + block.body_length + block.metadata_length;
+
+  std::shared_ptr<RecordBatchReader> reader;
+  RETURN_NOT_OK(RecordBatchReader::Open(file_.get(), metadata_end_offset, &reader));
+
+  return reader->GetRecordBatch(schema_, batch);
+}
+
+}  // namespace ipc
+}  // namespace arrow
diff --git a/cpp/src/arrow/ipc/file.h b/cpp/src/arrow/ipc/file.h
new file mode 100644
index 00000000000..4b79c98281b
--- /dev/null
+++ b/cpp/src/arrow/ipc/file.h
@@ -0,0 +1,146 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
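Before the header that declares these classes, a sketch of how the writer and reader fit together, modeled on the round-trip test added later in this patch. This is a hedged illustration rather than library documentation: error handling is pared down, and the in-memory source class name (io::BufferReader) is an assumption here, since the test constructs it only from a data pointer and size:

#include <memory>

#include "arrow/io/memory.h"
#include "arrow/ipc/file.h"
#include "arrow/util/status.h"

// Write one record batch into an in-memory buffer, then read it back.
arrow::Status RoundTripSketch(
    const std::shared_ptr<arrow::RecordBatch>& batch,
    const std::shared_ptr<arrow::ResizableBuffer>& buffer) {
  namespace io = arrow::io;
  namespace ipc = arrow::ipc;

  io::BufferOutputStream sink(buffer);

  std::shared_ptr<ipc::FileWriter> writer;
  RETURN_NOT_OK(ipc::FileWriter::Open(&sink, batch->schema(), &writer));
  RETURN_NOT_OK(writer->WriteRecordBatch(batch->columns(), batch->num_rows()));
  RETURN_NOT_OK(writer->Close());

  // The stream position after Close() marks the end of the Arrow file; the
  // buffer itself may be larger than what was written, so pass the offset.
  int64_t footer_offset;
  RETURN_NOT_OK(sink.Tell(&footer_offset));

  // Assumed in-memory ReadableFileInterface over raw bytes.
  auto source = std::make_shared<io::BufferReader>(buffer->data(), buffer->size());

  std::shared_ptr<ipc::FileReader> reader;
  RETURN_NOT_OK(ipc::FileReader::Open(source, footer_offset, &reader));

  std::shared_ptr<arrow::RecordBatch> result;
  return reader->GetRecordBatch(0, &result);
}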
+
+// Implement Arrow file layout for IPC/RPC purposes and short-lived storage
+
+#ifndef ARROW_IPC_FILE_H
+#define ARROW_IPC_FILE_H
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "arrow/ipc/metadata.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+class Array;
+class Buffer;
+struct Field;
+class RecordBatch;
+class Schema;
+class Status;
+
+namespace io {
+
+class OutputStream;
+class ReadableFileInterface;
+
+}  // namespace io
+
+namespace ipc {
+
+class ARROW_EXPORT FileWriter {
+ public:
+  static Status Open(io::OutputStream* sink, const std::shared_ptr<Schema>& schema,
+      std::shared_ptr<FileWriter>* out);
+
+  // TODO(wesm): Write dictionaries
+
+  Status WriteRecordBatch(
+      const std::vector<std::shared_ptr<Array>>& columns, int32_t num_rows);
+
+  Status Close();
+
+ private:
+  FileWriter(io::OutputStream* sink, const std::shared_ptr<Schema>& schema);
+
+  Status CheckStarted();
+  Status Start();
+
+  Status UpdatePosition();
+
+  // Adds padding bytes if necessary to ensure all memory blocks are written on
+  // 8-byte boundaries.
+  Status Align();
+
+  // Write data and update position
+  Status Write(const uint8_t* data, int64_t nbytes);
+
+  // Write and align
+  Status WriteAligned(const uint8_t* data, int64_t nbytes);
+
+  io::OutputStream* sink_;
+  std::shared_ptr<Schema> schema_;
+  int64_t position_;
+  bool started_;
+
+  std::vector<FileBlock> dictionaries_;
+  std::vector<FileBlock> record_batches_;
+};
+
+class ARROW_EXPORT FileReader {
+ public:
+  ~FileReader();
+
+  // Open a file-like object that is assumed to be self-contained; i.e., the
+  // end of the file interface is the end of the Arrow file. Note that there
+  // can be any amount of data preceding the Arrow-formatted data, because we
+  // need only locate the end of the Arrow file stream to discover the metadata
+  // and then proceed to read the data into memory.
+  static Status Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+      std::shared_ptr<FileReader>* reader);
+
+  // If the file is embedded within some larger file or memory region, you can
+  // pass the absolute memory offset to the end of the file (which contains the
+  // metadata footer). The metadata must have been written with memory offsets
+  // relative to the start of the containing file
+  //
+  // @param file: the data source
+  // @param footer_offset: the position of the end of the Arrow "file"
+  static Status Open(const std::shared_ptr<io::ReadableFileInterface>& file,
+      int64_t footer_offset, std::shared_ptr<FileReader>* reader);
+
+  const std::shared_ptr<Schema>& schema() const;
+
+  // Shared dictionaries for dictionary-encoding across record batches
+  // TODO(wesm): Implement dictionary reading when we also have dictionary
+  // encoding
+  int num_dictionaries() const;
+
+  int num_record_batches() const;
+
+  MetadataVersion::type version() const;
+
+  // Read a record batch from the file. Does not copy memory if the input
+  // source supports zero-copy.
+  //
+  // TODO(wesm): Make the copy/zero-copy behavior configurable (e.g. provide an
+  // "always copy" option)
+  Status GetRecordBatch(int i, std::shared_ptr<RecordBatch>* batch);
+
+ private:
+  FileReader(
+      const std::shared_ptr<io::ReadableFileInterface>& file, int64_t footer_offset);
+
+  Status ReadFooter();
+
+  std::shared_ptr<io::ReadableFileInterface> file_;
+
+  // The location where the Arrow file layout ends. May be the end of the file
+  // or some other location if embedded in a larger file.
+ int64_t footer_offset_; + + std::unique_ptr footer_; + std::shared_ptr schema_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_FILE_H diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index ca4d0152b90..f5611d4840c 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -43,31 +43,27 @@ namespace arrow { namespace ipc { -// TODO(emkornfield) convert to google style kInt32, etc? -const auto INT32 = std::make_shared(); -const auto LIST_INT32 = std::make_shared(INT32); -const auto LIST_LIST_INT32 = std::make_shared(LIST_INT32); - -typedef Status MakeRowBatch(std::shared_ptr* out); - -class TestWriteRowBatch : public ::testing::TestWithParam, - public io::MemoryMapFixture { +class TestWriteRecordBatch : public ::testing::TestWithParam, + public io::MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } void TearDown() { io::MemoryMapFixture::TearDown(); } - Status RoundTripHelper(const RowBatch& batch, int memory_map_size, - std::shared_ptr* batch_result) { + Status RoundTripHelper(const RecordBatch& batch, int memory_map_size, + std::shared_ptr* batch_result) { std::string path = "test-write-row-batch"; io::MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); - int64_t header_location; - RETURN_NOT_OK(WriteRowBatch(mmap_.get(), &batch, &header_location)); + int64_t body_end_offset; + int64_t header_end_offset; - std::shared_ptr reader; - RETURN_NOT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); + RETURN_NOT_OK(WriteRecordBatch(batch.columns(), batch.num_rows(), mmap_.get(), + &body_end_offset, &header_end_offset)); - RETURN_NOT_OK(reader->GetRowBatch(batch.schema(), batch_result)); + std::shared_ptr reader; + RETURN_NOT_OK(RecordBatchReader::Open(mmap_.get(), header_end_offset, &reader)); + + RETURN_NOT_OK(reader->GetRecordBatch(batch.schema(), batch_result)); return Status::OK(); } @@ -76,10 +72,10 @@ class TestWriteRowBatch : public ::testing::TestWithParam, MemoryPool* pool_; }; -TEST_P(TestWriteRowBatch, RoundTrip) { - std::shared_ptr batch; +TEST_P(TestWriteRecordBatch, RoundTrip) { + std::shared_ptr batch; ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue - std::shared_ptr batch_result; + std::shared_ptr batch_result; ASSERT_OK(RoundTripHelper(*batch, 1 << 16, &batch_result)); // do checks @@ -93,217 +89,39 @@ TEST_P(TestWriteRowBatch, RoundTrip) { } } -Status MakeIntRowBatch(std::shared_ptr* out) { - const int length = 1000; - - // Make the schema - auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", INT32); - std::shared_ptr schema(new Schema({f0, f1})); - - // Example data - std::shared_ptr a0, a1; - MemoryPool* pool = default_memory_pool(); - RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0)); - RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1)); - out->reset(new RowBatch(schema, length, {a0, a1})); - return Status::OK(); -} +INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRecordBatch, + ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, + &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeStringTypesRecordBatch, &MakeStruct)); -template -Status MakeRandomBinaryArray( - const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { - const std::vector values = { - "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; - Builder builder(pool, type); - const auto values_len = values.size(); - for (int32_t i = 0; i < 
length; ++i) { - int values_index = i % values_len; - if (values_index == 0) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - const std::string& value = values[values_index]; - RETURN_NOT_OK( - builder.Append(reinterpret_cast(value.data()), value.size())); - } - } - *array = builder.Finish(); - return Status::OK(); -} - -Status MakeStringTypesRowBatch(std::shared_ptr* out) { - const int32_t length = 500; - auto string_type = std::make_shared(); - auto binary_type = std::make_shared(); - auto f0 = std::make_shared("f0", string_type); - auto f1 = std::make_shared("f1", binary_type); - std::shared_ptr schema(new Schema({f0, f1})); - - std::shared_ptr a0, a1; - MemoryPool* pool = default_memory_pool(); - - { - auto status = - MakeRandomBinaryArray(string_type, length, pool, &a0); - RETURN_NOT_OK(status); - } - { - auto status = - MakeRandomBinaryArray(binary_type, length, pool, &a1); - RETURN_NOT_OK(status); - } - out->reset(new RowBatch(schema, length, {a0, a1})); - return Status::OK(); -} - -Status MakeListRowBatch(std::shared_ptr* out) { - // Make the schema - auto f0 = std::make_shared("f0", LIST_INT32); - auto f1 = std::make_shared("f1", LIST_LIST_INT32); - auto f2 = std::make_shared("f2", INT32); - std::shared_ptr schema(new Schema({f0, f1, f2})); - - // Example data - - MemoryPool* pool = default_memory_pool(); - const int length = 200; - std::shared_ptr leaf_values, list_array, list_list_array, flat_array; - const bool include_nulls = true; - RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); - RETURN_NOT_OK( - MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); - RETURN_NOT_OK( - MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); - RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); - out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); - return Status::OK(); -} - -Status MakeZeroLengthRowBatch(std::shared_ptr* out) { - // Make the schema - auto f0 = std::make_shared("f0", LIST_INT32); - auto f1 = std::make_shared("f1", LIST_LIST_INT32); - auto f2 = std::make_shared("f2", INT32); - std::shared_ptr schema(new Schema({f0, f1, f2})); - - // Example data - MemoryPool* pool = default_memory_pool(); - const int length = 200; - const bool include_nulls = true; - std::shared_ptr leaf_values, list_array, list_list_array, flat_array; - RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); - RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); - RETURN_NOT_OK( - MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); - RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); - out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); - return Status::OK(); -} - -Status MakeNonNullRowBatch(std::shared_ptr* out) { - // Make the schema - auto f0 = std::make_shared("f0", LIST_INT32); - auto f1 = std::make_shared("f1", LIST_LIST_INT32); - auto f2 = std::make_shared("f2", INT32); - std::shared_ptr schema(new Schema({f0, f1, f2})); - - // Example data - MemoryPool* pool = default_memory_pool(); - const int length = 50; - std::shared_ptr leaf_values, list_array, list_list_array, flat_array; - - RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); - bool include_nulls = false; - RETURN_NOT_OK( - MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); - RETURN_NOT_OK( - MakeRandomListArray(list_array, length, 
include_nulls, pool, &list_list_array)); - RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); - out->reset(new RowBatch(schema, length, {list_array, list_list_array, flat_array})); - return Status::OK(); -} - -Status MakeDeeplyNestedList(std::shared_ptr* out) { - const int batch_length = 5; - TypePtr type = INT32; - - MemoryPool* pool = default_memory_pool(); - ArrayPtr array; - const bool include_nulls = true; - RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); - for (int i = 0; i < 63; ++i) { - type = std::static_pointer_cast(std::make_shared(type)); - RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); - } - - auto f0 = std::make_shared("f0", type); - std::shared_ptr schema(new Schema({f0})); - std::vector arrays = {array}; - out->reset(new RowBatch(schema, batch_length, arrays)); - return Status::OK(); -} - -Status MakeStruct(std::shared_ptr* out) { - // reuse constructed list columns - std::shared_ptr list_batch; - RETURN_NOT_OK(MakeListRowBatch(&list_batch)); - std::vector columns = { - list_batch->column(0), list_batch->column(1), list_batch->column(2)}; - auto list_schema = list_batch->schema(); - - // Define schema - std::shared_ptr type(new StructType( - {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); - auto f0 = std::make_shared("non_null_struct", type); - auto f1 = std::make_shared("null_struct", type); - std::shared_ptr schema(new Schema({f0, f1})); - - // construct individual nullable/non-nullable struct arrays - ArrayPtr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); - std::vector null_bytes(list_batch->num_rows(), 1); - null_bytes[0] = 0; - std::shared_ptr null_bitmask; - RETURN_NOT_OK(util::bytes_to_bits(null_bytes, &null_bitmask)); - ArrayPtr with_nulls( - new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask)); - - // construct batch - std::vector arrays = {no_nulls, with_nulls}; - out->reset(new RowBatch(schema, list_batch->num_rows(), arrays)); - return Status::OK(); -} - -INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, - ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, - &MakeZeroLengthRowBatch, &MakeDeeplyNestedList, - &MakeStringTypesRowBatch, &MakeStruct)); - -void TestGetRowBatchSize(std::shared_ptr batch) { +void TestGetRecordBatchSize(std::shared_ptr batch) { ipc::MockOutputStream mock; - int64_t mock_header_location = -1; + int64_t mock_header_offset = -1; + int64_t mock_body_offset = -1; int64_t size = -1; - ASSERT_OK(WriteRowBatch(&mock, batch.get(), &mock_header_location)); - ASSERT_OK(GetRowBatchSize(batch.get(), &size)); + ASSERT_OK(WriteRecordBatch(batch->columns(), batch->num_rows(), &mock, + &mock_body_offset, &mock_header_offset)); + ASSERT_OK(GetRecordBatchSize(batch.get(), &size)); ASSERT_EQ(mock.GetExtentBytesWritten(), size); } -TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { - std::shared_ptr batch; +TEST_F(TestWriteRecordBatch, IntegerGetRecordBatchSize) { + std::shared_ptr batch; - ASSERT_OK(MakeIntRowBatch(&batch)); - TestGetRowBatchSize(batch); + ASSERT_OK(MakeIntRecordBatch(&batch)); + TestGetRecordBatchSize(batch); - ASSERT_OK(MakeListRowBatch(&batch)); - TestGetRowBatchSize(batch); + ASSERT_OK(MakeListRecordBatch(&batch)); + TestGetRecordBatchSize(batch); - ASSERT_OK(MakeZeroLengthRowBatch(&batch)); - TestGetRowBatchSize(batch); + ASSERT_OK(MakeZeroLengthRecordBatch(&batch)); + TestGetRecordBatchSize(batch); - ASSERT_OK(MakeNonNullRowBatch(&batch)); - 
TestGetRowBatchSize(batch); + ASSERT_OK(MakeNonNullRecordBatch(&batch)); + TestGetRecordBatchSize(batch); ASSERT_OK(MakeDeeplyNestedList(&batch)); - TestGetRowBatchSize(batch); + TestGetRecordBatchSize(batch); } class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { @@ -314,7 +132,7 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { Status WriteToMmap(int recursion_level, bool override_level, int64_t* header_out = nullptr, std::shared_ptr* schema_out = nullptr) { const int batch_length = 5; - TypePtr type = INT32; + TypePtr type = kInt32; ArrayPtr array; const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool_, &array)); @@ -328,18 +146,22 @@ class RecursionLimits : public ::testing::Test, public io::MemoryMapFixture { std::shared_ptr schema(new Schema({f0})); if (schema_out != nullptr) { *schema_out = schema; } std::vector arrays = {array}; - auto batch = std::make_shared(schema, batch_length, arrays); + auto batch = std::make_shared(schema, batch_length, arrays); std::string path = "test-write-past-max-recursion"; const int memory_map_size = 1 << 16; io::MemoryMapFixture::InitMemoryMap(memory_map_size, path, &mmap_); - int64_t header_location; - int64_t* header_out_param = header_out == nullptr ? &header_location : header_out; + + int64_t body_offset; + int64_t header_offset; + + int64_t* header_out_param = header_out == nullptr ? &header_offset : header_out; if (override_level) { - return WriteRowBatch( - mmap_.get(), batch.get(), header_out_param, recursion_level + 1); + return WriteRecordBatch(batch->columns(), batch->num_rows(), mmap_.get(), + &body_offset, header_out_param, recursion_level + 1); } else { - return WriteRowBatch(mmap_.get(), batch.get(), header_out_param); + return WriteRecordBatch(batch->columns(), batch->num_rows(), mmap_.get(), + &body_offset, header_out_param); } } @@ -353,14 +175,14 @@ TEST_F(RecursionLimits, WriteLimit) { } TEST_F(RecursionLimits, ReadLimit) { - int64_t header_location = -1; + int64_t header_offset = -1; std::shared_ptr schema; - ASSERT_OK(WriteToMmap(64, true, &header_location, &schema)); + ASSERT_OK(WriteToMmap(64, true, &header_offset, &schema)); - std::shared_ptr reader; - ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &reader)); - std::shared_ptr batch_result; - ASSERT_RAISES(Invalid, reader->GetRowBatch(schema, &batch_result)); + std::shared_ptr reader; + ASSERT_OK(RecordBatchReader::Open(mmap_.get(), header_offset, &reader)); + std::shared_ptr batch_result; + ASSERT_RAISES(Invalid, reader->GetRecordBatch(schema, &batch_result)); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/ipc-file-test.cc b/cpp/src/arrow/ipc/ipc-file-test.cc new file mode 100644 index 00000000000..cd424bf385c --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-file-test.cc @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/io/memory.h" +#include "arrow/io/test-common.h" +#include "arrow/ipc/adapter.h" +#include "arrow/ipc/file.h" +#include "arrow/ipc/test-common.h" +#include "arrow/ipc/util.h" + +#include "arrow/test-util.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +class TestFileFormat : public ::testing::TestWithParam { + public: + void SetUp() { + pool_ = default_memory_pool(); + buffer_ = std::make_shared(pool_); + sink_.reset(new io::BufferOutputStream(buffer_)); + } + void TearDown() {} + + Status RoundTripHelper( + const RecordBatch& batch, std::vector>* out_batches) { + // Write the file + RETURN_NOT_OK(FileWriter::Open(sink_.get(), batch.schema(), &file_writer_)); + int num_batches = 3; + for (int i = 0; i < num_batches; ++i) { + RETURN_NOT_OK(file_writer_->WriteRecordBatch(batch.columns(), batch.num_rows())); + } + RETURN_NOT_OK(file_writer_->Close()); + + // Current offset into stream is the end of the file + int64_t footer_offset; + RETURN_NOT_OK(sink_->Tell(&footer_offset)); + + // Open the file + auto reader = std::make_shared(buffer_->data(), buffer_->size()); + RETURN_NOT_OK(FileReader::Open(reader, footer_offset, &file_reader_)); + + EXPECT_EQ(num_batches, file_reader_->num_record_batches()); + + out_batches->resize(num_batches); + for (int i = 0; i < num_batches; ++i) { + RETURN_NOT_OK(file_reader_->GetRecordBatch(i, &(*out_batches)[i])); + } + + return Status::OK(); + } + + void CompareBatch(const RecordBatch* left, const RecordBatch* right) { + ASSERT_TRUE(left->schema()->Equals(right->schema())); + ASSERT_EQ(left->num_columns(), right->num_columns()) + << left->schema()->ToString() << " result: " << right->schema()->ToString(); + EXPECT_EQ(left->num_rows(), right->num_rows()); + for (int i = 0; i < left->num_columns(); ++i) { + EXPECT_TRUE(left->column(i)->Equals(right->column(i))) + << "Idx: " << i << " Name: " << left->column_name(i); + } + } + + protected: + MemoryPool* pool_; + + std::unique_ptr sink_; + std::shared_ptr buffer_; + + std::shared_ptr file_writer_; + std::shared_ptr file_reader_; +}; + +TEST_P(TestFileFormat, RoundTrip) { + std::shared_ptr batch; + ASSERT_OK((*GetParam())(&batch)); // NOLINT clang-tidy gtest issue + + std::vector> out_batches; + + ASSERT_OK(RoundTripHelper(*batch, &out_batches)); + + // Compare batches. 
Same + for (size_t i = 0; i < out_batches.size(); ++i) { + CompareBatch(batch.get(), out_batches[i].get()); + } +} + +INSTANTIATE_TEST_CASE_P(RoundTripTests, TestFileFormat, + ::testing::Values(&MakeIntRecordBatch, &MakeListRecordBatch, &MakeNonNullRecordBatch, + &MakeZeroLengthRecordBatch, &MakeDeeplyNestedList, + &MakeStringTypesRecordBatch, &MakeStruct)); + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/ipc-metadata-test.cc b/cpp/src/arrow/ipc/ipc-metadata-test.cc index 51d79cfb4c4..1dc39692332 100644 --- a/cpp/src/arrow/ipc/ipc-metadata-test.cc +++ b/cpp/src/arrow/ipc/ipc-metadata-test.cc @@ -21,6 +21,7 @@ #include "gtest/gtest.h" +#include "arrow/io/memory.h" #include "arrow/ipc/metadata.h" #include "arrow/schema.h" #include "arrow/test-util.h" @@ -31,6 +32,8 @@ namespace arrow { class Buffer; +namespace ipc { + static inline void assert_schema_equal(const Schema* lhs, const Schema* rhs) { if (!lhs->Equals(*rhs)) { std::stringstream ss; @@ -46,14 +49,14 @@ class TestSchemaMessage : public ::testing::Test { void CheckRoundtrip(const Schema* schema) { std::shared_ptr buffer; - ASSERT_OK(ipc::WriteSchema(schema, &buffer)); + ASSERT_OK(WriteSchema(schema, &buffer)); - std::shared_ptr message; - ASSERT_OK(ipc::Message::Open(buffer, &message)); + std::shared_ptr message; + ASSERT_OK(Message::Open(buffer, &message)); - ASSERT_EQ(ipc::Message::SCHEMA, message->type()); + ASSERT_EQ(Message::SCHEMA, message->type()); - std::shared_ptr schema_msg = message->GetSchema(); + std::shared_ptr schema_msg = message->GetSchema(); ASSERT_EQ(schema->num_fields(), schema_msg->num_fields()); std::shared_ptr schema2; @@ -94,4 +97,68 @@ TEST_F(TestSchemaMessage, NestedFields) { CheckRoundtrip(&schema); } +class TestFileFooter : public ::testing::Test { + public: + void SetUp() {} + + void CheckRoundtrip(const Schema* schema, const std::vector& dictionaries, + const std::vector& record_batches) { + auto buffer = std::make_shared(); + io::BufferOutputStream stream(buffer); + + ASSERT_OK(WriteFileFooter(schema, dictionaries, record_batches, &stream)); + + std::unique_ptr footer; + ASSERT_OK(FileFooter::Open(buffer, &footer)); + + ASSERT_EQ(MetadataVersion::V1_SNAPSHOT, footer->version()); + + // Check schema + std::shared_ptr schema2; + ASSERT_OK(footer->GetSchema(&schema2)); + assert_schema_equal(schema, schema2.get()); + + // Check blocks + ASSERT_EQ(dictionaries.size(), footer->num_dictionaries()); + ASSERT_EQ(record_batches.size(), footer->num_record_batches()); + + for (int i = 0; i < footer->num_dictionaries(); ++i) { + CheckBlocks(dictionaries[i], footer->dictionary(i)); + } + + for (int i = 0; i < footer->num_record_batches(); ++i) { + CheckBlocks(record_batches[i], footer->record_batch(i)); + } + } + + void CheckBlocks(const FileBlock& left, const FileBlock& right) { + ASSERT_EQ(left.offset, right.offset); + ASSERT_EQ(left.metadata_length, right.metadata_length); + ASSERT_EQ(left.body_length, right.body_length); + } + + private: + std::shared_ptr example_schema_; +}; + +TEST_F(TestFileFooter, Basics) { + auto f0 = std::make_shared("f0", std::make_shared()); + auto f1 = std::make_shared("f1", std::make_shared()); + Schema schema({f0, f1}); + + std::vector dictionaries; + dictionaries.emplace_back(8, 92, 900); + dictionaries.emplace_back(1000, 100, 1900); + dictionaries.emplace_back(3000, 100, 2900); + + std::vector record_batches; + record_batches.emplace_back(6000, 100, 900); + record_batches.emplace_back(7000, 100, 1900); + record_batches.emplace_back(9000, 100, 2900); + 
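+ // Each FileBlock is an (offset, metadata_length, body_length) triple; the
+ // numbers above and below are arbitrary and only need to survive the
+ // footer round-trip intact.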
record_batches.emplace_back(12000, 100, 3900); + + CheckRoundtrip(&schema, dictionaries, record_batches); +} + +} // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 05e9c7ad4d3..7102012c29a 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -31,10 +31,6 @@ #include "arrow/util/buffer.h" #include "arrow/util/status.h" -typedef flatbuffers::FlatBufferBuilder FBB; -typedef flatbuffers::Offset FieldOffset; -typedef flatbuffers::Offset Offset; - namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; @@ -52,6 +48,8 @@ const std::shared_ptr UINT32 = std::make_shared(); const std::shared_ptr UINT64 = std::make_shared(); const std::shared_ptr FLOAT = std::make_shared(); const std::shared_ptr DOUBLE = std::make_shared(); +const std::shared_ptr STRING = std::make_shared(); +const std::shared_ptr BINARY = std::make_shared(); static Status IntFromFlatbuffer( const flatbuf::Int* int_data, std::shared_ptr* out) { @@ -102,8 +100,11 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, return FloatFromFlatuffer( static_cast(type_data), out); case flatbuf::Type_Binary: + *out = BINARY; + return Status::OK(); case flatbuf::Type_Utf8: - return Status::NotImplemented("Type is not implemented"); + *out = STRING; + return Status::OK(); case flatbuf::Type_Bool: *out = BOOL; return Status::OK(); @@ -193,6 +194,14 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, *out_type = flatbuf::Type_FloatingPoint; *offset = FloatToFlatbuffer(fbb, flatbuf::Precision_DOUBLE); break; + case Type::BINARY: + *out_type = flatbuf::Type_Binary; + *offset = flatbuf::CreateBinary(fbb).Union(); + break; + case Type::STRING: + *out_type = flatbuf::Type_Utf8; + *offset = flatbuf::CreateUtf8(fbb).Union(); + break; case Type::LIST: *out_type = flatbuf::Type_List; return ListToFlatbuffer(fbb, type, children, offset); @@ -255,19 +264,26 @@ flatbuf::Endianness endianness() { return bint.c[0] == 1 ? 
flatbuf::Endianness_Big : flatbuf::Endianness_Little; } -Status MessageBuilder::SetSchema(const Schema* schema) { - header_type_ = flatbuf::MessageHeader_Schema; - +Status SchemaToFlatbuffer( + FBB& fbb, const Schema* schema, flatbuffers::Offset* out) { std::vector field_offsets; for (int i = 0; i < schema->num_fields(); ++i) { const std::shared_ptr& field = schema->field(i); FieldOffset offset; - RETURN_NOT_OK(FieldToFlatbuffer(fbb_, field, &offset)); + RETURN_NOT_OK(FieldToFlatbuffer(fbb, field, &offset)); field_offsets.push_back(offset); } - header_ = - flatbuf::CreateSchema(fbb_, endianness(), fbb_.CreateVector(field_offsets)).Union(); + *out = flatbuf::CreateSchema(fbb, endianness(), fbb.CreateVector(field_offsets)); + return Status::OK(); +} + +Status MessageBuilder::SetSchema(const Schema* schema) { + flatbuffers::Offset fb_schema; + RETURN_NOT_OK(SchemaToFlatbuffer(fbb_, schema, &fb_schema)); + + header_type_ = flatbuf::MessageHeader_Schema; + header_ = fb_schema.Union(); body_length_ = 0; return Status::OK(); } @@ -301,17 +317,17 @@ Status MessageBuilder::Finish() { } Status MessageBuilder::GetBuffer(std::shared_ptr* out) { - // The message buffer is prefixed by the size of the complete flatbuffer as + // The message buffer is suffixed by the size of the complete flatbuffer as // int32_t - // + // int32_t size = fbb_.GetSize(); auto result = std::make_shared(); RETURN_NOT_OK(result->Resize(size + sizeof(int32_t))); uint8_t* dst = result->mutable_data(); - memcpy(dst, reinterpret_cast(&size), sizeof(int32_t)); - memcpy(dst + sizeof(int32_t), fbb_.GetBufferPointer(), size); + memcpy(dst, fbb_.GetBufferPointer(), size); + memcpy(dst + size, reinterpret_cast(&size), sizeof(int32_t)); *out = result; return Status::OK(); diff --git a/cpp/src/arrow/ipc/metadata-internal.h b/cpp/src/arrow/ipc/metadata-internal.h index d38df840ba0..c404cfde22c 100644 --- a/cpp/src/arrow/ipc/metadata-internal.h +++ b/cpp/src/arrow/ipc/metadata-internal.h @@ -24,7 +24,9 @@ #include "flatbuffers/flatbuffers.h" +#include "arrow/ipc/File_generated.h" #include "arrow/ipc/Message_generated.h" +#include "arrow/ipc/metadata.h" namespace arrow { @@ -37,11 +39,18 @@ class Status; namespace ipc { +using FBB = flatbuffers::FlatBufferBuilder; +using FieldOffset = flatbuffers::Offset; +using Offset = flatbuffers::Offset; + static constexpr flatbuf::MetadataVersion kMetadataVersion = flatbuf::MetadataVersion_V1_SNAPSHOT; Status FieldFromFlatbuffer(const flatbuf::Field* field, std::shared_ptr* out); +Status SchemaToFlatbuffer( + FBB& fbb, const Schema* schema, flatbuffers::Offset* out); + class MessageBuilder { public: Status SetSchema(const Schema* schema); diff --git a/cpp/src/arrow/ipc/metadata.cc b/cpp/src/arrow/ipc/metadata.cc index e510755110e..66df8a6711f 100644 --- a/cpp/src/arrow/ipc/metadata.cc +++ b/cpp/src/arrow/ipc/metadata.cc @@ -23,7 +23,8 @@ #include "flatbuffers/flatbuffers.h" -// Generated C++ flatbuffer IDL +#include "arrow/io/interfaces.h" +#include "arrow/ipc/File_generated.h" #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/metadata-internal.h" @@ -47,9 +48,10 @@ Status WriteSchema(const Schema* schema, std::shared_ptr* out) { //---------------------------------------------------------------------- // Message reader -class Message::Impl { +class Message::MessageImpl { public: - explicit Impl(const std::shared_ptr& buffer, const flatbuf::Message* message) + explicit MessageImpl( + const std::shared_ptr& buffer, const flatbuf::Message* message) : buffer_(buffer), message_(message) {} 
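+ // Note: MessageBuilder::GetBuffer() above now appends the int32 size after
+ // the flatbuffer instead of prefixing it, so the flatbuffer root starts at
+ // buffer->data() and Message::Open() can hand that pointer straight to
+ // flatbuf::GetMessage() with no 4-byte offset to skip.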
Message::Type type() const { @@ -76,31 +78,16 @@ class Message::Impl { const flatbuf::Message* message_; }; -class SchemaMessage::Impl { - public: - explicit Impl(const void* schema) - : schema_(static_cast(schema)) {} - - const flatbuf::Field* field(int i) const { return schema_->fields()->Get(i); } - - int num_fields() const { return schema_->fields()->size(); } - - private: - const flatbuf::Schema* schema_; -}; - Message::Message() {} Status Message::Open( const std::shared_ptr& buffer, std::shared_ptr* out) { std::shared_ptr result(new Message()); - // The buffer is prefixed by its size as int32_t - const uint8_t* fb_head = buffer->data() + sizeof(int32_t); - const flatbuf::Message* message = flatbuf::GetMessage(fb_head); + const flatbuf::Message* message = flatbuf::GetMessage(buffer->data()); // TODO(wesm): verify message - result->impl_.reset(new Impl(buffer, message)); + result->impl_.reset(new MessageImpl(buffer, message)); *out = result; return Status::OK(); @@ -122,10 +109,26 @@ std::shared_ptr Message::GetSchema() { return std::make_shared(this->shared_from_this(), impl_->header()); } +// ---------------------------------------------------------------------- +// SchemaMessage + +class SchemaMessage::SchemaMessageImpl { + public: + explicit SchemaMessageImpl(const void* schema) + : schema_(static_cast(schema)) {} + + const flatbuf::Field* field(int i) const { return schema_->fields()->Get(i); } + + int num_fields() const { return schema_->fields()->size(); } + + private: + const flatbuf::Schema* schema_; +}; + SchemaMessage::SchemaMessage( const std::shared_ptr& message, const void* schema) { message_ = message; - impl_.reset(new Impl(schema)); + impl_.reset(new SchemaMessageImpl(schema)); } int SchemaMessage::num_fields() const { @@ -146,9 +149,12 @@ Status SchemaMessage::GetSchema(std::shared_ptr* out) const { return Status::OK(); } -class RecordBatchMessage::Impl { +// ---------------------------------------------------------------------- +// RecordBatchMessage + +class RecordBatchMessage::RecordBatchMessageImpl { public: - explicit Impl(const void* batch) + explicit RecordBatchMessageImpl(const void* batch) : batch_(static_cast(batch)) { nodes_ = batch_->nodes(); buffers_ = batch_->buffers(); @@ -177,7 +183,7 @@ std::shared_ptr Message::GetRecordBatch() { RecordBatchMessage::RecordBatchMessage( const std::shared_ptr& message, const void* batch) { message_ = message; - impl_.reset(new Impl(batch)); + impl_.reset(new RecordBatchMessageImpl(batch)); } // TODO(wesm): Copying the flatbuffer data isn't great, but this will do for @@ -213,5 +219,122 @@ int RecordBatchMessage::num_fields() const { return impl_->num_fields(); } +// ---------------------------------------------------------------------- +// File footer + +static flatbuffers::Offset> +FileBlocksToFlatbuffer(FBB& fbb, const std::vector& blocks) { + std::vector fb_blocks; + + for (const FileBlock& block : blocks) { + fb_blocks.emplace_back(block.offset, block.metadata_length, block.body_length); + } + + return fbb.CreateVectorOfStructs(fb_blocks); +} + +Status WriteFileFooter(const Schema* schema, const std::vector& dictionaries, + const std::vector& record_batches, io::OutputStream* out) { + FBB fbb; + + flatbuffers::Offset fb_schema; + RETURN_NOT_OK(SchemaToFlatbuffer(fbb, schema, &fb_schema)); + + auto fb_dictionaries = FileBlocksToFlatbuffer(fbb, dictionaries); + auto fb_record_batches = FileBlocksToFlatbuffer(fbb, record_batches); + + auto footer = flatbuf::CreateFooter( + fbb, kMetadataVersion, fb_schema, 
fb_dictionaries, fb_record_batches); + + fbb.Finish(footer); + + int32_t size = fbb.GetSize(); + + return out->Write(fbb.GetBufferPointer(), size); +} + +static inline FileBlock FileBlockFromFlatbuffer(const flatbuf::Block* block) { + return FileBlock(block->offset(), block->metaDataLength(), block->bodyLength()); +} + +class FileFooter::FileFooterImpl { + public: + FileFooterImpl(const std::shared_ptr& buffer, const flatbuf::Footer* footer) + : buffer_(buffer), footer_(footer) {} + + int num_dictionaries() const { return footer_->dictionaries()->size(); } + + int num_record_batches() const { return footer_->recordBatches()->size(); } + + MetadataVersion::type version() const { + switch (footer_->version()) { + case flatbuf::MetadataVersion_V1_SNAPSHOT: + return MetadataVersion::V1_SNAPSHOT; + // Add cases as other versions become available + default: + return MetadataVersion::V1_SNAPSHOT; + } + } + + FileBlock record_batch(int i) const { + return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i)); + } + + FileBlock dictionary(int i) const { + return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i)); + } + + Status GetSchema(std::shared_ptr* out) const { + auto schema_msg = std::make_shared(nullptr, footer_->schema()); + return schema_msg->GetSchema(out); + } + + private: + // Retain reference to memory + std::shared_ptr buffer_; + + const flatbuf::Footer* footer_; +}; + +FileFooter::FileFooter() {} + +FileFooter::~FileFooter() {} + +Status FileFooter::Open( + const std::shared_ptr& buffer, std::unique_ptr* out) { + const flatbuf::Footer* footer = flatbuf::GetFooter(buffer->data()); + + *out = std::unique_ptr(new FileFooter()); + + // TODO(wesm): Verify the footer + (*out)->impl_.reset(new FileFooterImpl(buffer, footer)); + + return Status::OK(); +} + +int FileFooter::num_dictionaries() const { + return impl_->num_dictionaries(); +} + +int FileFooter::num_record_batches() const { + return impl_->num_record_batches(); +} + +MetadataVersion::type FileFooter::version() const { + return impl_->version(); +} + +FileBlock FileFooter::record_batch(int i) const { + return impl_->record_batch(i); +} + +FileBlock FileFooter::dictionary(int i) const { + return impl_->dictionary(i); +} + +Status FileFooter::GetSchema(std::shared_ptr* out) const { + return impl_->GetSchema(out); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/metadata.h b/cpp/src/arrow/ipc/metadata.h index d5ec53317e6..2f0e853bf97 100644 --- a/cpp/src/arrow/ipc/metadata.h +++ b/cpp/src/arrow/ipc/metadata.h @@ -22,6 +22,7 @@ #include #include +#include #include "arrow/util/visibility.h" @@ -32,17 +33,24 @@ struct Field; class Schema; class Status; +namespace io { + +class OutputStream; + +} // namespace io + namespace ipc { +struct MetadataVersion { + enum type { V1_SNAPSHOT }; +}; + //---------------------------------------------------------------------- -// Message read/write APIs // Serialize arrow::Schema as a Flatbuffer ARROW_EXPORT Status WriteSchema(const Schema* schema, std::shared_ptr* out); -//---------------------------------------------------------------------- - // Read interface classes. 
We do not fully deserialize the flatbuffers so that // individual fields metadata can be retrieved from very large schema without // @@ -68,8 +76,8 @@ class ARROW_EXPORT SchemaMessage { // Parent, owns the flatbuffer data std::shared_ptr message_; - class Impl; - std::unique_ptr impl_; + class SchemaMessageImpl; + std::unique_ptr impl_; }; // Field metadata @@ -101,8 +109,8 @@ class ARROW_EXPORT RecordBatchMessage { // Parent, owns the flatbuffer data std::shared_ptr message_; - class Impl; - std::unique_ptr impl_; + class RecordBatchMessageImpl; + std::unique_ptr impl_; }; class ARROW_EXPORT DictionaryBatchMessage { @@ -133,8 +141,46 @@ class ARROW_EXPORT Message : public std::enable_shared_from_this { Message(); // Hide serialization details from user API - class Impl; - std::unique_ptr impl_; + class MessageImpl; + std::unique_ptr impl_; +}; + +// ---------------------------------------------------------------------- +// File footer for file-like representation + +struct FileBlock { + FileBlock(int64_t offset, int32_t metadata_length, int64_t body_length) + : offset(offset), metadata_length(metadata_length), body_length(body_length) {} + + int64_t offset; + int32_t metadata_length; + int64_t body_length; +}; + +ARROW_EXPORT +Status WriteFileFooter(const Schema* schema, const std::vector& dictionaries, + const std::vector& record_batches, io::OutputStream* out); + +class ARROW_EXPORT FileFooter { + public: + ~FileFooter(); + + static Status Open( + const std::shared_ptr& buffer, std::unique_ptr* out); + + int num_dictionaries() const; + int num_record_batches() const; + MetadataVersion::type version() const; + + FileBlock record_batch(int i) const; + FileBlock dictionary(int i) const; + + Status GetSchema(std::shared_ptr* out) const; + + private: + FileFooter(); + class FileFooterImpl; + std::unique_ptr impl_; }; } // namespace ipc diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index f6582fc883b..7d02bc302f4 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -25,21 +25,28 @@ #include #include "arrow/array.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" namespace arrow { namespace ipc { +const auto kInt32 = std::make_shared(); +const auto kListInt32 = std::make_shared(kInt32); +const auto kListListInt32 = std::make_shared(kListInt32); + Status MakeRandomInt32Array( int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { std::shared_ptr data; test::MakeRandomInt32PoolBuffer(length, pool, &data); - const auto INT32 = std::make_shared(); - Int32Builder builder(pool, INT32); + const auto kInt32 = std::make_shared(); + Int32Builder builder(pool, kInt32); if (include_nulls) { std::shared_ptr valid_bytes; test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); @@ -87,6 +94,188 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li return (*array)->Validate(); } +typedef Status MakeRecordBatch(std::shared_ptr* out); + +Status MakeIntRecordBatch(std::shared_ptr* out) { + const int length = 1000; + + // Make the schema + auto f0 = std::make_shared("f0", kInt32); + auto f1 = std::make_shared("f1", kInt32); + std::shared_ptr schema(new Schema({f0, f1})); + + // Example data + std::shared_ptr a0, a1; + MemoryPool* pool = default_memory_pool(); + 
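+ // a0 is generated without nulls and a1 with nulls, so round-trip tests
+ // built on this batch exercise both validity-bitmap code paths.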
RETURN_NOT_OK(MakeRandomInt32Array(length, false, pool, &a0)); + RETURN_NOT_OK(MakeRandomInt32Array(length, true, pool, &a1)); + out->reset(new RecordBatch(schema, length, {a0, a1})); + return Status::OK(); +} + +template +Status MakeRandomBinaryArray( + const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { + const std::vector values = { + "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; + Builder builder(pool, type); + const auto values_len = values.size(); + for (int32_t i = 0; i < length; ++i) { + int values_index = i % values_len; + if (values_index == 0) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + const std::string& value = values[values_index]; + RETURN_NOT_OK( + builder.Append(reinterpret_cast(value.data()), value.size())); + } + } + *array = builder.Finish(); + return Status::OK(); +} + +Status MakeStringTypesRecordBatch(std::shared_ptr* out) { + const int32_t length = 500; + auto string_type = std::make_shared(); + auto binary_type = std::make_shared(); + auto f0 = std::make_shared("f0", string_type); + auto f1 = std::make_shared("f1", binary_type); + std::shared_ptr schema(new Schema({f0, f1})); + + std::shared_ptr a0, a1; + MemoryPool* pool = default_memory_pool(); + + { + auto status = + MakeRandomBinaryArray(string_type, length, pool, &a0); + RETURN_NOT_OK(status); + } + { + auto status = + MakeRandomBinaryArray(binary_type, length, pool, &a1); + RETURN_NOT_OK(status); + } + out->reset(new RecordBatch(schema, length, {a0, a1})); + return Status::OK(); +} + +Status MakeListRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", kListInt32); + auto f1 = std::make_shared("f1", kListListInt32); + auto f2 = std::make_shared("f2", kInt32); + std::shared_ptr schema(new Schema({f0, f1, f2})); + + // Example data + + MemoryPool* pool = default_memory_pool(); + const int length = 200; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK( + MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); + out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} + +Status MakeZeroLengthRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", kListInt32); + auto f1 = std::make_shared("f1", kListListInt32); + auto f2 = std::make_shared("f2", kInt32); + std::shared_ptr schema(new Schema({f0, f1, f2})); + + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 200; + const bool include_nulls = true; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &leaf_values)); + RETURN_NOT_OK(MakeRandomListArray(leaf_values, 0, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, 0, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(0, include_nulls, pool, &flat_array)); + out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} + +Status MakeNonNullRecordBatch(std::shared_ptr* out) { + // Make the schema + auto f0 = std::make_shared("f0", kListInt32); + auto f1 = std::make_shared("f1", 
kListListInt32); + auto f2 = std::make_shared("f2", kInt32); + std::shared_ptr schema(new Schema({f0, f1, f2})); + + // Example data + MemoryPool* pool = default_memory_pool(); + const int length = 50; + std::shared_ptr leaf_values, list_array, list_list_array, flat_array; + + RETURN_NOT_OK(MakeRandomInt32Array(1000, true, pool, &leaf_values)); + bool include_nulls = false; + RETURN_NOT_OK( + MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array)); + RETURN_NOT_OK( + MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array)); + RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array)); + out->reset(new RecordBatch(schema, length, {list_array, list_list_array, flat_array})); + return Status::OK(); +} + +Status MakeDeeplyNestedList(std::shared_ptr* out) { + const int batch_length = 5; + TypePtr type = kInt32; + + MemoryPool* pool = default_memory_pool(); + ArrayPtr array; + const bool include_nulls = true; + RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); + for (int i = 0; i < 63; ++i) { + type = std::static_pointer_cast(std::make_shared(type)); + RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); + } + + auto f0 = std::make_shared("f0", type); + std::shared_ptr schema(new Schema({f0})); + std::vector arrays = {array}; + out->reset(new RecordBatch(schema, batch_length, arrays)); + return Status::OK(); +} + +Status MakeStruct(std::shared_ptr* out) { + // reuse constructed list columns + std::shared_ptr list_batch; + RETURN_NOT_OK(MakeListRecordBatch(&list_batch)); + std::vector columns = { + list_batch->column(0), list_batch->column(1), list_batch->column(2)}; + auto list_schema = list_batch->schema(); + + // Define schema + std::shared_ptr type(new StructType( + {list_schema->field(0), list_schema->field(1), list_schema->field(2)})); + auto f0 = std::make_shared("non_null_struct", type); + auto f1 = std::make_shared("null_struct", type); + std::shared_ptr schema(new Schema({f0, f1})); + + // construct individual nullable/non-nullable struct arrays + ArrayPtr no_nulls(new StructArray(type, list_batch->num_rows(), columns)); + std::vector null_bytes(list_batch->num_rows(), 1); + null_bytes[0] = 0; + std::shared_ptr null_bitmask; + RETURN_NOT_OK(util::bytes_to_bits(null_bytes, &null_bitmask)); + ArrayPtr with_nulls( + new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask)); + + // construct batch + std::vector arrays = {no_nulls, with_nulls}; + out->reset(new RecordBatch(schema, list_batch->num_rows(), arrays)); + return Status::OK(); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/util.h b/cpp/src/arrow/ipc/util.h index 3f4001b21a9..94079a38277 100644 --- a/cpp/src/arrow/ipc/util.h +++ b/cpp/src/arrow/ipc/util.h @@ -27,6 +27,14 @@ namespace arrow { namespace ipc { +// Align on 8-byte boundaries +static constexpr int kArrowAlignment = 8; +static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; + +static inline int64_t PaddedLength(int64_t nbytes, int64_t alignment = kArrowAlignment) { + return ((nbytes + alignment - 1) / alignment) * alignment; +} + // A helper class to tracks the size of allocations class MockOutputStream : public io::OutputStream { public: diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index a9c64eca997..2689bebea30 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -31,7 +31,7 @@ namespace arrow { class Array; class MemoryPool; -class RowBatch; 
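As an aside on the alignment helpers added to ipc/util.h above: PaddedLength simply rounds a byte count up to the next multiple of kArrowAlignment. A minimal sketch (the formula is copied from the diff; the sample values follow directly from it):

```cpp
#include <cstdint>

// Same formula as arrow/ipc/util.h: round nbytes up to an 8-byte boundary.
static inline int64_t PaddedLength(int64_t nbytes, int64_t alignment = 8) {
  return ((nbytes + alignment - 1) / alignment) * alignment;
}

// PaddedLength(0) == 0, PaddedLength(1) == 8,
// PaddedLength(8) == 8, PaddedLength(9) == 16.
// Buffers are always padded up, never truncated.
```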
+class RecordBatch; class Status; class Table; diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 5aa1ba58717..ecc6a9f8be3 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -30,7 +30,7 @@ namespace arrow { class Array; class MemoryPool; class PrimitiveArray; -class RowBatch; +class RecordBatch; class Status; class StringArray; class Table; diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index d9573eae74d..3a250df81d0 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -27,11 +27,11 @@ namespace arrow { -RowBatch::RowBatch(const std::shared_ptr& schema, int num_rows, +RecordBatch::RecordBatch(const std::shared_ptr& schema, int num_rows, const std::vector>& columns) : schema_(schema), num_rows_(num_rows), columns_(columns) {} -const std::string& RowBatch::column_name(int i) const { +const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name; } diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 2088fdf0b64..36b3c8ecaf4 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -32,15 +32,15 @@ class Column; class Schema; class Status; -// A row batch is a simpler and more rigid table data structure intended for +// A record batch is a simpler and more rigid table data structure intended for // use primarily in shared memory IPC. It contains a schema (metadata) and a -// corresponding vector of equal-length Arrow arrays -class ARROW_EXPORT RowBatch { +// corresponding sequence of equal-length Arrow arrays +class ARROW_EXPORT RecordBatch { public: - // num_rows is a parameter to allow for row batches of a particular size not + // num_rows is a parameter to allow for record batches of a particular size not // having any materialized columns. Each array should have the same length as // num_rows - RowBatch(const std::shared_ptr& schema, int num_rows, + RecordBatch(const std::shared_ptr& schema, int32_t num_rows, const std::vector>& columns); // @returns: the table's schema @@ -50,17 +50,19 @@ class ARROW_EXPORT RowBatch { // Note: Does not boundscheck const std::shared_ptr& column(int i) const { return columns_[i]; } + const std::vector>& columns() const { return columns_; } + const std::string& column_name(int i) const; // @returns: the number of columns in the table int num_columns() const { return columns_.size(); } // @returns: the number of rows (the corresponding length of each column) - int64_t num_rows() const { return num_rows_; } + int32_t num_rows() const { return num_rows_; } private: std::shared_ptr schema_; - int num_rows_; + int32_t num_rows_; std::vector> columns_; }; diff --git a/format/IPC.md b/format/IPC.md new file mode 100644 index 00000000000..1f39e762ab7 --- /dev/null +++ b/format/IPC.md @@ -0,0 +1,3 @@ +# Interprocess messaging / communication (IPC) + +## File format diff --git a/format/README.md b/format/README.md index 3b0e50364d8..78e15207ee9 100644 --- a/format/README.md +++ b/format/README.md @@ -9,6 +9,7 @@ Currently, the Arrow specification consists of these pieces: - Metadata specification (see Metadata.md) - Physical memory layout specification (see Layout.md) - Metadata serialized representation (see Message.fbs) +- Mechanics of messaging between Arrow systems (IPC, RPC, etc.) 
(see IPC.md) The metadata currently uses Google's [flatbuffers library][1] for serializing a couple related pieces of information: From 32fd692f3aced29cc65a786d5ec63f8cd484853c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sun, 25 Sep 2016 19:28:26 -0400 Subject: [PATCH 142/210] ARROW-296: [Python / C++] Remove arrow::parquet, make pyarrow link against parquet_arrow This patch depends on PARQUET-728 (to run the full test suite, including pyarrow Parquet tests) Author: Wes McKinney Closes #145 from wesm/ARROW-296 and squashes the following commits: d67b4f9 [Wes McKinney] Refactor to link against parquet_arrow, fix up cmake files --- cpp/CMakeLists.txt | 18 - cpp/cmake_modules/FindParquet.cmake | 44 +- cpp/doc/Parquet.md | 15 +- cpp/src/arrow/parquet/CMakeLists.txt | 67 --- cpp/src/arrow/parquet/io.cc | 105 ---- cpp/src/arrow/parquet/io.h | 84 --- cpp/src/arrow/parquet/parquet-io-test.cc | 135 ----- .../parquet/parquet-reader-writer-test.cc | 499 ------------------ cpp/src/arrow/parquet/parquet-schema-test.cc | 261 --------- cpp/src/arrow/parquet/reader.cc | 401 -------------- cpp/src/arrow/parquet/reader.h | 146 ----- cpp/src/arrow/parquet/schema.cc | 344 ------------ cpp/src/arrow/parquet/schema.h | 53 -- cpp/src/arrow/parquet/test-util.h | 193 ------- cpp/src/arrow/parquet/utils.h | 52 -- cpp/src/arrow/parquet/writer.cc | 365 ------------- cpp/src/arrow/parquet/writer.h | 76 --- cpp/src/arrow/types/string.cc | 2 +- python/CMakeLists.txt | 14 +- python/cmake_modules/FindArrow.cmake | 22 - python/pyarrow/includes/parquet.pxd | 10 +- 21 files changed, 55 insertions(+), 2851 deletions(-) delete mode 100644 cpp/src/arrow/parquet/CMakeLists.txt delete mode 100644 cpp/src/arrow/parquet/io.cc delete mode 100644 cpp/src/arrow/parquet/io.h delete mode 100644 cpp/src/arrow/parquet/parquet-io-test.cc delete mode 100644 cpp/src/arrow/parquet/parquet-reader-writer-test.cc delete mode 100644 cpp/src/arrow/parquet/parquet-schema-test.cc delete mode 100644 cpp/src/arrow/parquet/reader.cc delete mode 100644 cpp/src/arrow/parquet/reader.h delete mode 100644 cpp/src/arrow/parquet/schema.cc delete mode 100644 cpp/src/arrow/parquet/schema.h delete mode 100644 cpp/src/arrow/parquet/test-util.h delete mode 100644 cpp/src/arrow/parquet/utils.h delete mode 100644 cpp/src/arrow/parquet/writer.cc delete mode 100644 cpp/src/arrow/parquet/writer.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index be95dabf318..f3f4a7dac01 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,10 +52,6 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Build the libarrow shared libraries" ON) - option(ARROW_PARQUET - "Build the Parquet adapter and link to libparquet" - OFF) - option(ARROW_TEST_MEMCHECK "Run the test suite using valgrind --tool=memcheck" OFF) @@ -702,20 +698,6 @@ add_subdirectory(src/arrow/io) add_subdirectory(src/arrow/util) add_subdirectory(src/arrow/types) -#---------------------------------------------------------------------- -# Parquet adapter library - -if(ARROW_PARQUET) - find_package(Parquet REQUIRED) - include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) - ADD_THIRDPARTY_LIB(parquet - STATIC_LIB ${PARQUET_STATIC_LIB} - SHARED_LIB ${PARQUET_SHARED_LIB}) - - add_subdirectory(src/arrow/parquet) - list(APPEND LINK_LIBS arrow_parquet parquet) -endif() - #---------------------------------------------------------------------- # IPC library diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index 36f4828a999..7445e0919ac 100644 --- 
a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -29,15 +29,20 @@ endif() # Try the parameterized roots, if they exist if ( _parquet_roots ) - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "include" ) - find_library( PARQUET_LIBRARIES NAMES parquet - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib" ) + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( PARQUET_LIBRARIES NAMES parquet + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) + + find_library(PARQUET_ARROW_LIBRARIES NAMES parquet_arrow + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib") else () - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) - find_library( PARQUET_LIBRARIES NAMES parquet ) + find_path(PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) + find_library(PARQUET_LIBRARIES NAMES parquet) + find_library(PARQUET_ARROW_LIBRARIES NAMES parquet_arrow) endif () @@ -51,6 +56,18 @@ else () set(PARQUET_FOUND FALSE) endif () +if (PARQUET_INCLUDE_DIR AND PARQUET_ARROW_LIBRARIES) + set(PARQUET_ARROW_FOUND TRUE) + get_filename_component(PARQUET_ARROW_LIBS ${PARQUET_ARROW_LIBRARIES} PATH) + set(PARQUET_ARROW_LIB_NAME libparquet_arrow) + set(PARQUET_ARROW_STATIC_LIB + ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}.a) + set(PARQUET_ARROW_SHARED_LIB + ${PARQUET_ARROW_LIBS}/${PARQUET_ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(PARQUET_ARROW_FOUND FALSE) +endif () + if (PARQUET_FOUND) if (NOT Parquet_FIND_QUIETLY) message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") @@ -71,6 +88,12 @@ else () endif () endif () +if (PARQUET_ARROW_FOUND) + if (NOT Parquet_FIND_QUIETLY) + message(STATUS "Found the Parquet Arrow library: ${PARQUET_ARROW_LIBS}") + endif() +endif() + mark_as_advanced( PARQUET_FOUND PARQUET_INCLUDE_DIR @@ -78,4 +101,9 @@ mark_as_advanced( PARQUET_LIBRARIES PARQUET_STATIC_LIB PARQUET_SHARED_LIB + + PARQUET_ARROW_FOUND + PARQUET_ARROW_LIBS + PARQUET_ARROW_STATIC_LIB + PARQUET_ARROW_SHARED_LIB ) diff --git a/cpp/doc/Parquet.md b/cpp/doc/Parquet.md index 370ac833388..96471d94835 100644 --- a/cpp/doc/Parquet.md +++ b/cpp/doc/Parquet.md @@ -1,24 +1,19 @@ ## Building Arrow-Parquet integration -To build the Arrow C++'s Parquet adapter library, you must first build [parquet-cpp][1]: +To use Arrow C++ with Parquet, you must first build the Arrow C++ libraries and +install them someplace. Then, you can build [parquet-cpp][1] with the Arrow +adapter library: ```bash # Set this to your preferred install location -export PARQUET_HOME=$HOME/local +export ARROW_HOME=$HOME/local git clone https://github.com/apache/parquet-cpp.git cd parquet-cpp source setup_build_env.sh -cmake -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME +cmake -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME -DPARQUET_ARROW=on make -j4 make install ``` -Make sure that `$PARQUET_HOME` is set to the installation location. Now, build -Arrow with the Parquet adapter enabled: - -```bash -cmake -DARROW_PARQUET=ON -``` - [1]: https://github.com/apache/parquet-cpp \ No newline at end of file diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt deleted file mode 100644 index c400e14ea47..00000000000 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# ---------------------------------------------------------------------- -# arrow_parquet : Arrow <-> Parquet adapter - -set(PARQUET_SRCS - io.cc - reader.cc - schema.cc - writer.cc -) - -set(PARQUET_LIBS - arrow_shared - arrow_io - parquet_shared -) - -add_library(arrow_parquet SHARED - ${PARQUET_SRCS} -) -target_link_libraries(arrow_parquet ${PARQUET_LIBS}) -SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) - -if (APPLE) - set_target_properties(arrow_parquet - PROPERTIES - BUILD_WITH_INSTALL_RPATH ON - INSTALL_NAME_DIR "@rpath") -endif() - -ADD_ARROW_TEST(parquet-schema-test) -ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) - -ADD_ARROW_TEST(parquet-io-test) -ARROW_TEST_LINK_LIBRARIES(parquet-io-test arrow_parquet) - -ADD_ARROW_TEST(parquet-reader-writer-test) -ARROW_TEST_LINK_LIBRARIES(parquet-reader-writer-test arrow_parquet) - -# Headers: top level -install(FILES - io.h - reader.h - schema.h - utils.h - writer.h - DESTINATION include/arrow/parquet) - -install(TARGETS arrow_parquet - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) diff --git a/cpp/src/arrow/parquet/io.cc b/cpp/src/arrow/parquet/io.cc deleted file mode 100644 index a50d753f305..00000000000 --- a/cpp/src/arrow/parquet/io.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
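Before the removal, the read-source bridge implemented just below was exercised roughly like this. A minimal sketch distilled from the deleted parquet-io-test.cc; the same class names are assumed to resurface unchanged in parquet-cpp's parquet_arrow module:

```cpp
#include <cstring>
#include <memory>
#include <string>

#include "arrow/io/memory.h"
#include "arrow/parquet/io.h"
#include "arrow/util/memory-pool.h"

int main() {
  std::string data = "this is the data";
  auto bytes = reinterpret_cast<const uint8_t*>(data.c_str());

  // Allocations made on behalf of parquet-cpp go through the Arrow pool.
  arrow::parquet::ParquetAllocator allocator(arrow::default_memory_pool());

  // Any Arrow ReadableFileInterface works; an in-memory reader keeps it simple.
  auto file = std::make_shared<arrow::io::BufferReader>(bytes, data.size());
  auto source = std::make_shared<arrow::parquet::ParquetReadSource>(&allocator);
  if (!source->Open(file).ok()) { return 1; }

  uint8_t out[4];
  source->Read(4, out);  // parquet-cpp style: throws rather than returning Status
  return std::memcmp(out, "this", 4) == 0 ? 0 : 1;
}
```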
- -#include "arrow/parquet/io.h" - -#include -#include - -#include "parquet/api/io.h" - -#include "arrow/parquet/utils.h" -#include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" - -// To assist with readability -using ArrowROFile = arrow::io::ReadableFileInterface; - -namespace arrow { -namespace parquet { - -// ---------------------------------------------------------------------- -// ParquetAllocator - -ParquetAllocator::ParquetAllocator() : pool_(default_memory_pool()) {} - -ParquetAllocator::ParquetAllocator(MemoryPool* pool) : pool_(pool) {} - -ParquetAllocator::~ParquetAllocator() {} - -uint8_t* ParquetAllocator::Malloc(int64_t size) { - uint8_t* result; - PARQUET_THROW_NOT_OK(pool_->Allocate(size, &result)); - return result; -} - -void ParquetAllocator::Free(uint8_t* buffer, int64_t size) { - // Does not report Status - pool_->Free(buffer, size); -} - -// ---------------------------------------------------------------------- -// ParquetReadSource - -ParquetReadSource::ParquetReadSource(ParquetAllocator* allocator) - : file_(nullptr), allocator_(allocator) {} - -Status ParquetReadSource::Open(const std::shared_ptr& file) { - int64_t file_size; - RETURN_NOT_OK(file->GetSize(&file_size)); - - file_ = file; - size_ = file_size; - return Status::OK(); -} - -void ParquetReadSource::Close() { - // TODO(wesm): Make this a no-op for now. This leaves Python wrappers for - // these classes in a borked state. Probably better to explicitly close. - - // PARQUET_THROW_NOT_OK(file_->Close()); -} - -int64_t ParquetReadSource::Tell() const { - int64_t position; - PARQUET_THROW_NOT_OK(file_->Tell(&position)); - return position; -} - -void ParquetReadSource::Seek(int64_t position) { - PARQUET_THROW_NOT_OK(file_->Seek(position)); -} - -int64_t ParquetReadSource::Read(int64_t nbytes, uint8_t* out) { - int64_t bytes_read; - PARQUET_THROW_NOT_OK(file_->Read(nbytes, &bytes_read, out)); - return bytes_read; -} - -std::shared_ptr<::parquet::Buffer> ParquetReadSource::Read(int64_t nbytes) { - // TODO(wesm): This code is duplicated from parquet/util/input.cc; suggests - // that there should be more code sharing amongst file-like sources - auto result = std::make_shared<::parquet::OwnedMutableBuffer>(0, allocator_); - result->Resize(nbytes); - - int64_t bytes_read = Read(nbytes, result->mutable_data()); - if (bytes_read < nbytes) { result->Resize(bytes_read); } - return result; -} - -} // namespace parquet -} // namespace arrow diff --git a/cpp/src/arrow/parquet/io.h b/cpp/src/arrow/parquet/io.h deleted file mode 100644 index 1734863acf1..00000000000 --- a/cpp/src/arrow/parquet/io.h +++ /dev/null @@ -1,84 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
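The header below declares the two bridge classes; the practical payoff is that any parquet-cpp component taking a ::parquet::MemoryAllocator* can charge its allocations to an Arrow pool. A sketch, assuming the ::parquet::OwnedMutableBuffer surface used by ParquetReadSource::Read above:

```cpp
#include <memory>

#include "arrow/parquet/io.h"
#include "parquet/api/io.h"

int main() {
  arrow::parquet::ParquetAllocator allocator;  // wraps default_memory_pool()

  // Buffer growth is served by ParquetAllocator::Malloc, i.e. charged to
  // the Arrow pool rather than to parquet-cpp's default allocator.
  auto buffer = std::make_shared<::parquet::OwnedMutableBuffer>(0, &allocator);
  buffer->Resize(64);
  return 0;
}
```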
- -// Bridges Arrow's IO interfaces and Parquet-cpp's IO interfaces - -#ifndef ARROW_PARQUET_IO_H -#define ARROW_PARQUET_IO_H - -#include -#include - -#include "parquet/api/io.h" - -#include "arrow/io/interfaces.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class MemoryPool; - -namespace parquet { - -// An implementation of the Parquet MemoryAllocator API that plugs into an -// existing Arrow memory pool. This way we can direct all allocations to a -// single place rather than tracking allocations in different locations (for -// example: without utilizing parquet-cpp's default allocator) -class ARROW_EXPORT ParquetAllocator : public ::parquet::MemoryAllocator { - public: - // Uses the default memory pool - ParquetAllocator(); - - explicit ParquetAllocator(MemoryPool* pool); - virtual ~ParquetAllocator(); - - uint8_t* Malloc(int64_t size) override; - void Free(uint8_t* buffer, int64_t size) override; - - void set_pool(MemoryPool* pool) { pool_ = pool; } - - MemoryPool* pool() const { return pool_; } - - private: - MemoryPool* pool_; -}; - -class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { - public: - explicit ParquetReadSource(ParquetAllocator* allocator); - - // We need to ask for the file size on opening the file, and this can fail - Status Open(const std::shared_ptr& file); - - void Close() override; - int64_t Tell() const override; - void Seek(int64_t pos) override; - int64_t Read(int64_t nbytes, uint8_t* out) override; - std::shared_ptr<::parquet::Buffer> Read(int64_t nbytes) override; - - private: - // An Arrow readable file of some kind - std::shared_ptr file_; - - // The allocator is required for creating managed buffers - ParquetAllocator* allocator_; -}; - -} // namespace parquet -} // namespace arrow - -#endif // ARROW_PARQUET_IO_H diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc deleted file mode 100644 index 208b3e867d3..00000000000 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include -#include -#include - -#include "gtest/gtest.h" - -#include "arrow/io/memory.h" -#include "arrow/parquet/io.h" -#include "arrow/test-util.h" -#include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" - -#include "parquet/api/io.h" - -namespace arrow { -namespace parquet { - -// Allocator tests - -TEST(TestParquetAllocator, DefaultCtor) { - ParquetAllocator allocator; - - const int buffer_size = 10; - - uint8_t* buffer = nullptr; - ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size);); - - // valgrind will complain if we write into nullptr - memset(buffer, 0, buffer_size); - - allocator.Free(buffer, buffer_size); -} - -// Pass through to the default memory pool -class TrackingPool : public MemoryPool { - public: - TrackingPool() : pool_(default_memory_pool()), bytes_allocated_(0) {} - - Status Allocate(int64_t size, uint8_t** out) override { - RETURN_NOT_OK(pool_->Allocate(size, out)); - bytes_allocated_ += size; - return Status::OK(); - } - - void Free(uint8_t* buffer, int64_t size) override { - pool_->Free(buffer, size); - bytes_allocated_ -= size; - } - - int64_t bytes_allocated() const override { return bytes_allocated_; } - - private: - MemoryPool* pool_; - int64_t bytes_allocated_; -}; - -TEST(TestParquetAllocator, CustomPool) { - TrackingPool pool; - - ParquetAllocator allocator(&pool); - - ASSERT_EQ(&pool, allocator.pool()); - - const int buffer_size = 10; - - uint8_t* buffer = nullptr; - ASSERT_NO_THROW(buffer = allocator.Malloc(buffer_size);); - - ASSERT_EQ(buffer_size, pool.bytes_allocated()); - - // valgrind will complain if we write into nullptr - memset(buffer, 0, buffer_size); - - allocator.Free(buffer, buffer_size); - - ASSERT_EQ(0, pool.bytes_allocated()); -} - -// ---------------------------------------------------------------------- -// Read source tests - -TEST(TestParquetReadSource, Basics) { - std::string data = "this is the data"; - auto data_buffer = reinterpret_cast(data.c_str()); - - ParquetAllocator allocator(default_memory_pool()); - - auto file = std::make_shared(data_buffer, data.size()); - auto source = std::make_shared(&allocator); - - ASSERT_OK(source->Open(file)); - - ASSERT_EQ(0, source->Tell()); - ASSERT_NO_THROW(source->Seek(5)); - ASSERT_EQ(5, source->Tell()); - ASSERT_NO_THROW(source->Seek(0)); - - // Seek out of bounds - ASSERT_THROW(source->Seek(100), ::parquet::ParquetException); - - uint8_t buffer[50]; - - ASSERT_NO_THROW(source->Read(4, buffer)); - ASSERT_EQ(0, std::memcmp(buffer, "this", 4)); - ASSERT_EQ(4, source->Tell()); - - std::shared_ptr<::parquet::Buffer> pq_buffer; - - ASSERT_NO_THROW(pq_buffer = source->Read(7)); - - auto expected_buffer = std::make_shared<::parquet::Buffer>(data_buffer + 4, 7); - - ASSERT_TRUE(expected_buffer->Equals(*pq_buffer.get())); -} - -} // namespace parquet -} // namespace arrow diff --git a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc b/cpp/src/arrow/parquet/parquet-reader-writer-test.cc deleted file mode 100644 index d7b39dda377..00000000000 --- a/cpp/src/arrow/parquet/parquet-reader-writer-test.cc +++ /dev/null @@ -1,499 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "gtest/gtest.h" - -#include "arrow/test-util.h" -#include "arrow/parquet/test-util.h" -#include "arrow/parquet/reader.h" -#include "arrow/parquet/writer.h" -#include "arrow/types/construct.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" -#include "arrow/util/memory-pool.h" -#include "arrow/util/status.h" - -#include "parquet/api/reader.h" -#include "parquet/api/writer.h" - -using ParquetBuffer = parquet::Buffer; -using parquet::BufferReader; -using parquet::default_writer_properties; -using parquet::InMemoryOutputStream; -using parquet::LogicalType; -using parquet::ParquetFileReader; -using parquet::ParquetFileWriter; -using parquet::RandomAccessSource; -using parquet::Repetition; -using parquet::SchemaDescriptor; -using parquet::ParquetVersion; -using ParquetType = parquet::Type; -using parquet::schema::GroupNode; -using parquet::schema::NodePtr; -using parquet::schema::PrimitiveNode; - -namespace arrow { - -namespace parquet { - -const int SMALL_SIZE = 100; -const int LARGE_SIZE = 10000; - -template -struct test_traits {}; - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::BOOLEAN; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static uint8_t const value; -}; - -const uint8_t test_traits::value(1); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_8; - static uint8_t const value; -}; - -const uint8_t test_traits::value(64); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::INT_8; - static int8_t const value; -}; - -const int8_t test_traits::value(-64); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_16; - static uint16_t const value; -}; - -const uint16_t test_traits::value(1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::INT_16; - static int16_t const value; -}; - -const int16_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_32; - static uint32_t const value; -}; - -const uint32_t test_traits::value(1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT32; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static int32_t const value; -}; - -const int32_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT64; - static constexpr LogicalType::type logical_enum = LogicalType::UINT_64; - static uint64_t const value; -}; - -const uint64_t 
test_traits::value(1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT64; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static int64_t const value; -}; - -const int64_t test_traits::value(-1024); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::INT64; - static constexpr LogicalType::type logical_enum = LogicalType::TIMESTAMP_MILLIS; - static int64_t const value; -}; - -const int64_t test_traits::value(14695634030000); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static float const value; -}; - -const float test_traits::value(2.1f); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; - static constexpr LogicalType::type logical_enum = LogicalType::NONE; - static double const value; -}; - -const double test_traits::value(4.2); - -template <> -struct test_traits { - static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY; - static constexpr LogicalType::type logical_enum = LogicalType::UTF8; - static std::string const value; -}; - -const std::string test_traits::value("Test"); - -template -using ParquetDataType = ::parquet::DataType::parquet_enum>; - -template -using ParquetWriter = ::parquet::TypedColumnWriter>; - -template -class TestParquetIO : public ::testing::Test { - public: - virtual void SetUp() {} - - std::shared_ptr MakeSchema(Repetition::type repetition) { - auto pnode = PrimitiveNode::Make("column1", repetition, - test_traits::parquet_enum, test_traits::logical_enum); - NodePtr node_ = - GroupNode::Make("schema", Repetition::REQUIRED, std::vector({pnode})); - return std::static_pointer_cast(node_); - } - - std::unique_ptr MakeWriter( - const std::shared_ptr& schema) { - sink_ = std::make_shared(); - return ParquetFileWriter::Open(sink_, schema); - } - - std::unique_ptr ReaderFromSink() { - std::shared_ptr buffer = sink_->GetBuffer(); - std::unique_ptr source(new BufferReader(buffer)); - return ParquetFileReader::Open(std::move(source)); - } - - void ReadSingleColumnFile( - std::unique_ptr file_reader, std::shared_ptr* out) { - arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); - std::unique_ptr column_reader; - ASSERT_OK_NO_THROW(reader.GetFlatColumn(0, &column_reader)); - ASSERT_NE(nullptr, column_reader.get()); - - ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); - ASSERT_NE(nullptr, out->get()); - } - - void ReadAndCheckSingleColumnFile(Array* values) { - std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); - ASSERT_TRUE(values->Equals(out)); - } - - void ReadTableFromFile( - std::unique_ptr file_reader, std::shared_ptr
<Table>* out) {
-    arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader));
-    ASSERT_OK_NO_THROW(reader.ReadFlatTable(out));
-    ASSERT_NE(nullptr, out->get());
-  }
-
-  void ReadAndCheckSingleColumnTable(const std::shared_ptr<Array>& values) {
-    std::shared_ptr<Table> out;
-    ReadTableFromFile(ReaderFromSink(), &out);
-    ASSERT_EQ(1, out->num_columns());
-    ASSERT_EQ(values->length(), out->num_rows());
-
-    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-    ASSERT_EQ(1, chunked_array->num_chunks());
-    ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
-  }
-
-  template <typename ArrayType>
-  void WriteFlatColumn(const std::shared_ptr<GroupNode>& schema,
-      const std::shared_ptr<ArrayType>& values) {
-    FileWriter writer(default_memory_pool(), MakeWriter(schema));
-    ASSERT_OK_NO_THROW(writer.NewRowGroup(values->length()));
-    ASSERT_OK_NO_THROW(writer.WriteFlatColumnChunk(values.get()));
-    ASSERT_OK_NO_THROW(writer.Close());
-  }
-
-  std::shared_ptr<InMemoryOutputStream> sink_;
-};
-
-// We have separate tests for UInt32Type as this is currently the only type
-// where a roundtrip does not yield the identical Array structure.
-// There we write a UInt32 Array but receive an Int64 Array as result for
-// Parquet version 1.0.
-
-typedef ::testing::Types<BooleanType, UInt8Type, Int8Type, UInt16Type, Int16Type,
-    Int32Type, UInt64Type, Int64Type, TimestampType, FloatType, DoubleType, StringType>
-    TestTypes;
-
-TYPED_TEST_CASE(TestParquetIO, TestTypes);
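The UInt32 caveat above is easiest to see at the value level; a minimal sketch (the helper name is invented here, not from the patch) of the widening a Parquet 1.0 roundtrip applies, mirroring the Parquet_1_0_Compability test further down:

#include <cstdint>
#include <vector>

// Invented helper, not part of this patch: the value-level widening a
// Parquet 1.0 roundtrip applies to unsigned 32-bit data (stored as INT64
// on disk, since Parquet 1.0 has no unsigned 32-bit logical type).
std::vector<int64_t> WidenUInt32(const std::vector<uint32_t>& in) {
  std::vector<int64_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = static_cast<int64_t>(in[i]);  // lossless: int64 covers all uint32
  }
  return out;
}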
-
-TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) {
-  auto values = NonNullArray<TypeParam>(SMALL_SIZE);
-
-  std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
-  this->WriteFlatColumn(schema, values);
-
-  this->ReadAndCheckSingleColumnFile(values.get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) {
-  auto values = NonNullArray<TypeParam>(SMALL_SIZE);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, false);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_,
-      values->length(), default_writer_properties()));
-
-  std::shared_ptr<Table> out;
-  this->ReadTableFromFile(this->ReaderFromSink(), &out);
-  ASSERT_EQ(1, out->num_columns());
-  ASSERT_EQ(100, out->num_rows());
-
-  std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-  ASSERT_EQ(1, chunked_array->num_chunks());
-  ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) {
-  // This also tests max_definition_level = 1
-  auto values = NullableArray<TypeParam>(SMALL_SIZE, 10);
-
-  std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::OPTIONAL);
-  this->WriteFlatColumn(schema, values);
-
-  this->ReadAndCheckSingleColumnFile(values.get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) {
-  // This also tests max_definition_level = 1
-  std::shared_ptr<Array> values = NullableArray<TypeParam>(SMALL_SIZE, 10);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteFlatTable(table.get(), default_memory_pool(), this->sink_,
-      values->length(), default_writer_properties()));
-
-  this->ReadAndCheckSingleColumnTable(values);
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedWrite) {
-  auto values = NonNullArray<TypeParam>(SMALL_SIZE);
-  int64_t chunk_size = values->length() / 4;
-
-  std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
-  FileWriter writer(default_memory_pool(), this->MakeWriter(schema));
-  for (int i = 0; i < 4; i++) {
-    ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size));
-    ASSERT_OK_NO_THROW(
-        writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size));
-  }
-  ASSERT_OK_NO_THROW(writer.Close());
-
-  this->ReadAndCheckSingleColumnFile(values.get());
-}
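The chunked tests above all follow one pattern: one NewRowGroup per slice, then one WriteFlatColumnChunk for that slice. A condensed sketch of that pattern, assuming the Status-returning FileWriter methods these tests exercise (the helper name is invented):

#include <algorithm>  // std::min

// Sketch: write `values` as row groups of at most `chunk_size` rows, using
// the FileWriter interface exercised in the tests above.
Status WriteInChunks(FileWriter* writer, const Array& values, int64_t chunk_size) {
  for (int64_t offset = 0; offset < values.length(); offset += chunk_size) {
    int64_t size = std::min(chunk_size, values.length() - offset);
    RETURN_NOT_OK(writer->NewRowGroup(size));
    RETURN_NOT_OK(writer->WriteFlatColumnChunk(&values, offset, size));
  }
  return writer->Close();
}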
-
-TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) {
-  auto values = NonNullArray<TypeParam>(LARGE_SIZE);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, false);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteFlatTable(
-      table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties()));
-
-  this->ReadAndCheckSingleColumnTable(values);
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) {
-  int64_t chunk_size = SMALL_SIZE / 4;
-  auto values = NullableArray<TypeParam>(SMALL_SIZE, 10);
-
-  std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::OPTIONAL);
-  FileWriter writer(default_memory_pool(), this->MakeWriter(schema));
-  for (int i = 0; i < 4; i++) {
-    ASSERT_OK_NO_THROW(writer.NewRowGroup(chunk_size));
-    ASSERT_OK_NO_THROW(
-        writer.WriteFlatColumnChunk(values.get(), i * chunk_size, chunk_size));
-  }
-  ASSERT_OK_NO_THROW(writer.Close());
-
-  this->ReadAndCheckSingleColumnFile(values.get());
-}
-
-TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) {
-  // This also tests max_definition_level = 1
-  auto values = NullableArray<TypeParam>(LARGE_SIZE, 100);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  ASSERT_OK_NO_THROW(WriteFlatTable(
-      table.get(), default_memory_pool(), this->sink_, 512, default_writer_properties()));
-
-  this->ReadAndCheckSingleColumnTable(values);
-}
-
-using TestUInt32ParquetIO = TestParquetIO<UInt32Type>;
-
-TEST_F(TestUInt32ParquetIO, Parquet_2_0_Compability) {
-  // This also tests max_definition_level = 1
-  std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-
-  // Parquet 2.0 roundtrip should yield an uint32_t column again
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  std::shared_ptr<::parquet::WriterProperties> properties =
-      ::parquet::WriterProperties::Builder()
-          .version(ParquetVersion::PARQUET_2_0)
-          ->build();
-  ASSERT_OK_NO_THROW(
-      WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties));
-  this->ReadAndCheckSingleColumnTable(values);
-}
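The only knob varied between this test and the Parquet 1.0 one below is the format version on the writer properties; a sketch of that selection, using the builder API as it appears above:

// Format version controls which logical types the writer may emit (see the
// UINT_32 handling in FieldToNode later in this patch).
std::shared_ptr<::parquet::WriterProperties> props =
    ::parquet::WriterProperties::Builder()
        .version(::parquet::ParquetVersion::PARQUET_1_0)  // or PARQUET_2_0
        ->build();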
-
-TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
-  // This also tests max_definition_level = 1
-  std::shared_ptr<Array> values = NullableArray<UInt32Type>(LARGE_SIZE, 100);
-  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
-
-  // Parquet 1.0 returns an int64_t column as there is no way to tell a Parquet 1.0
-  // reader that a column is unsigned.
-  this->sink_ = std::make_shared<InMemoryOutputStream>();
-  std::shared_ptr<::parquet::WriterProperties> properties =
-      ::parquet::WriterProperties::Builder()
-          .version(ParquetVersion::PARQUET_1_0)
-          ->build();
-  ASSERT_OK_NO_THROW(
-      WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512, properties));
-
-  std::shared_ptr<Array> expected_values;
-  std::shared_ptr<PoolBuffer> int64_data =
-      std::make_shared<PoolBuffer>(default_memory_pool());
-  {
-    ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length()));
-    int64_t* int64_data_ptr = reinterpret_cast<int64_t*>(int64_data->mutable_data());
-    const uint32_t* uint32_data_ptr =
-        reinterpret_cast<const uint32_t*>(values->data()->data());
-    // std::copy might be faster but this is explicit on the casts
-    for (int64_t i = 0; i < values->length(); i++) {
-      int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]);
-    }
-  }
-  ASSERT_OK(MakePrimitiveArray(std::make_shared<Int64Type>(), values->length(),
-      int64_data, values->null_count(), values->null_bitmap(), &expected_values));
-  this->ReadAndCheckSingleColumnTable(expected_values);
-}
-
-template <typename T>
-using ParquetCDataType = typename ParquetDataType<T>::c_type;
-
-template <typename TestType>
-class TestPrimitiveParquetIO : public TestParquetIO<TestType> {
- public:
-  typedef typename TestType::c_type T;
-
-  void MakeTestFile(std::vector<T>& values, int num_chunks,
-      std::unique_ptr<ParquetFileReader>* file_reader) {
-    std::shared_ptr<GroupNode> schema = this->MakeSchema(Repetition::REQUIRED);
-    std::unique_ptr<ParquetFileWriter> file_writer = this->MakeWriter(schema);
-    size_t chunk_size = values.size() / num_chunks;
-    // Convert to Parquet's expected physical type
-    std::vector<uint8_t> values_buffer(
-        sizeof(ParquetCDataType<TestType>) * values.size());
-    auto values_parquet =
-        reinterpret_cast<ParquetCDataType<TestType>*>(values_buffer.data());
-    std::copy(values.cbegin(), values.cend(), values_parquet);
-    for (int i = 0; i < num_chunks; i++) {
-      auto row_group_writer = file_writer->AppendRowGroup(chunk_size);
-      auto column_writer =
-          static_cast<ParquetWriter<TestType>*>(row_group_writer->NextColumn());
-      ParquetCDataType<TestType>* data = values_parquet + i * chunk_size;
-      column_writer->WriteBatch(chunk_size, nullptr, nullptr, data);
-      column_writer->Close();
-    }
-    file_writer->Close();
-    *file_reader = this->ReaderFromSink();
-  }
-
-  void CheckSingleColumnRequiredTableRead(int num_chunks) {
-    std::vector<T> values(SMALL_SIZE, test_traits<TestType>::value);
-    std::unique_ptr<ParquetFileReader> file_reader;
-    ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader));
-
-    std::shared_ptr<Table> out;
-    this->ReadTableFromFile(std::move(file_reader), &out);
-    ASSERT_EQ(1, out->num_columns());
-    ASSERT_EQ(SMALL_SIZE, out->num_rows());
-
-    std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
-    ASSERT_EQ(1, chunked_array->num_chunks());
-    ExpectArray<T>(values.data(), chunked_array->chunk(0).get());
-  }
-
-  void CheckSingleColumnRequiredRead(int num_chunks) {
-    std::vector<T> values(SMALL_SIZE, test_traits<TestType>::value);
-    std::unique_ptr<ParquetFileReader> file_reader;
-    ASSERT_NO_THROW(MakeTestFile(values, num_chunks, &file_reader));
-
-    std::shared_ptr<Array> out;
-    this->ReadSingleColumnFile(std::move(file_reader), &out);
-
-    ExpectArray<T>(values.data(), out.get());
-  }
-};
-
-typedef ::testing::Types<BooleanType, UInt8Type, Int8Type, UInt16Type, Int16Type,
-    UInt32Type, Int32Type, UInt64Type, Int64Type, FloatType, DoubleType>
-    PrimitiveTestTypes;
-
-TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes);
-
-TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredRead) {
-  this->CheckSingleColumnRequiredRead(1);
-}
-
-TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredTableRead) {
-  this->CheckSingleColumnRequiredTableRead(1);
-}
-
-TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedRead) {
-  this->CheckSingleColumnRequiredRead(4);
-}
-
-TYPED_TEST(TestPrimitiveParquetIO, SingleColumnRequiredChunkedTableRead) {
-  this->CheckSingleColumnRequiredTableRead(4);
-}
-
-} // namespace parquet
-
-} // namespace arrow
diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc
deleted file mode 100644
index 63ad8fba465..00000000000
--- a/cpp/src/arrow/parquet/parquet-schema-test.cc
+++ /dev/null
@@ -1,261 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
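The schema tests that follow exercise the two converters declared in arrow/parquet/schema.h; a hedged usage sketch of the round trip (the wrapper function name is invented, the converter signatures are as declared in this patch):

#include "arrow/parquet/schema.h"
#include "arrow/util/status.h"

// Round-trip sketch: Parquet schema -> Arrow schema -> Parquet schema.
arrow::Status RoundTripSchema(const ::parquet::SchemaDescriptor* parquet_schema) {
  std::shared_ptr<arrow::Schema> arrow_schema;
  RETURN_NOT_OK(arrow::parquet::FromParquetSchema(parquet_schema, &arrow_schema));
  std::shared_ptr<::parquet::SchemaDescriptor> result;
  return arrow::parquet::ToParquetSchema(
      arrow_schema.get(), *::parquet::default_writer_properties(), &result);
}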
- -#include -#include - -#include "gtest/gtest.h" - -#include "arrow/test-util.h" -#include "arrow/type.h" -#include "arrow/types/datetime.h" -#include "arrow/types/decimal.h" -#include "arrow/util/status.h" - -#include "arrow/parquet/schema.h" - -using ParquetType = parquet::Type; -using parquet::LogicalType; -using parquet::Repetition; -using parquet::schema::NodePtr; -using parquet::schema::GroupNode; -using parquet::schema::PrimitiveNode; - -namespace arrow { - -namespace parquet { - -const auto BOOL = std::make_shared(); -const auto UINT8 = std::make_shared(); -const auto INT32 = std::make_shared(); -const auto INT64 = std::make_shared(); -const auto FLOAT = std::make_shared(); -const auto DOUBLE = std::make_shared(); -const auto UTF8 = std::make_shared(); -const auto TIMESTAMP_MS = std::make_shared(TimestampType::Unit::MILLI); -// TODO: This requires parquet-cpp implementing the MICROS enum value -// const auto TIMESTAMP_US = std::make_shared(TimestampType::Unit::MICRO); -const auto BINARY = std::make_shared(std::make_shared("", UINT8)); -const auto DECIMAL_8_4 = std::make_shared(8, 4); - -class TestConvertParquetSchema : public ::testing::Test { - public: - virtual void SetUp() {} - - void CheckFlatSchema(const std::shared_ptr& expected_schema) { - ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); - for (int i = 0; i < expected_schema->num_fields(); ++i) { - auto lhs = result_schema_->field(i); - auto rhs = expected_schema->field(i); - EXPECT_TRUE(lhs->Equals(rhs)) << i << " " << lhs->ToString() - << " != " << rhs->ToString(); - } - } - - Status ConvertSchema(const std::vector& nodes) { - NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); - descr_.Init(schema); - return FromParquetSchema(&descr_, &result_schema_); - } - - protected: - ::parquet::SchemaDescriptor descr_; - std::shared_ptr result_schema_; -}; - -TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { - std::vector parquet_fields; - std::vector> arrow_fields; - - parquet_fields.push_back( - PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN)); - arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); - - parquet_fields.push_back( - PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32)); - arrow_fields.push_back(std::make_shared("int32", INT32, false)); - - parquet_fields.push_back( - PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); - arrow_fields.push_back(std::make_shared("int64", INT64, false)); - - parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, - ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); - arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); - - // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, - // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); - // arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_US, false)); - - parquet_fields.push_back( - PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); - arrow_fields.push_back(std::make_shared("float", FLOAT)); - - parquet_fields.push_back( - PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE)); - arrow_fields.push_back(std::make_shared("double", DOUBLE)); - - parquet_fields.push_back( - PrimitiveNode::Make("binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY)); - arrow_fields.push_back(std::make_shared("binary", BINARY)); - - parquet_fields.push_back(PrimitiveNode::Make( - "string", 
Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); - arrow_fields.push_back(std::make_shared("string", UTF8)); - - parquet_fields.push_back(PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, - ParquetType::FIXED_LEN_BYTE_ARRAY, LogicalType::NONE, 12)); - arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); - - auto arrow_schema = std::make_shared(arrow_fields); - ASSERT_OK(ConvertSchema(parquet_fields)); - - CheckFlatSchema(arrow_schema); -} - -TEST_F(TestConvertParquetSchema, ParquetFlatDecimals) { - std::vector parquet_fields; - std::vector> arrow_fields; - - parquet_fields.push_back(PrimitiveNode::Make("flba-decimal", Repetition::OPTIONAL, - ParquetType::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 4, 8, 4)); - arrow_fields.push_back(std::make_shared("flba-decimal", DECIMAL_8_4)); - - parquet_fields.push_back(PrimitiveNode::Make("binary-decimal", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, LogicalType::DECIMAL, -1, 8, 4)); - arrow_fields.push_back(std::make_shared("binary-decimal", DECIMAL_8_4)); - - parquet_fields.push_back(PrimitiveNode::Make("int32-decimal", Repetition::OPTIONAL, - ParquetType::INT32, LogicalType::DECIMAL, -1, 8, 4)); - arrow_fields.push_back(std::make_shared("int32-decimal", DECIMAL_8_4)); - - parquet_fields.push_back(PrimitiveNode::Make("int64-decimal", Repetition::OPTIONAL, - ParquetType::INT64, LogicalType::DECIMAL, -1, 8, 4)); - arrow_fields.push_back(std::make_shared("int64-decimal", DECIMAL_8_4)); - - auto arrow_schema = std::make_shared(arrow_fields); - ASSERT_OK(ConvertSchema(parquet_fields)); - - CheckFlatSchema(arrow_schema); -} - -TEST_F(TestConvertParquetSchema, UnsupportedThings) { - std::vector unsupported_nodes; - - unsupported_nodes.push_back( - PrimitiveNode::Make("int96", Repetition::REQUIRED, ParquetType::INT96)); - - unsupported_nodes.push_back( - GroupNode::Make("repeated-group", Repetition::REPEATED, {})); - - unsupported_nodes.push_back(PrimitiveNode::Make( - "int32", Repetition::OPTIONAL, ParquetType::INT32, LogicalType::DATE)); - - for (const NodePtr& node : unsupported_nodes) { - ASSERT_RAISES(NotImplemented, ConvertSchema({node})); - } -} - -class TestConvertArrowSchema : public ::testing::Test { - public: - virtual void SetUp() {} - - void CheckFlatSchema(const std::vector& nodes) { - NodePtr schema_node = GroupNode::Make("schema", Repetition::REPEATED, nodes); - const GroupNode* expected_schema_node = - static_cast(schema_node.get()); - const GroupNode* result_schema_node = result_schema_->group_node(); - - ASSERT_EQ(expected_schema_node->field_count(), result_schema_node->field_count()); - - for (int i = 0; i < expected_schema_node->field_count(); i++) { - auto lhs = result_schema_node->field(i); - auto rhs = expected_schema_node->field(i); - EXPECT_TRUE(lhs->Equals(rhs.get())); - } - } - - Status ConvertSchema(const std::vector>& fields) { - arrow_schema_ = std::make_shared(fields); - std::shared_ptr<::parquet::WriterProperties> properties = - ::parquet::default_writer_properties(); - return ToParquetSchema(arrow_schema_.get(), *properties.get(), &result_schema_); - } - - protected: - std::shared_ptr arrow_schema_; - std::shared_ptr<::parquet::SchemaDescriptor> result_schema_; -}; - -TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) { - std::vector parquet_fields; - std::vector> arrow_fields; - - parquet_fields.push_back( - PrimitiveNode::Make("boolean", Repetition::REQUIRED, ParquetType::BOOLEAN)); - arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); - - 
parquet_fields.push_back( - PrimitiveNode::Make("int32", Repetition::REQUIRED, ParquetType::INT32)); - arrow_fields.push_back(std::make_shared("int32", INT32, false)); - - parquet_fields.push_back( - PrimitiveNode::Make("int64", Repetition::REQUIRED, ParquetType::INT64)); - arrow_fields.push_back(std::make_shared("int64", INT64, false)); - - parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, - ParquetType::INT64, LogicalType::TIMESTAMP_MILLIS)); - arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_MS, false)); - - // parquet_fields.push_back(PrimitiveNode::Make("timestamp", Repetition::REQUIRED, - // ParquetType::INT64, LogicalType::TIMESTAMP_MICROS)); - // arrow_fields.push_back(std::make_shared("timestamp", TIMESTAMP_US, false)); - - parquet_fields.push_back( - PrimitiveNode::Make("float", Repetition::OPTIONAL, ParquetType::FLOAT)); - arrow_fields.push_back(std::make_shared("float", FLOAT)); - - parquet_fields.push_back( - PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE)); - arrow_fields.push_back(std::make_shared("double", DOUBLE)); - - // TODO: String types need to be clarified a bit more in the Arrow spec - parquet_fields.push_back(PrimitiveNode::Make( - "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8)); - arrow_fields.push_back(std::make_shared("string", UTF8)); - - ASSERT_OK(ConvertSchema(arrow_fields)); - - CheckFlatSchema(parquet_fields); -} - -TEST_F(TestConvertArrowSchema, ParquetFlatDecimals) { - std::vector parquet_fields; - std::vector> arrow_fields; - - // TODO: Test Decimal Arrow -> Parquet conversion - - ASSERT_OK(ConvertSchema(arrow_fields)); - - CheckFlatSchema(parquet_fields); -} - -TEST(TestNodeConversion, DateAndTime) {} - -} // namespace parquet - -} // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc deleted file mode 100644 index 0c2fc6e8fc7..00000000000 --- a/cpp/src/arrow/parquet/reader.cc +++ /dev/null @@ -1,401 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
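reader.cc, whose deletion diff follows, implements the read path declared in reader.h; a sketch of the intended call sequence (the wrapper name is invented; any arrow::io::ReadableFileInterface implementation works as the source):

#include "arrow/parquet/reader.h"
#include "arrow/util/memory-pool.h"

// Sketch: open a Parquet file through the Arrow adapter and materialize all
// flat columns into a single arrow::Table.
arrow::Status ReadWholeFile(
    const std::shared_ptr<arrow::io::ReadableFileInterface>& file,
    std::shared_ptr<arrow::Table>* table) {
  arrow::parquet::ParquetAllocator allocator(arrow::default_memory_pool());
  std::unique_ptr<arrow::parquet::FileReader> reader;
  RETURN_NOT_OK(arrow::parquet::OpenFile(file, &allocator, &reader));
  return reader->ReadFlatTable(table);
}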
- -#include "arrow/parquet/reader.h" - -#include -#include -#include -#include - -#include "arrow/column.h" -#include "arrow/parquet/io.h" -#include "arrow/parquet/schema.h" -#include "arrow/parquet/utils.h" -#include "arrow/schema.h" -#include "arrow/table.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" -#include "arrow/util/status.h" - -using parquet::ColumnReader; -using parquet::Repetition; -using parquet::TypedColumnReader; - -// Help reduce verbosity -using ParquetRAS = parquet::RandomAccessSource; -using ParquetReader = parquet::ParquetFileReader; - -namespace arrow { -namespace parquet { - -template -struct ArrowTypeTraits { - typedef NumericBuilder builder_type; -}; - -template <> -struct ArrowTypeTraits { - typedef BooleanBuilder builder_type; -}; - -template -using BuilderType = typename ArrowTypeTraits::builder_type; - -class FileReader::Impl { - public: - Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader); - virtual ~Impl() {} - - bool CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr); - Status GetFlatColumn(int i, std::unique_ptr* out); - Status ReadFlatColumn(int i, std::shared_ptr* out); - Status ReadFlatTable(std::shared_ptr
* out); - - private: - MemoryPool* pool_; - std::unique_ptr<::parquet::ParquetFileReader> reader_; -}; - -class FlatColumnReader::Impl { - public: - Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, - ::parquet::ParquetFileReader* reader, int column_index); - virtual ~Impl() {} - - Status NextBatch(int batch_size, std::shared_ptr* out); - template - Status TypedReadBatch(int batch_size, std::shared_ptr* out); - - template - Status ReadNullableFlatBatch(const int16_t* def_levels, - typename ParquetType::c_type* values, int64_t values_read, int64_t levels_read, - BuilderType* builder); - template - Status ReadNonNullableBatch(typename ParquetType::c_type* values, int64_t values_read, - BuilderType* builder); - - private: - void NextRowGroup(); - - template - struct can_copy_ptr { - static constexpr bool value = - std::is_same::value || - (std::is_integral{} && std::is_integral{} && - (sizeof(InType) == sizeof(OutType))); - }; - - template ::value>::type* = nullptr> - Status ConvertPhysicalType( - const InType* in_ptr, int64_t length, const OutType** out_ptr) { - *out_ptr = reinterpret_cast(in_ptr); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status ConvertPhysicalType( - const InType* in_ptr, int64_t length, const OutType** out_ptr) { - RETURN_NOT_OK(values_builder_buffer_.Resize(length * sizeof(OutType))); - OutType* mutable_out_ptr = - reinterpret_cast(values_builder_buffer_.mutable_data()); - std::copy(in_ptr, in_ptr + length, mutable_out_ptr); - *out_ptr = mutable_out_ptr; - return Status::OK(); - } - - MemoryPool* pool_; - const ::parquet::ColumnDescriptor* descr_; - ::parquet::ParquetFileReader* reader_; - int column_index_; - int next_row_group_; - std::shared_ptr column_reader_; - std::shared_ptr field_; - - PoolBuffer values_buffer_; - PoolBuffer def_levels_buffer_; - PoolBuffer values_builder_buffer_; - PoolBuffer valid_bytes_buffer_; -}; - -FileReader::Impl::Impl( - MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader) - : pool_(pool), reader_(std::move(reader)) {} - -bool FileReader::Impl::CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr) { - if ((descr->max_repetition_level() > 0) || (descr->max_definition_level() > 1)) { - return false; - } else if ((descr->max_definition_level() == 1) && - (descr->schema_node()->repetition() != Repetition::OPTIONAL)) { - return false; - } - return true; -} - -Status FileReader::Impl::GetFlatColumn(int i, std::unique_ptr* out) { - const ::parquet::SchemaDescriptor* schema = reader_->metadata()->schema(); - - if (!CheckForFlatColumn(schema->Column(i))) { - return Status::Invalid("The requested column is not flat"); - } - std::unique_ptr impl( - new FlatColumnReader::Impl(pool_, schema->Column(i), reader_.get(), i)); - *out = std::unique_ptr(new FlatColumnReader(std::move(impl))); - return Status::OK(); -} - -Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { - std::unique_ptr flat_column_reader; - RETURN_NOT_OK(GetFlatColumn(i, &flat_column_reader)); - return flat_column_reader->NextBatch(reader_->metadata()->num_rows(), out); -} - -Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>* table) {
-  auto descr = reader_->metadata()->schema();
-
-  const std::string& name = descr->name();
-  std::shared_ptr<Schema> schema;
-  RETURN_NOT_OK(FromParquetSchema(descr, &schema));
-
-  int num_columns = reader_->metadata()->num_columns();
-
-  std::vector<std::shared_ptr<Column>> columns(num_columns);
-  for (int i = 0; i < num_columns; i++) {
-    std::shared_ptr<Array> array;
-    RETURN_NOT_OK(ReadFlatColumn(i, &array));
-    columns[i] = std::make_shared<Column>(schema->field(i), array);
-  }
-
-  *table = std::make_shared<Table>(name, schema, columns);
-  return Status::OK();
-}
-
-FileReader::FileReader(
-    MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
-    : impl_(new FileReader::Impl(pool, std::move(reader))) {}
-
-FileReader::~FileReader() {}
-
-// Static ctor
-Status OpenFile(const std::shared_ptr<io::ReadableFileInterface>& file,
-    ParquetAllocator* allocator, std::unique_ptr<FileReader>* reader) {
-  std::unique_ptr<ParquetReadSource> source(new ParquetReadSource(allocator));
-  RETURN_NOT_OK(source->Open(file));
-
-  // TODO(wesm): reader properties
-  std::unique_ptr<ParquetReader> pq_reader;
-  PARQUET_CATCH_NOT_OK(pq_reader = ParquetReader::Open(std::move(source)));
-
-  // Use the same memory pool as the ParquetAllocator
-  reader->reset(new FileReader(allocator->pool(), std::move(pq_reader)));
-  return Status::OK();
-}
-
-Status FileReader::GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out) {
-  return impl_->GetFlatColumn(i, out);
-}
-
-Status FileReader::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
-  return impl_->ReadFlatColumn(i, out);
-}
-
-Status FileReader::ReadFlatTable(std::shared_ptr<Table>
* out) { - return impl_->ReadFlatTable(out); -} - -FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, - ::parquet::ParquetFileReader* reader, int column_index) - : pool_(pool), - descr_(descr), - reader_(reader), - column_index_(column_index), - next_row_group_(0), - values_buffer_(pool), - def_levels_buffer_(pool) { - NodeToField(descr_->schema_node(), &field_); - NextRowGroup(); -} - -template -Status FlatColumnReader::Impl::ReadNonNullableBatch(typename ParquetType::c_type* values, - int64_t values_read, BuilderType* builder) { - using ArrowCType = typename ArrowType::c_type; - using ParquetCType = typename ParquetType::c_type; - - DCHECK(builder); - const ArrowCType* values_ptr = nullptr; - RETURN_NOT_OK( - (ConvertPhysicalType(values, values_read, &values_ptr))); - RETURN_NOT_OK(builder->Append(values_ptr, values_read)); - return Status::OK(); -} - -template -Status FlatColumnReader::Impl::ReadNullableFlatBatch(const int16_t* def_levels, - typename ParquetType::c_type* values, int64_t values_read, int64_t levels_read, - BuilderType* builder) { - using ArrowCType = typename ArrowType::c_type; - - DCHECK(builder); - RETURN_NOT_OK(values_builder_buffer_.Resize(levels_read * sizeof(ArrowCType))); - RETURN_NOT_OK(valid_bytes_buffer_.Resize(levels_read * sizeof(uint8_t))); - auto values_ptr = reinterpret_cast(values_builder_buffer_.mutable_data()); - uint8_t* valid_bytes = valid_bytes_buffer_.mutable_data(); - int values_idx = 0; - for (int64_t i = 0; i < levels_read; i++) { - if (def_levels[i] < descr_->max_definition_level()) { - valid_bytes[i] = 0; - } else { - valid_bytes[i] = 1; - values_ptr[i] = values[values_idx++]; - } - } - RETURN_NOT_OK(builder->Append(values_ptr, levels_read, valid_bytes)); - return Status::OK(); -} - -template -Status FlatColumnReader::Impl::TypedReadBatch( - int batch_size, std::shared_ptr* out) { - using ParquetCType = typename ParquetType::c_type; - - int values_to_read = batch_size; - BuilderType builder(pool_, field_->type); - while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(ParquetCType)); - if (descr_->max_definition_level() > 0) { - def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); - } - auto reader = dynamic_cast*>(column_reader_.get()); - int64_t values_read; - int64_t levels_read; - int16_t* def_levels = reinterpret_cast(def_levels_buffer_.mutable_data()); - auto values = reinterpret_cast(values_buffer_.mutable_data()); - PARQUET_CATCH_NOT_OK(levels_read = reader->ReadBatch( - values_to_read, def_levels, nullptr, values, &values_read)); - values_to_read -= levels_read; - if (descr_->max_definition_level() == 0) { - RETURN_NOT_OK( - (ReadNonNullableBatch(values, values_read, &builder))); - } else { - // As per the defintion and checks for flat columns: - // descr_->max_definition_level() == 1 - RETURN_NOT_OK((ReadNullableFlatBatch( - def_levels, values, values_read, levels_read, &builder))); - } - if (!column_reader_->HasNext()) { NextRowGroup(); } - } - *out = builder.Finish(); - return Status::OK(); -} - -template <> -Status FlatColumnReader::Impl::TypedReadBatch( - int batch_size, std::shared_ptr* out) { - int values_to_read = batch_size; - StringBuilder builder(pool_, field_->type); - while ((values_to_read > 0) && column_reader_) { - values_buffer_.Resize(values_to_read * sizeof(::parquet::ByteArray)); - if (descr_->max_definition_level() > 0) { - def_levels_buffer_.Resize(values_to_read * sizeof(int16_t)); - } - auto reader = - 
dynamic_cast*>(column_reader_.get()); - int64_t values_read; - int64_t levels_read; - int16_t* def_levels = reinterpret_cast(def_levels_buffer_.mutable_data()); - auto values = reinterpret_cast<::parquet::ByteArray*>(values_buffer_.mutable_data()); - PARQUET_CATCH_NOT_OK(levels_read = reader->ReadBatch( - values_to_read, def_levels, nullptr, values, &values_read)); - values_to_read -= levels_read; - if (descr_->max_definition_level() == 0) { - for (int64_t i = 0; i < levels_read; i++) { - RETURN_NOT_OK( - builder.Append(reinterpret_cast(values[i].ptr), values[i].len)); - } - } else { - // descr_->max_definition_level() == 1 - int values_idx = 0; - for (int64_t i = 0; i < levels_read; i++) { - if (def_levels[i] < descr_->max_definition_level()) { - RETURN_NOT_OK(builder.AppendNull()); - } else { - RETURN_NOT_OK( - builder.Append(reinterpret_cast(values[values_idx].ptr), - values[values_idx].len)); - values_idx++; - } - } - } - if (!column_reader_->HasNext()) { NextRowGroup(); } - } - *out = builder.Finish(); - return Status::OK(); -} - -#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ - case Type::ENUM: \ - return TypedReadBatch(batch_size, out); \ - break; - -Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr* out) { - if (!column_reader_) { - // Exhausted all row groups. - *out = nullptr; - return Status::OK(); - } - - switch (field_->type->type) { - TYPED_BATCH_CASE(BOOL, BooleanType, ::parquet::BooleanType) - TYPED_BATCH_CASE(UINT8, UInt8Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(INT8, Int8Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(UINT16, UInt16Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(INT16, Int16Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(UINT32, UInt32Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type) - TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type) - TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) - TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) - TYPED_BATCH_CASE(STRING, StringType, ::parquet::ByteArrayType) - TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type) - default: - return Status::NotImplemented(field_->type->ToString()); - } -} - -void FlatColumnReader::Impl::NextRowGroup() { - if (next_row_group_ < reader_->metadata()->num_row_groups()) { - column_reader_ = reader_->RowGroup(next_row_group_)->Column(column_index_); - next_row_group_++; - } else { - column_reader_ = nullptr; - } -} - -FlatColumnReader::FlatColumnReader(std::unique_ptr impl) : impl_(std::move(impl)) {} - -FlatColumnReader::~FlatColumnReader() {} - -Status FlatColumnReader::NextBatch(int batch_size, std::shared_ptr* out) { - return impl_->NextBatch(batch_size, out); -} - -} // namespace parquet -} // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h deleted file mode 100644 index 2689bebea30..00000000000 --- a/cpp/src/arrow/parquet/reader.h +++ /dev/null @@ -1,146 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_PARQUET_READER_H
-#define ARROW_PARQUET_READER_H
-
-#include <memory>
-
-#include "parquet/api/reader.h"
-#include "parquet/api/schema.h"
-
-#include "arrow/io/interfaces.h"
-#include "arrow/parquet/io.h"
-#include "arrow/util/visibility.h"
-
-namespace arrow {
-
-class Array;
-class MemoryPool;
-class RecordBatch;
-class Status;
-class Table;
-
-namespace parquet {
-
-class FlatColumnReader;
-
-// Arrow read adapter class for deserializing Parquet files as Arrow row
-// batches.
-//
-// TODO(wesm): nested data does not always make sense with this user
-// interface unless you are only reading a single leaf node from a branch of
-// a table. For example:
-//
-// repeated group data {
-//   optional group record {
-//     optional int32 val1;
-//     optional byte_array val2;
-//     optional bool val3;
-//   }
-//   optional int32 val4;
-// }
-//
-// In the Parquet file, there are 4 leaf nodes:
-//
-// * data.record.val1
-// * data.record.val2
-// * data.record.val3
-// * data.val4
-//
-// When materializing this data in an Arrow array, we would have:
-//
-// data: list<struct<
-//   record: struct<
-//    val1: int32,
-//    val2: string (= list<uint8>),
-//    val3: bool,
-//   >,
-//   val4: int32
-// >>
-//
-// However, in the Parquet format, each leaf node has its own repetition and
-// definition levels describing the structure of the intermediate nodes in
-// this array structure. Thus, we will need to scan the leaf data for a group
-// of leaf nodes part of the same type tree to create a single result Arrow
-// nested array structure.
-//
-// This is additionally complicated by "chunky" repeated fields or very large
-// byte arrays
-class ARROW_EXPORT FileReader {
- public:
-  FileReader(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader);
-
-  // Since the distribution of columns amongst a Parquet file's row groups may
-  // be uneven (the number of values in each column chunk can be different), we
-  // provide a column-oriented read interface. The ColumnReader hides the
-  // details of paging through the file's row groups and yielding
-  // fully-materialized arrow::Array instances
-  //
-  // Returns error status if the column of interest is not flat.
-  Status GetFlatColumn(int i, std::unique_ptr<FlatColumnReader>* out);
-  // Read column as a whole into an Array.
-  Status ReadFlatColumn(int i, std::shared_ptr<Array>* out);
-  // Read a table of flat columns into a Table.
-  Status ReadFlatTable(std::shared_ptr<Table>* out);
-
-  virtual ~FileReader();
-
- private:
-  class ARROW_NO_EXPORT Impl;
-  std::unique_ptr<Impl> impl_;
-};
-
-// At this point, the column reader is a stream iterator. It only knows how to
-// read the next batch of values for a particular column from the file until it
-// runs out.
-//
-// We also do not expose any internal Parquet details, such as row groups. This
-// might change in the future.
-class ARROW_EXPORT FlatColumnReader {
- public:
-  virtual ~FlatColumnReader();
-
-  // Scan the next array of the indicated size. The actual size of the
-  // returned array may be less than the passed size depending on how much
-  // data is available in the file.
-  //
-  // When all the data in the file has been exhausted, the result is set to
-  // nullptr.
-  //
-  // Returns Status::OK on a successful read, including if you have exhausted
-  // the data available in the file.
-  Status NextBatch(int batch_size, std::shared_ptr<Array>* out);
-
- private:
-  class ARROW_NO_EXPORT Impl;
-  std::unique_ptr<Impl> impl_;
-  explicit FlatColumnReader(std::unique_ptr<Impl> impl);
-
-  friend class FileReader;
-};
-
-// Helper function to create a file reader from an implementation of an Arrow
-// readable file
-ARROW_EXPORT
-Status OpenFile(const std::shared_ptr<io::ReadableFileInterface>& file,
-    ParquetAllocator* allocator, std::unique_ptr<FileReader>* reader);
-
-} // namespace parquet
-} // namespace arrow
-
-#endif // ARROW_PARQUET_READER_H
diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc
deleted file mode 100644
index ff32e51bacd..00000000000
--- a/cpp/src/arrow/parquet/schema.cc
+++ /dev/null
@@ -1,344 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
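The NextBatch contract above (a materialized array per call, nullptr once exhausted) implies a simple drain loop; a sketch against the declared API (the helper name is invented, the batch size is arbitrary):

// Drain a single flat column in fixed-size batches.
arrow::Status DrainColumn(arrow::parquet::FlatColumnReader* reader,
    std::vector<std::shared_ptr<arrow::Array>>* batches) {
  while (true) {
    std::shared_ptr<arrow::Array> chunk;
    RETURN_NOT_OK(reader->NextBatch(4096, &chunk));  // 4096 is an arbitrary size
    if (chunk == nullptr) break;  // column exhausted
    batches->push_back(chunk);
  }
  return arrow::Status::OK();
}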
- -#include "arrow/parquet/schema.h" - -#include -#include - -#include "parquet/api/schema.h" - -#include "arrow/parquet/utils.h" -#include "arrow/types/decimal.h" -#include "arrow/types/string.h" -#include "arrow/util/status.h" - -using parquet::Repetition; -using parquet::schema::Node; -using parquet::schema::NodePtr; -using parquet::schema::GroupNode; -using parquet::schema::PrimitiveNode; - -using ParquetType = parquet::Type; -using parquet::LogicalType; - -namespace arrow { - -namespace parquet { - -const auto BOOL = std::make_shared(); -const auto UINT8 = std::make_shared(); -const auto INT8 = std::make_shared(); -const auto UINT16 = std::make_shared(); -const auto INT16 = std::make_shared(); -const auto UINT32 = std::make_shared(); -const auto INT32 = std::make_shared(); -const auto UINT64 = std::make_shared(); -const auto INT64 = std::make_shared(); -const auto FLOAT = std::make_shared(); -const auto DOUBLE = std::make_shared(); -const auto UTF8 = std::make_shared(); -const auto TIMESTAMP_MS = std::make_shared(TimestampType::Unit::MILLI); -const auto BINARY = std::make_shared(std::make_shared("", UINT8)); - -TypePtr MakeDecimalType(const PrimitiveNode* node) { - int precision = node->decimal_metadata().precision; - int scale = node->decimal_metadata().scale; - return std::make_shared(precision, scale); -} - -static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { - switch (node->logical_type()) { - case LogicalType::UTF8: - *out = UTF8; - break; - case LogicalType::DECIMAL: - *out = MakeDecimalType(node); - break; - default: - // BINARY - *out = BINARY; - break; - } - return Status::OK(); -} - -static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { - switch (node->logical_type()) { - case LogicalType::NONE: - *out = BINARY; - break; - case LogicalType::DECIMAL: - *out = MakeDecimalType(node); - break; - default: - return Status::NotImplemented("unhandled type"); - break; - } - - return Status::OK(); -} - -static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { - switch (node->logical_type()) { - case LogicalType::NONE: - *out = INT32; - break; - case LogicalType::UINT_8: - *out = UINT8; - break; - case LogicalType::INT_8: - *out = INT8; - break; - case LogicalType::UINT_16: - *out = UINT16; - break; - case LogicalType::INT_16: - *out = INT16; - break; - case LogicalType::UINT_32: - *out = UINT32; - break; - case LogicalType::DECIMAL: - *out = MakeDecimalType(node); - break; - default: - return Status::NotImplemented("Unhandled logical type for int32"); - break; - } - return Status::OK(); -} - -static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { - switch (node->logical_type()) { - case LogicalType::NONE: - *out = INT64; - break; - case LogicalType::UINT_64: - *out = UINT64; - break; - case LogicalType::DECIMAL: - *out = MakeDecimalType(node); - break; - case LogicalType::TIMESTAMP_MILLIS: - *out = TIMESTAMP_MS; - break; - default: - return Status::NotImplemented("Unhandled logical type for int64"); - break; - } - return Status::OK(); -} - -// TODO: Logical Type Handling -Status NodeToField(const NodePtr& node, std::shared_ptr* out) { - std::shared_ptr type; - - if (node->is_repeated()) { - return Status::NotImplemented("No support yet for repeated node types"); - } - - if (node->is_group()) { - const GroupNode* group = static_cast(node.get()); - std::vector> fields(group->field_count()); - for (int i = 0; i < group->field_count(); i++) { - RETURN_NOT_OK(NodeToField(group->field(i), &fields[i])); - } - type = 
std::make_shared(fields); - } else { - // Primitive (leaf) node - const PrimitiveNode* primitive = static_cast(node.get()); - - switch (primitive->physical_type()) { - case ParquetType::BOOLEAN: - type = BOOL; - break; - case ParquetType::INT32: - RETURN_NOT_OK(FromInt32(primitive, &type)); - break; - case ParquetType::INT64: - RETURN_NOT_OK(FromInt64(primitive, &type)); - break; - case ParquetType::INT96: - // TODO: Do we have that type in Arrow? - // type = TypePtr(new Int96Type()); - return Status::NotImplemented("int96"); - case ParquetType::FLOAT: - type = FLOAT; - break; - case ParquetType::DOUBLE: - type = DOUBLE; - break; - case ParquetType::BYTE_ARRAY: - // TODO: Do we have that type in Arrow? - RETURN_NOT_OK(FromByteArray(primitive, &type)); - break; - case ParquetType::FIXED_LEN_BYTE_ARRAY: - RETURN_NOT_OK(FromFLBA(primitive, &type)); - break; - } - } - - *out = std::make_shared(node->name(), type, !node->is_required()); - return Status::OK(); -} - -Status FromParquetSchema( - const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out) { - // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes - // from the root Parquet node - const GroupNode* schema_node = - static_cast(parquet_schema->group_node()); - - std::vector> fields(schema_node->field_count()); - for (int i = 0; i < schema_node->field_count(); i++) { - RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); - } - - *out = std::make_shared(fields); - return Status::OK(); -} - -Status StructToNode(const std::shared_ptr& type, const std::string& name, - bool nullable, const ::parquet::WriterProperties& properties, NodePtr* out) { - Repetition::type repetition = Repetition::REQUIRED; - if (nullable) { repetition = Repetition::OPTIONAL; } - - std::vector children(type->num_children()); - for (int i = 0; i < type->num_children(); i++) { - RETURN_NOT_OK(FieldToNode(type->child(i), properties, &children[i])); - } - - *out = GroupNode::Make(name, repetition, children); - return Status::OK(); -} - -Status FieldToNode(const std::shared_ptr& field, - const ::parquet::WriterProperties& properties, NodePtr* out) { - LogicalType::type logical_type = LogicalType::NONE; - ParquetType::type type; - Repetition::type repetition = Repetition::REQUIRED; - if (field->nullable) { repetition = Repetition::OPTIONAL; } - int length = -1; - - switch (field->type->type) { - // TODO: - // case Type::NA: - // break; - case Type::BOOL: - type = ParquetType::BOOLEAN; - break; - case Type::UINT8: - type = ParquetType::INT32; - logical_type = LogicalType::UINT_8; - break; - case Type::INT8: - type = ParquetType::INT32; - logical_type = LogicalType::INT_8; - break; - case Type::UINT16: - type = ParquetType::INT32; - logical_type = LogicalType::UINT_16; - break; - case Type::INT16: - type = ParquetType::INT32; - logical_type = LogicalType::INT_16; - break; - case Type::UINT32: - if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { - type = ParquetType::INT64; - } else { - type = ParquetType::INT32; - logical_type = LogicalType::UINT_32; - } - break; - case Type::INT32: - type = ParquetType::INT32; - break; - case Type::UINT64: - type = ParquetType::INT64; - logical_type = LogicalType::UINT_64; - break; - case Type::INT64: - type = ParquetType::INT64; - break; - case Type::FLOAT: - type = ParquetType::FLOAT; - break; - case Type::DOUBLE: - type = ParquetType::DOUBLE; - break; - case Type::STRING: - type = ParquetType::BYTE_ARRAY; - logical_type = LogicalType::UTF8; - break; - case Type::BINARY: - 
type = ParquetType::BYTE_ARRAY; - break; - case Type::DATE: - type = ParquetType::INT32; - logical_type = LogicalType::DATE; - break; - case Type::TIMESTAMP: { - auto timestamp_type = static_cast(field->type.get()); - if (timestamp_type->unit != TimestampType::Unit::MILLI) { - return Status::NotImplemented( - "Other timestamp units than millisecond are not yet support with parquet."); - } - type = ParquetType::INT64; - logical_type = LogicalType::TIMESTAMP_MILLIS; - } break; - case Type::TIMESTAMP_DOUBLE: - type = ParquetType::INT64; - // This is specified as seconds since the UNIX epoch - // TODO: Converted type in Parquet? - // logical_type = LogicalType::TIMESTAMP_MILLIS; - break; - case Type::TIME: - type = ParquetType::INT64; - logical_type = LogicalType::TIME_MILLIS; - break; - case Type::STRUCT: { - auto struct_type = std::static_pointer_cast(field->type); - return StructToNode(struct_type, field->name, field->nullable, properties, out); - } break; - default: - // TODO: LIST, DENSE_UNION, SPARE_UNION, JSON_SCALAR, DECIMAL, DECIMAL_TEXT, VARCHAR - return Status::NotImplemented("unhandled type"); - } - *out = PrimitiveNode::Make(field->name, repetition, type, logical_type, length); - return Status::OK(); -} - -Status ToParquetSchema(const Schema* arrow_schema, - const ::parquet::WriterProperties& properties, - std::shared_ptr<::parquet::SchemaDescriptor>* out) { - std::vector nodes(arrow_schema->num_fields()); - for (int i = 0; i < arrow_schema->num_fields(); i++) { - RETURN_NOT_OK(FieldToNode(arrow_schema->field(i), properties, &nodes[i])); - } - - NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); - *out = std::make_shared<::parquet::SchemaDescriptor>(); - PARQUET_CATCH_NOT_OK((*out)->Init(schema)); - - return Status::OK(); -} - -} // namespace parquet - -} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h deleted file mode 100644 index 88b5977d223..00000000000 --- a/cpp/src/arrow/parquet/schema.h +++ /dev/null @@ -1,53 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
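FieldToNode above is the single point where Arrow types pick their Parquet physical and logical encoding; one worked case as a comment-level sketch (the field is illustrative, not from the patch):

// Per the Type::TIMESTAMP case above, a nullable millisecond timestamp maps
// to an OPTIONAL INT64 node annotated with TIMESTAMP_MILLIS.
auto field = std::make_shared<arrow::Field>("ts",
    std::make_shared<arrow::TimestampType>(arrow::TimestampType::Unit::MILLI));
::parquet::schema::NodePtr node;
// FieldToNode(field, *::parquet::default_writer_properties(), &node);
// node is now equivalent to:
//   PrimitiveNode::Make("ts", Repetition::OPTIONAL, ParquetType::INT64,
//       LogicalType::TIMESTAMP_MILLIS);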
- -#ifndef ARROW_PARQUET_SCHEMA_H -#define ARROW_PARQUET_SCHEMA_H - -#include - -#include "parquet/api/schema.h" -#include "parquet/api/writer.h" - -#include "arrow/schema.h" -#include "arrow/type.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -class Status; - -namespace parquet { - -Status ARROW_EXPORT NodeToField( - const ::parquet::schema::NodePtr& node, std::shared_ptr* out); - -Status ARROW_EXPORT FromParquetSchema( - const ::parquet::SchemaDescriptor* parquet_schema, std::shared_ptr* out); - -Status ARROW_EXPORT FieldToNode(const std::shared_ptr& field, - const ::parquet::WriterProperties& properties, ::parquet::schema::NodePtr* out); - -Status ARROW_EXPORT ToParquetSchema(const Schema* arrow_schema, - const ::parquet::WriterProperties& properties, - std::shared_ptr<::parquet::SchemaDescriptor>* out); - -} // namespace parquet - -} // namespace arrow - -#endif // ARROW_PARQUET_SCHEMA_H diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h deleted file mode 100644 index 68a7fb94c2a..00000000000 --- a/cpp/src/arrow/parquet/test-util.h +++ /dev/null @@ -1,193 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include -#include - -#include "arrow/test-util.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" - -namespace arrow { - -namespace parquet { - -template -using is_arrow_float = std::is_floating_point; - -template -using is_arrow_int = std::is_integral; - -template -using is_arrow_string = std::is_same; - -template -typename std::enable_if::value, - std::shared_ptr>::type -NonNullArray(size_t size) { - std::vector values; - ::arrow::test::random_real(size, 0, 0, 1, &values); - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} - -template -typename std::enable_if::value, - std::shared_ptr>::type -NonNullArray(size_t size) { - std::vector values; - ::arrow::test::randint(size, 0, 64, &values); - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} - -template -typename std::enable_if::value, - std::shared_ptr>::type -NonNullArray(size_t size) { - StringBuilder builder(default_memory_pool(), std::make_shared()); - for (size_t i = 0; i < size; i++) { - builder.Append("test-string"); - } - return std::static_pointer_cast(builder.Finish()); -} - -template <> -std::shared_ptr NonNullArray(size_t size) { - std::vector values; - ::arrow::test::randint(size, 0, 1, &values); - BooleanBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} - -// This helper function only supports (size/2) nulls. -template -typename std::enable_if::value, - std::shared_ptr>::type -NullableArray(size_t size, size_t num_nulls) { - std::vector values; - ::arrow::test::random_real(size, 0, 0, 1, &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} - -// This helper function only supports (size/2) nulls. -template -typename std::enable_if::value, - std::shared_ptr>::type -NullableArray(size_t size, size_t num_nulls) { - std::vector values; - ::arrow::test::randint(size, 0, 64, &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} - -// This helper function only supports (size/2) nulls yet. -template -typename std::enable_if::value, - std::shared_ptr>::type -NullableArray(size_t size, size_t num_nulls) { - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - StringBuilder builder(default_memory_pool(), std::make_shared()); - for (size_t i = 0; i < size; i++) { - builder.Append("test-string"); - } - return std::static_pointer_cast(builder.Finish()); -} - -// This helper function only supports (size/2) nulls yet. 
-template <> -std::shared_ptr NullableArray( - size_t size, size_t num_nulls) { - std::vector values; - ::arrow::test::randint(size, 0, 1, &values); - std::vector valid_bytes(size, 1); - - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } - - BooleanBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} - -std::shared_ptr MakeColumn( - const std::string& name, const std::shared_ptr& array, bool nullable) { - auto field = std::make_shared(name, array->type(), nullable); - return std::make_shared(field, array); -} - -std::shared_ptr
-    const std::shared_ptr<Array>& values, bool nullable) {
-  std::shared_ptr<Column> column = MakeColumn("col", values, nullable);
-  std::vector<std::shared_ptr<Column>> columns({column});
-  std::vector<std::shared_ptr<Field>> fields({column->field()});
-  auto schema = std::make_shared<Schema>(fields);
-  return std::make_shared<Table>("table", schema, columns);
-}
-
-template <typename T>
-void ExpectArray(T* expected, Array* result) {
-  PrimitiveArray* p_array = static_cast<PrimitiveArray*>(result);
-  for (int i = 0; i < result->length(); i++) {
-    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->data()->data())[i]);
-  }
-}
-
-template <typename ArrowType>
-void ExpectArray(typename ArrowType::c_type* expected, Array* result) {
-  PrimitiveArray* p_array = static_cast<PrimitiveArray*>(result);
-  for (int64_t i = 0; i < result->length(); i++) {
-    EXPECT_EQ(expected[i],
-        reinterpret_cast<const typename ArrowType::c_type*>(p_array->data()->data())[i]);
-  }
-}
-
-template <>
-void ExpectArray<BooleanType>(uint8_t* expected, Array* result) {
-  BooleanBuilder builder(default_memory_pool(), std::make_shared<BooleanType>());
-  builder.Append(expected, result->length());
-  std::shared_ptr<Array> expected_array = builder.Finish();
-  EXPECT_TRUE(result->Equals(expected_array));
-}
-
-}  // namespace parquet
-
-}  // namespace arrow
diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h
deleted file mode 100644
index bcc46be60e6..00000000000
--- a/cpp/src/arrow/parquet/utils.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#ifndef ARROW_PARQUET_UTILS_H
-#define ARROW_PARQUET_UTILS_H
-
-#include <sstream>
-
-#include "arrow/util/status.h"
-#include "parquet/exception.h"
-
-namespace arrow {
-namespace parquet {
-
-#define PARQUET_CATCH_NOT_OK(s) \
-  try { \
-    (s); \
-  } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); }
-
-#define PARQUET_IGNORE_NOT_OK(s) \
-  try { \
-    (s); \
-  } catch (const ::parquet::ParquetException& e) {}
-
-#define PARQUET_THROW_NOT_OK(s) \
-  do { \
-    ::arrow::Status _s = (s); \
-    if (!_s.ok()) { \
-      std::stringstream ss; \
-      ss << "Arrow error: " << _s.ToString(); \
-      throw ::parquet::ParquetException(ss.str()); \
-    } \
-  } while (0);
-
-}  // namespace parquet
-}  // namespace arrow
-
-#endif  // ARROW_PARQUET_UTILS_H
diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc
deleted file mode 100644
index 2b47f1461c9..00000000000
--- a/cpp/src/arrow/parquet/writer.cc
+++ /dev/null
@@ -1,365 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.
See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/parquet/writer.h" - -#include -#include - -#include "arrow/array.h" -#include "arrow/column.h" -#include "arrow/table.h" -#include "arrow/types/construct.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" -#include "arrow/parquet/schema.h" -#include "arrow/parquet/utils.h" -#include "arrow/util/status.h" - -using parquet::ParquetFileWriter; -using parquet::ParquetVersion; -using parquet::schema::GroupNode; - -namespace arrow { -namespace parquet { - -class FileWriter::Impl { - public: - Impl(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); - - Status NewRowGroup(int64_t chunk_size); - template - Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data, - int64_t offset, int64_t length); - - // TODO(uwe): Same code as in reader.cc the only difference is the name of the temporary - // buffer - template - struct can_copy_ptr { - static constexpr bool value = - std::is_same::value || - (std::is_integral{} && std::is_integral{} && - (sizeof(InType) == sizeof(OutType))); - }; - - template ::value>::type* = nullptr> - Status ConvertPhysicalType(const InType* in_ptr, int64_t, const OutType** out_ptr) { - *out_ptr = reinterpret_cast(in_ptr); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status ConvertPhysicalType( - const InType* in_ptr, int64_t length, const OutType** out_ptr) { - RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(OutType))); - OutType* mutable_out_ptr = reinterpret_cast(data_buffer_.mutable_data()); - std::copy(in_ptr, in_ptr + length, mutable_out_ptr); - *out_ptr = mutable_out_ptr; - return Status::OK(); - } - - Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length); - Status WriteFlatColumnChunk(const StringArray* data, int64_t offset, int64_t length); - Status Close(); - - virtual ~Impl() {} - - private: - friend class FileWriter; - - MemoryPool* pool_; - // Buffer used for storing the data of an array converted to the physical type - // as expected by parquet-cpp. 
- PoolBuffer data_buffer_; - PoolBuffer def_levels_buffer_; - std::unique_ptr<::parquet::ParquetFileWriter> writer_; - ::parquet::RowGroupWriter* row_group_writer_; -}; - -FileWriter::Impl::Impl( - MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer) - : pool_(pool), - data_buffer_(pool), - writer_(std::move(writer)), - row_group_writer_(nullptr) {} - -Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) { - if (row_group_writer_ != nullptr) { PARQUET_CATCH_NOT_OK(row_group_writer_->Close()); } - PARQUET_CATCH_NOT_OK(row_group_writer_ = writer_->AppendRowGroup(chunk_size)); - return Status::OK(); -} - -template -Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, - const PrimitiveArray* data, int64_t offset, int64_t length) { - using ArrowCType = typename ArrowType::c_type; - using ParquetCType = typename ParquetType::c_type; - - DCHECK((offset + length) <= data->length()); - auto data_ptr = reinterpret_cast(data->data()->data()) + offset; - auto writer = - reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer); - if (writer->descr()->max_definition_level() == 0) { - // no nulls, just dump the data - const ParquetCType* data_writer_ptr = nullptr; - RETURN_NOT_OK((ConvertPhysicalType( - data_ptr, length, &data_writer_ptr))); - PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_writer_ptr)); - } else if (writer->descr()->max_definition_level() == 1) { - RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); - int16_t* def_levels_ptr = - reinterpret_cast(def_levels_buffer_.mutable_data()); - if (data->null_count() == 0) { - std::fill(def_levels_ptr, def_levels_ptr + length, 1); - const ParquetCType* data_writer_ptr = nullptr; - RETURN_NOT_OK((ConvertPhysicalType( - data_ptr, length, &data_writer_ptr))); - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(length, def_levels_ptr, nullptr, data_writer_ptr)); - } else { - RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(ParquetCType))); - auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); - int buffer_idx = 0; - for (int i = 0; i < length; i++) { - if (data->IsNull(offset + i)) { - def_levels_ptr[i] = 0; - } else { - def_levels_ptr[i] = 1; - buffer_ptr[buffer_idx++] = static_cast(data_ptr[i]); - } - } - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); - } - } else { - return Status::NotImplemented("no support for max definition level > 1 yet"); - } - PARQUET_CATCH_NOT_OK(writer->Close()); - return Status::OK(); -} - -// This specialization seems quite similar but it significantly differs in two points: -// * offset is added at the most latest time to the pointer as we have sub-byte access -// * Arrow data is stored bitwise thus we cannot use std::copy to transform from -// ArrowType::c_type to ParquetType::c_type -template <> -Status FileWriter::Impl::TypedWriteBatch<::parquet::BooleanType, BooleanType>( - ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data, int64_t offset, - int64_t length) { - DCHECK((offset + length) <= data->length()); - RETURN_NOT_OK(data_buffer_.Resize(length)); - auto data_ptr = reinterpret_cast(data->data()->data()); - auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); - auto writer = reinterpret_cast<::parquet::TypedColumnWriter<::parquet::BooleanType>*>( - column_writer); - if (writer->descr()->max_definition_level() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < length; i++) { - buffer_ptr[i] = util::get_bit(data_ptr, offset + i); 
- } - PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, buffer_ptr)); - } else if (writer->descr()->max_definition_level() == 1) { - RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); - int16_t* def_levels_ptr = - reinterpret_cast(def_levels_buffer_.mutable_data()); - if (data->null_count() == 0) { - std::fill(def_levels_ptr, def_levels_ptr + length, 1); - for (int64_t i = 0; i < length; i++) { - buffer_ptr[i] = util::get_bit(data_ptr, offset + i); - } - // TODO(PARQUET-644): write boolean values as a packed bitmap - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); - } else { - int buffer_idx = 0; - for (int i = 0; i < length; i++) { - if (data->IsNull(offset + i)) { - def_levels_ptr[i] = 0; - } else { - def_levels_ptr[i] = 1; - buffer_ptr[buffer_idx++] = util::get_bit(data_ptr, offset + i); - } - } - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); - } - } else { - return Status::NotImplemented("no support for max definition level > 1 yet"); - } - PARQUET_CATCH_NOT_OK(writer->Close()); - return Status::OK(); -} - -Status FileWriter::Impl::Close() { - if (row_group_writer_ != nullptr) { PARQUET_CATCH_NOT_OK(row_group_writer_->Close()); } - PARQUET_CATCH_NOT_OK(writer_->Close()); - return Status::OK(); -} - -#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ - case Type::ENUM: \ - return TypedWriteBatch(writer, data, offset, length); \ - break; - -Status FileWriter::Impl::WriteFlatColumnChunk( - const PrimitiveArray* data, int64_t offset, int64_t length) { - ::parquet::ColumnWriter* writer; - PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn()); - switch (data->type_enum()) { - TYPED_BATCH_CASE(BOOL, BooleanType, ::parquet::BooleanType) - TYPED_BATCH_CASE(UINT8, UInt8Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(INT8, Int8Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(UINT16, UInt16Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(INT16, Int16Type, ::parquet::Int32Type) - case Type::UINT32: - if (writer_->properties()->version() == ParquetVersion::PARQUET_1_0) { - // Parquet 1.0 reader cannot read the UINT_32 logical type. Thus we need - // to use the larger Int64Type to store them lossless. 
- return TypedWriteBatch<::parquet::Int64Type, UInt32Type>( - writer, data, offset, length); - } else { - return TypedWriteBatch<::parquet::Int32Type, UInt32Type>( - writer, data, offset, length); - } - TYPED_BATCH_CASE(INT32, Int32Type, ::parquet::Int32Type) - TYPED_BATCH_CASE(UINT64, UInt64Type, ::parquet::Int64Type) - TYPED_BATCH_CASE(INT64, Int64Type, ::parquet::Int64Type) - TYPED_BATCH_CASE(TIMESTAMP, TimestampType, ::parquet::Int64Type) - TYPED_BATCH_CASE(FLOAT, FloatType, ::parquet::FloatType) - TYPED_BATCH_CASE(DOUBLE, DoubleType, ::parquet::DoubleType) - default: - return Status::NotImplemented(data->type()->ToString()); - } -} - -Status FileWriter::Impl::WriteFlatColumnChunk( - const StringArray* data, int64_t offset, int64_t length) { - ::parquet::ColumnWriter* column_writer; - PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn()); - DCHECK((offset + length) <= data->length()); - RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(::parquet::ByteArray))); - auto buffer_ptr = reinterpret_cast<::parquet::ByteArray*>(data_buffer_.mutable_data()); - auto values = std::dynamic_pointer_cast(data->values()); - auto data_ptr = reinterpret_cast(values->data()->data()); - DCHECK(values != nullptr); - auto writer = reinterpret_cast<::parquet::TypedColumnWriter<::parquet::ByteArrayType>*>( - column_writer); - if (writer->descr()->max_definition_level() > 0) { - RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); - } - int16_t* def_levels_ptr = reinterpret_cast(def_levels_buffer_.mutable_data()); - if (writer->descr()->max_definition_level() == 0 || data->null_count() == 0) { - // no nulls, just dump the data - for (int64_t i = 0; i < length; i++) { - buffer_ptr[i] = ::parquet::ByteArray( - data->value_length(i + offset), data_ptr + data->value_offset(i)); - } - if (writer->descr()->max_definition_level() > 0) { - std::fill(def_levels_ptr, def_levels_ptr + length, 1); - } - PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); - } else if (writer->descr()->max_definition_level() == 1) { - int buffer_idx = 0; - for (int64_t i = 0; i < length; i++) { - if (data->IsNull(offset + i)) { - def_levels_ptr[i] = 0; - } else { - def_levels_ptr[i] = 1; - buffer_ptr[buffer_idx++] = ::parquet::ByteArray( - data->value_length(i + offset), data_ptr + data->value_offset(i + offset)); - } - } - PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); - } else { - return Status::NotImplemented("no support for max definition level > 1 yet"); - } - PARQUET_CATCH_NOT_OK(writer->Close()); - return Status::OK(); -} - -FileWriter::FileWriter( - MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer) - : impl_(new FileWriter::Impl(pool, std::move(writer))) {} - -Status FileWriter::NewRowGroup(int64_t chunk_size) { - return impl_->NewRowGroup(chunk_size); -} - -Status FileWriter::WriteFlatColumnChunk( - const Array* array, int64_t offset, int64_t length) { - int64_t real_length = length; - if (length == -1) { real_length = array->length(); } - if (array->type_enum() == Type::STRING) { - auto string_array = dynamic_cast(array); - DCHECK(string_array); - return impl_->WriteFlatColumnChunk(string_array, offset, real_length); - } else { - auto primitive_array = dynamic_cast(array); - if (!primitive_array) { - return Status::NotImplemented("Table must consist of PrimitiveArray instances"); - } - return impl_->WriteFlatColumnChunk(primitive_array, offset, real_length); - } -} - -Status FileWriter::Close() { - return 
impl_->Close(); -} - -MemoryPool* FileWriter::memory_pool() const { - return impl_->pool_; -} - -FileWriter::~FileWriter() {} - -Status WriteFlatTable(const Table* table, MemoryPool* pool, - const std::shared_ptr<::parquet::OutputStream>& sink, int64_t chunk_size, - const std::shared_ptr<::parquet::WriterProperties>& properties) { - std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; - RETURN_NOT_OK( - ToParquetSchema(table->schema().get(), *properties.get(), &parquet_schema)); - auto schema_node = std::static_pointer_cast(parquet_schema->schema_root()); - std::unique_ptr parquet_writer = - ParquetFileWriter::Open(sink, schema_node, properties); - FileWriter writer(pool, std::move(parquet_writer)); - - // TODO(ARROW-232) Support writing chunked arrays. - for (int i = 0; i < table->num_columns(); i++) { - if (table->column(i)->data()->num_chunks() != 1) { - return Status::NotImplemented("No support for writing chunked arrays yet."); - } - } - - for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { - int64_t offset = chunk * chunk_size; - int64_t size = std::min(chunk_size, table->num_rows() - offset); - RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); - for (int i = 0; i < table->num_columns(); i++) { - std::shared_ptr array = table->column(i)->data()->chunk(0); - RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(array.get(), offset, size), - PARQUET_IGNORE_NOT_OK(writer.Close())); - } - } - - return writer.Close(); -} - -} // namespace parquet - -} // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h deleted file mode 100644 index ecc6a9f8be3..00000000000 --- a/cpp/src/arrow/parquet/writer.h +++ /dev/null @@ -1,76 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_PARQUET_WRITER_H -#define ARROW_PARQUET_WRITER_H - -#include - -#include "parquet/api/schema.h" -#include "parquet/api/writer.h" - -#include "arrow/util/visibility.h" - -namespace arrow { - -class Array; -class MemoryPool; -class PrimitiveArray; -class RecordBatch; -class Status; -class StringArray; -class Table; - -namespace parquet { - -/** - * Iterative API: - * Start a new RowGroup/Chunk with NewRowGroup - * Write column-by-column the whole column chunk - */ -class ARROW_EXPORT FileWriter { - public: - FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); - - Status NewRowGroup(int64_t chunk_size); - Status WriteFlatColumnChunk(const Array* data, int64_t offset = 0, int64_t length = -1); - Status Close(); - - virtual ~FileWriter(); - - MemoryPool* memory_pool() const; - - private: - class ARROW_NO_EXPORT Impl; - std::unique_ptr impl_; -}; - -/** - * Write a flat Table to Parquet. 
- * - * The table shall only consist of nullable, non-repeated columns of primitive type. - */ -Status ARROW_EXPORT WriteFlatTable(const Table* table, MemoryPool* pool, - const std::shared_ptr<::parquet::OutputStream>& sink, int64_t chunk_size, - const std::shared_ptr<::parquet::WriterProperties>& properties = - ::parquet::default_writer_properties()); - -} // namespace parquet - -} // namespace arrow - -#endif // ARROW_PARQUET_WRITER_H diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 2f0037024c7..745ed8f7edb 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -64,7 +64,7 @@ Status StringArray::Validate() const { // This used to be a static member variable of BinaryBuilder, but it can cause // valgrind to report a (spurious?) memory leak when needed in other shared // libraries. The problem came up while adding explicit visibility to libarrow -// and libarrow_parquet +// and libparquet_arrow static TypePtr kBinaryValueType = TypePtr(new UInt8Type()); BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 522895808de..6357e3c1725 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,5 +1,5 @@ # Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file +# or more cod ntributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the @@ -294,12 +294,12 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) add_library(${LIB_NAME} STATIC IMPORTED) set_target_properties(${LIB_NAME} PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") - message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + message(STATUS "Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") else() add_library(${LIB_NAME} SHARED IMPORTED) set_target_properties(${LIB_NAME} PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") - message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + message(STATUS "Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") endif() if(ARG_DEPS) @@ -443,12 +443,12 @@ set(LINK_LIBS arrow_io ) -if(PARQUET_FOUND AND ARROW_PARQUET_FOUND) - ADD_THIRDPARTY_LIB(arrow_parquet - SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) +if(PARQUET_FOUND AND PARQUET_ARROW_FOUND) + ADD_THIRDPARTY_LIB(parquet_arrow + SHARED_LIB ${PARQUET_ARROW_SHARED_LIB}) set(LINK_LIBS ${LINK_LIBS} - arrow_parquet) + parquet_arrow) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} parquet) diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 5d5efc431a4..9919746520b 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -42,11 +42,6 @@ find_library(ARROW_LIB_PATH NAMES arrow ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -find_library(ARROW_PARQUET_LIB_PATH NAMES arrow_parquet - PATHS - ${ARROW_SEARCH_LIB_PATH} - NO_DEFAULT_PATH) - find_library(ARROW_IO_LIB_PATH NAMES arrow_io PATHS ${ARROW_SEARCH_LIB_PATH} @@ -56,7 +51,6 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) set(ARROW_IO_LIB_NAME libarrow_io) - set(ARROW_PARQUET_LIB_NAME libarrow_parquet) set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) @@ -82,20 +76,6 @@ else () set(ARROW_FOUND FALSE) endif () 
-if(ARROW_PARQUET_LIB_PATH) - set(ARROW_PARQUET_FOUND TRUE) - set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) - set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) - if (NOT Arrow_FIND_QUIETLY) - message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") - endif () -else() - if (NOT Arrow_FIND_QUIETLY) - message(STATUS "Could not find Arrow Parquet library") - endif() - set(ARROW_PARQUET_FOUND FALSE) -endif() - mark_as_advanced( ARROW_INCLUDE_DIR ARROW_LIBS @@ -103,6 +83,4 @@ mark_as_advanced( ARROW_SHARED_LIB ARROW_IO_STATIC_LIB ARROW_IO_SHARED_LIB - ARROW_PARQUET_STATIC_LIB - ARROW_PARQUET_SHARED_LIB ) diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index f932a931493..9085b0bb298 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -44,6 +44,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: cdef cppclass ColumnDescriptor: pass + cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: pass @@ -77,6 +78,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: @staticmethod unique_ptr[ParquetFileReader] OpenFile(const c_string& path) + cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: cdef cppclass ParquetOutputStream" parquet::OutputStream": pass @@ -91,7 +93,7 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: shared_ptr[WriterProperties] build() -cdef extern from "arrow/parquet/io.h" namespace "arrow::parquet" nogil: +cdef extern from "parquet/arrow/io.h" namespace "parquet::arrow" nogil: cdef cppclass ParquetAllocator: ParquetAllocator() ParquetAllocator(MemoryPool* pool) @@ -103,7 +105,7 @@ cdef extern from "arrow/parquet/io.h" namespace "arrow::parquet" nogil: Open(const shared_ptr[ReadableFileInterface]& file) -cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: +cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: CStatus OpenFile(const shared_ptr[ReadableFileInterface]& file, ParquetAllocator* allocator, unique_ptr[FileReader]* reader) @@ -113,14 +115,14 @@ cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: CStatus ReadFlatTable(shared_ptr[CTable]* out); -cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil: +cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, shared_ptr[CSchema]* out) CStatus ToParquetSchema(const CSchema* arrow_schema, shared_ptr[SchemaDescriptor]* out) -cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil: +cdef extern from "parquet/arrow/writer.h" namespace "parquet::arrow" nogil: cdef CStatus WriteFlatTable( const CTable* table, MemoryPool* pool, const shared_ptr[ParquetOutputStream]& sink, From 45d88328dd73a331b8099c07dc1332cc585ff8d2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 27 Sep 2016 09:45:05 -0400 Subject: [PATCH 143/210] ARROW-293: [C++] Implement Arrow IO interfaces for operating system files I started with the code I put together previously for Feather and conformed it to the `arrow::io` API. There's a bunch of Windows compatibility stuff; I left this until we add CI for Windows and can sort this out. We should also refactor the memory mapped file interfaces to be based on this common code (see ARROW-294). 
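To make the new API concrete, here is a minimal round-trip sketch against the
interfaces this patch declares in `file.h` below (the `RoundTrip` helper and
its payload are illustrative only, not part of the patch; `RETURN_NOT_OK` is
the existing status-propagation macro from `arrow/util/status.h`):

    #include <cstdint>
    #include <memory>
    #include <string>

    #include "arrow/io/file.h"
    #include "arrow/util/buffer.h"
    #include "arrow/util/status.h"

    // Write five bytes through FileOutputStream, then read them back
    // through ReadableFile. All failures propagate as arrow::Status.
    arrow::Status RoundTrip(const std::string& path) {
      std::shared_ptr<arrow::io::FileOutputStream> sink;
      RETURN_NOT_OK(arrow::io::FileOutputStream::Open(path, &sink));
      const uint8_t data[] = {'a', 'r', 'r', 'o', 'w'};
      RETURN_NOT_OK(sink->Write(data, sizeof(data)));
      RETURN_NOT_OK(sink->Close());

      std::shared_ptr<arrow::io::ReadableFile> source;
      RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, &source));
      std::shared_ptr<arrow::Buffer> contents;
      RETURN_NOT_OK(source->ReadAt(0, sizeof(data), &contents));
      return source->Close();
    }

Both `Open` factories hand the object back through an out-parameter and report
failures as `Status`, consistent with the rest of the `arrow::io` interfaces.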
Author: Wes McKinney Closes #146 from wesm/ARROW-293 and squashes the following commits: a2653b7 [Wes McKinney] cpplint d56ef06 [Wes McKinney] Test the rest of ReadableFile methods 43126ca [Wes McKinney] Drafting OS file IO implementations based on Feather implementation. Work on test suite --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/io/CMakeLists.txt | 6 + cpp/src/arrow/io/file.cc | 485 ++++++++++++++++++++++++++ cpp/src/arrow/io/file.h | 96 +++++ cpp/src/arrow/io/io-file-test.cc | 290 +++++++++++++++ cpp/src/arrow/io/libhdfs_shim.cc | 2 +- cpp/src/arrow/io/memory.h | 2 +- cpp/src/arrow/io/mman.h | 189 ++++++++++ cpp/src/arrow/ipc/adapter.cc | 4 +- cpp/src/arrow/ipc/file.cc | 2 +- cpp/src/arrow/types/primitive-test.cc | 3 +- cpp/src/arrow/util/logging.h | 6 +- cpp/src/arrow/util/memory-pool.cc | 4 +- cpp/src/arrow/util/status-test.cc | 2 +- 14 files changed, 1080 insertions(+), 13 deletions(-) create mode 100644 cpp/src/arrow/io/file.cc create mode 100644 cpp/src/arrow/io/file.h create mode 100644 cpp/src/arrow/io/io-file-test.cc create mode 100644 cpp/src/arrow/io/mman.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3f4a7dac01..d65c7153196 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -570,7 +570,7 @@ if (UNIX) add_custom_target(lint ${BUILD_SUPPORT_DIR}/cpplint.py --verbose=2 --linelength=90 - --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references + --filter=-whitespace/comments,-readability/todo,-build/header_guard,-build/c++11,-runtime/references,-build/include_order ${FILTERED_LINT_FILES}) endif (UNIX) diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 87e227ef80d..d2e3491b75f 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -38,6 +38,7 @@ set(ARROW_IO_TEST_LINK_LIBS ${ARROW_IO_PRIVATE_LINK_LIBS}) set(ARROW_IO_SRCS + file.cc memory.cc ) @@ -103,12 +104,17 @@ if (APPLE) INSTALL_NAME_DIR "@rpath") endif() +ADD_ARROW_TEST(io-file-test) +ARROW_TEST_LINK_LIBRARIES(io-file-test + ${ARROW_IO_TEST_LINK_LIBS}) + ADD_ARROW_TEST(io-memory-test) ARROW_TEST_LINK_LIBRARIES(io-memory-test ${ARROW_IO_TEST_LINK_LIBS}) # Headers: top level install(FILES + file.h hdfs.h interfaces.h memory.h diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc new file mode 100644 index 00000000000..87bae7f3928 --- /dev/null +++ b/cpp/src/arrow/io/file.cc @@ -0,0 +1,485 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Ensure 64-bit off_t for platforms where it matters +#ifdef _FILE_OFFSET_BITS +#undef _FILE_OFFSET_BITS +#endif + +#define _FILE_OFFSET_BITS 64 + +#include "arrow/io/file.h" + +#if _WIN32 || _WIN64 +#if _WIN64 +#define ENVIRONMENT64 +#else +#define ENVIRONMENT32 +#endif +#endif + +// sys/mman.h not present in Visual Studio or Cygwin +#ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include "arrow/io/mman.h" +#undef Realloc +#undef Free +#include +#else +#include +#endif + +#include +#include +#include + +#ifndef _MSC_VER // POSIX-like platforms + +#include + +// Not available on some platforms +#ifndef errno_t +#define errno_t int +#endif + +#endif // _MSC_VER + +// defines that +#if defined(__MINGW32__) +#define ARROW_WRITE_SHMODE S_IRUSR | S_IWUSR +#elif defined(_MSC_VER) // Visual Studio + +#else // gcc / clang on POSIX platforms +#define ARROW_WRITE_SHMODE S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH +#endif + +// ---------------------------------------------------------------------- +// C++ standard library + +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#endif + +// ---------------------------------------------------------------------- +// file compatibility stuff + +#if defined(__MINGW32__) // MinGW +// nothing +#elif defined(_MSC_VER) // Visual Studio +#include +#else // POSIX / Linux +// nothing +#endif + +#include + +// POSIX systems do not have this +#ifndef O_BINARY +#define O_BINARY 0 +#endif + +// ---------------------------------------------------------------------- +// Other Arrow includes + +#include "arrow/io/interfaces.h" + +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace io { + +// ---------------------------------------------------------------------- +// Cross-platform file compatability layer + +static inline Status CheckOpenResult( + int ret, int errno_actual, const char* filename, size_t filename_length) { + if (ret == -1) { + // TODO: errno codes to strings + std::stringstream ss; + ss << "Failed to open file: "; +#if defined(_MSC_VER) + // using wchar_t + + // this requires c++11 + std::wstring_convert, wchar_t> converter; + std::wstring wide_string( + reinterpret_cast(filename), filename_length / sizeof(wchar_t)); + std::string byte_string = converter.to_bytes(wide_string); + ss << byte_string; +#else + ss << filename; +#endif + return Status::IOError(ss.str()); + } + return Status::OK(); +} + +#define CHECK_LSEEK(retval) \ + if ((retval) == -1) return Status::IOError("lseek failed"); + +static inline int64_t lseek64_compat(int fd, int64_t pos, int whence) { +#if defined(_MSC_VER) + return _lseeki64(fd, pos, whence); +#else + return lseek(fd, pos, whence); +#endif +} + +static inline Status FileOpenReadable(const std::string& filename, int* fd) { + int ret; + errno_t errno_actual = 0; +#if defined(_MSC_VER) + // https://msdn.microsoft.com/en-us/library/w64k0ytk.aspx + + // See GH #209. 
Here we are assuming that the filename has been encoded in + // utf-16le so that unicode filenames can be supported + const int nwchars = static_cast(filename.size()) / sizeof(wchar_t); + std::vector wpath(nwchars + 1); + memcpy(wpath.data(), filename.data(), filename.size()); + memcpy(wpath.data() + nwchars, L"\0", sizeof(wchar_t)); + + errno_actual = _wsopen_s(fd, wpath.data(), _O_RDONLY | _O_BINARY, _SH_DENYNO, _S_IREAD); + ret = *fd; +#else + ret = *fd = open(filename.c_str(), O_RDONLY | O_BINARY); + errno_actual = errno; +#endif + + return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); +} + +static inline Status FileOpenWriteable(const std::string& filename, int* fd) { + int ret; + errno_t errno_actual = 0; + +#if defined(_MSC_VER) + // https://msdn.microsoft.com/en-us/library/w64k0ytk.aspx + // Same story with wchar_t as above + const int nwchars = static_cast(filename.size()) / sizeof(wchar_t); + std::vector wpath(nwchars + 1); + memcpy(wpath.data(), filename.data(), filename.size()); + memcpy(wpath.data() + nwchars, L"\0", sizeof(wchar_t)); + + errno_actual = _wsopen_s( + fd, wpath.data(), _O_WRONLY | _O_CREAT | _O_BINARY, _SH_DENYNO, _S_IWRITE); + ret = *fd; + +#else + ret = *fd = open(filename.c_str(), O_WRONLY | O_CREAT | O_BINARY, ARROW_WRITE_SHMODE); +#endif + return CheckOpenResult(ret, errno_actual, filename.c_str(), filename.size()); +} + +static inline Status FileTell(int fd, int64_t* pos) { + int64_t current_pos; + +#if defined(_MSC_VER) + current_pos = _telli64(fd); + if (current_pos == -1) { return Status::IOError("_telli64 failed"); } +#else + current_pos = lseek64_compat(fd, 0, SEEK_CUR); + CHECK_LSEEK(current_pos); +#endif + + *pos = current_pos; + return Status::OK(); +} + +static inline Status FileSeek(int fd, int64_t pos) { + int64_t ret = lseek64_compat(fd, pos, SEEK_SET); + CHECK_LSEEK(ret); + return Status::OK(); +} + +static inline Status FileRead( + int fd, uint8_t* buffer, int64_t nbytes, int64_t* bytes_read) { +#if defined(_MSC_VER) + if (nbytes > INT32_MAX) { return Status::IOError("Unable to read > 2GB blocks yet"); } + *bytes_read = _read(fd, buffer, static_cast(nbytes)); +#else + *bytes_read = read(fd, buffer, nbytes); +#endif + + if (*bytes_read == -1) { + // TODO(wesm): errno to string + return Status::IOError("Error reading bytes from file"); + } + + return Status::OK(); +} + +static inline Status FileWrite(int fd, const uint8_t* buffer, int64_t nbytes) { + int ret; +#if defined(_MSC_VER) + if (nbytes > INT32_MAX) { + return Status::IOError("Unable to write > 2GB blocks to file yet"); + } + ret = _write(fd, buffer, static_cast(nbytes)); +#else + ret = write(fd, buffer, nbytes); +#endif + + if (ret == -1) { + // TODO(wesm): errno to string + return Status::IOError("Error writing bytes to file"); + } + return Status::OK(); +} + +static inline Status FileGetSize(int fd, int64_t* size) { + int64_t ret; + + // Save current position + int64_t current_position = lseek64_compat(fd, 0, SEEK_CUR); + CHECK_LSEEK(current_position); + + // move to end of the file + ret = lseek64_compat(fd, 0, SEEK_END); + CHECK_LSEEK(ret); + + // Get file length + ret = lseek64_compat(fd, 0, SEEK_CUR); + CHECK_LSEEK(ret); + + *size = ret; + + // Restore file position + ret = lseek64_compat(fd, current_position, SEEK_SET); + CHECK_LSEEK(ret); + + return Status::OK(); +} + +static inline Status FileClose(int fd) { + int ret; + +#if defined(_MSC_VER) + ret = _close(fd); +#else + ret = close(fd); +#endif + + if (ret == -1) { return Status::IOError("error 
closing file"); } + return Status::OK(); +} + +class OSFile { + public: + OSFile() : fd_(-1), is_open_(false), size_(-1) {} + + ~OSFile() {} + + Status OpenWritable(const std::string& path) { + RETURN_NOT_OK(FileOpenWriteable(path, &fd_)); + path_ = path; + is_open_ = true; + return Status::OK(); + } + + Status OpenReadable(const std::string& path) { + RETURN_NOT_OK(FileOpenReadable(path, &fd_)); + RETURN_NOT_OK(FileGetSize(fd_, &size_)); + + // The position should be 0 after GetSize + // RETURN_NOT_OK(Seek(0)); + + path_ = path; + is_open_ = true; + return Status::OK(); + } + + Status Close() { + if (is_open_) { + RETURN_NOT_OK(FileClose(fd_)); + is_open_ = false; + } + return Status::OK(); + } + + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + return FileRead(fd_, out, nbytes, bytes_read); + } + + Status Seek(int64_t pos) { + if (pos > size_) { pos = size_; } + return FileSeek(fd_, pos); + } + + Status Tell(int64_t* pos) const { return FileTell(fd_, pos); } + + Status Write(const uint8_t* data, int64_t length) { + if (length < 0) { return Status::IOError("Length must be non-negative"); } + return FileWrite(fd_, data, length); + } + + int fd() const { return fd_; } + + bool is_open() const { return is_open_; } + const std::string& path() const { return path_; } + + int64_t size() const { return size_; } + + private: + std::string path_; + + // File descriptor + int fd_; + + bool is_open_; + int64_t size_; +}; + +// ---------------------------------------------------------------------- +// ReadableFile implementation + +class ReadableFile::ReadableFileImpl : public OSFile { + public: + explicit ReadableFileImpl(MemoryPool* pool) : OSFile(), pool_(pool) {} + + Status Open(const std::string& path) { return OpenReadable(path); } + + Status ReadBuffer(int64_t nbytes, std::shared_ptr* out) { + auto buffer = std::make_shared(pool_); + RETURN_NOT_OK(buffer->Resize(nbytes)); + + int64_t bytes_read = 0; + RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); + + // XXX: heuristic + if (bytes_read < nbytes / 2) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + + *out = buffer; + return Status::OK(); + } + + private: + MemoryPool* pool_; +}; + +ReadableFile::ReadableFile(MemoryPool* pool) { + impl_.reset(new ReadableFileImpl(pool)); +} + +ReadableFile::~ReadableFile() { + impl_->Close(); +} + +Status ReadableFile::Open(const std::string& path, std::shared_ptr* file) { + *file = std::shared_ptr(new ReadableFile(default_memory_pool())); + return (*file)->impl_->Open(path); +} + +Status ReadableFile::Open(const std::string& path, MemoryPool* memory_pool, + std::shared_ptr* file) { + *file = std::shared_ptr(new ReadableFile(memory_pool)); + return (*file)->impl_->Open(path); +} + +Status ReadableFile::Close() { + return impl_->Close(); +} + +Status ReadableFile::Tell(int64_t* pos) { + return impl_->Tell(pos); +} + +Status ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + return impl_->Read(nbytes, bytes_read, out); +} + +Status ReadableFile::ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + RETURN_NOT_OK(Seek(position)); + return impl_->Read(nbytes, bytes_read, out); +} + +Status ReadableFile::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + RETURN_NOT_OK(Seek(position)); + return impl_->ReadBuffer(nbytes, out); +} + +Status ReadableFile::GetSize(int64_t* size) { + *size = impl_->size(); + return Status::OK(); +} + +Status ReadableFile::Seek(int64_t pos) { + return impl_->Seek(pos); +} + +bool 
ReadableFile::supports_zero_copy() const { + return false; +} + +int ReadableFile::file_descriptor() const { + return impl_->fd(); +} + +// ---------------------------------------------------------------------- +// FileOutputStream + +class FileOutputStream::FileOutputStreamImpl : public OSFile { + public: + Status Open(const std::string& path) { return OpenWritable(path); } +}; + +FileOutputStream::FileOutputStream() { + impl_.reset(new FileOutputStreamImpl()); +} + +FileOutputStream::~FileOutputStream() { + impl_->Close(); +} + +Status FileOutputStream::Open( + const std::string& path, std::shared_ptr* file) { + // private ctor + *file = std::shared_ptr(new FileOutputStream()); + return (*file)->impl_->Open(path); +} + +Status FileOutputStream::Close() { + return impl_->Close(); +} + +Status FileOutputStream::Tell(int64_t* pos) { + return impl_->Tell(pos); +} + +Status FileOutputStream::Write(const uint8_t* data, int64_t length) { + return impl_->Write(data, length); +} + +int FileOutputStream::file_descriptor() const { + return impl_->fd(); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h new file mode 100644 index 00000000000..5e714ea9667 --- /dev/null +++ b/cpp/src/arrow/io/file.h @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// IO interface implementations for OS files + +#ifndef ARROW_IO_FILE_H +#define ARROW_IO_FILE_H + +#include +#include +#include + +#include "arrow/io/interfaces.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class Buffer; +class MemoryPool; +class Status; + +namespace io { + +class ARROW_EXPORT FileOutputStream : public OutputStream { + public: + ~FileOutputStream(); + + static Status Open(const std::string& path, std::shared_ptr* file); + + // OutputStream interface + Status Close() override; + Status Tell(int64_t* position) override; + Status Write(const uint8_t* data, int64_t nbytes) override; + + int file_descriptor() const; + + private: + FileOutputStream(); + + class ARROW_NO_EXPORT FileOutputStreamImpl; + std::unique_ptr impl_; +}; + +// Operating system file +class ARROW_EXPORT ReadableFile : public ReadableFileInterface { + public: + ~ReadableFile(); + + // Open file, allocate memory (if needed) from default memory pool + static Status Open(const std::string& path, std::shared_ptr* file); + + // Open file with one's own memory pool for memory allocations + static Status Open(const std::string& path, MemoryPool* memory_pool, + std::shared_ptr* file); + + Status Close() override; + Status Tell(int64_t* position) override; + + Status ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status GetSize(int64_t* size) override; + Status Seek(int64_t position) override; + + bool supports_zero_copy() const override; + + int file_descriptor() const; + + private: + explicit ReadableFile(MemoryPool* pool); + + class ARROW_NO_EXPORT ReadableFileImpl; + std::unique_ptr impl_; +}; + +} // namespace io +} // namespace arrow + +#endif // ARROW_IO_FILE_H diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc new file mode 100644 index 00000000000..cde769ffb61 --- /dev/null +++ b/cpp/src/arrow/io/io-file-test.cc @@ -0,0 +1,290 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
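Since `ReadableFile` allocates its read buffers from a `MemoryPool`, the
pool-aware `Open` overload declared above can be exercised as follows (the
`ReadWholeFile` helper is an illustrative sketch, not part of the patch):

    #include <memory>
    #include <string>

    #include "arrow/io/file.h"
    #include "arrow/util/buffer.h"
    #include "arrow/util/memory-pool.h"
    #include "arrow/util/status.h"

    // Read an entire file into a single Buffer allocated from `pool`.
    arrow::Status ReadWholeFile(const std::string& path, arrow::MemoryPool* pool,
        std::shared_ptr<arrow::Buffer>* out) {
      std::shared_ptr<arrow::io::ReadableFile> file;
      RETURN_NOT_OK(arrow::io::ReadableFile::Open(path, pool, &file));
      int64_t size = 0;
      RETURN_NOT_OK(file->GetSize(&size));
      RETURN_NOT_OK(file->ReadAt(0, size, out));
      return file->Close();
    }

The test suite below verifies this pattern with a counting pool
(`MyMemoryPool`) to check that read allocations go through the supplied pool.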
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/io/file.h" +#include "arrow/io/test-common.h" +#include "arrow/util/memory-pool.h" + +namespace arrow { +namespace io { + +static bool FileExists(const std::string& path) { + return std::ifstream(path.c_str()).good(); +} + +static bool FileIsClosed(int fd) { + if (-1 != fcntl(fd, F_GETFD)) { return false; } + return errno == EBADF; +} + +class FileTestFixture : public ::testing::Test { + public: + void SetUp() { + path_ = "arrow-test-io-file-output-stream.txt"; + EnsureFileDeleted(); + } + + void TearDown() { EnsureFileDeleted(); } + + void EnsureFileDeleted() { + if (FileExists(path_)) { std::remove(path_.c_str()); } + } + + protected: + std::string path_; +}; + +// ---------------------------------------------------------------------- +// File output tests + +class TestFileOutputStream : public FileTestFixture { + public: + void OpenFile() { ASSERT_OK(FileOutputStream::Open(path_, &file_)); } + + protected: + std::shared_ptr file_; +}; + +TEST_F(TestFileOutputStream, DestructorClosesFile) { + int fd; + { + std::shared_ptr file; + ASSERT_OK(FileOutputStream::Open(path_, &file)); + fd = file->file_descriptor(); + } + ASSERT_TRUE(FileIsClosed(fd)); +} + +TEST_F(TestFileOutputStream, Close) { + OpenFile(); + + const char* data = "testdata"; + ASSERT_OK(file_->Write(reinterpret_cast(data), strlen(data))); + + int fd = file_->file_descriptor(); + file_->Close(); + + ASSERT_TRUE(FileIsClosed(fd)); + + // Idempotent + file_->Close(); + + std::shared_ptr rd_file; + ASSERT_OK(ReadableFile::Open(path_, &rd_file)); + + int64_t size = 0; + ASSERT_OK(rd_file->GetSize(&size)); + ASSERT_EQ(strlen(data), size); +} + +TEST_F(TestFileOutputStream, InvalidWrites) { + OpenFile(); + + const char* data = ""; + + ASSERT_RAISES(IOError, file_->Write(reinterpret_cast(data), -1)); +} + +TEST_F(TestFileOutputStream, Tell) { + OpenFile(); + + int64_t position; + + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(0, position); + + const char* data = "testdata"; + ASSERT_OK(file_->Write(reinterpret_cast(data), 8)); + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(8, position); +} + +// ---------------------------------------------------------------------- +// File input tests + +class TestReadableFile : public FileTestFixture { + public: + void OpenFile() { ASSERT_OK(ReadableFile::Open(path_, &file_)); } + + void MakeTestFile() { + std::string data = "testdata"; + std::ofstream stream; + stream.open(path_.c_str()); + stream << data; + } + + protected: + std::shared_ptr file_; +}; + +TEST_F(TestReadableFile, DestructorClosesFile) { + MakeTestFile(); + + int fd; + { + std::shared_ptr file; + ASSERT_OK(ReadableFile::Open(path_, &file)); + fd = file->file_descriptor(); + } + ASSERT_TRUE(FileIsClosed(fd)); +} + +TEST_F(TestReadableFile, Close) { + MakeTestFile(); + OpenFile(); + + int fd = file_->file_descriptor(); + file_->Close(); + + ASSERT_TRUE(FileIsClosed(fd)); + + // Idempotent + file_->Close(); +} + +TEST_F(TestReadableFile, SeekTellSize) { + MakeTestFile(); + OpenFile(); + + int64_t position; + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(0, position); + + ASSERT_OK(file_->Seek(4)); + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(4, position); + + ASSERT_OK(file_->Seek(100)); + ASSERT_OK(file_->Tell(&position)); + + // now at EOF + ASSERT_EQ(8, position); + + int64_t size; + ASSERT_OK(file_->GetSize(&size)); + ASSERT_EQ(8, size); + + // does not support zero copy + 
ASSERT_FALSE(file_->supports_zero_copy()); +} + +TEST_F(TestReadableFile, Read) { + uint8_t buffer[50]; + + MakeTestFile(); + OpenFile(); + + int64_t bytes_read; + ASSERT_OK(file_->Read(4, &bytes_read, buffer)); + ASSERT_EQ(4, bytes_read); + ASSERT_EQ(0, std::memcmp(buffer, "test", 4)); + + ASSERT_OK(file_->Read(10, &bytes_read, buffer)); + ASSERT_EQ(4, bytes_read); + ASSERT_EQ(0, std::memcmp(buffer, "data", 4)); +} + +TEST_F(TestReadableFile, ReadAt) { + uint8_t buffer[50]; + const char* test_data = "testdata"; + + MakeTestFile(); + OpenFile(); + + int64_t bytes_read; + int64_t position; + + ASSERT_OK(file_->ReadAt(0, 4, &bytes_read, buffer)); + ASSERT_EQ(4, bytes_read); + ASSERT_EQ(0, std::memcmp(buffer, "test", 4)); + + // position advanced + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(4, position); + + ASSERT_OK(file_->ReadAt(4, 10, &bytes_read, buffer)); + ASSERT_EQ(4, bytes_read); + ASSERT_EQ(0, std::memcmp(buffer, "data", 4)); + + // position advanced to EOF + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(8, position); + + // Check buffer API + std::shared_ptr buffer2; + + ASSERT_OK(file_->ReadAt(0, 4, &buffer2)); + ASSERT_EQ(4, buffer2->size()); + + Buffer expected(reinterpret_cast(test_data), 4); + ASSERT_TRUE(buffer2->Equals(expected)); + + // position advanced + ASSERT_OK(file_->Tell(&position)); + ASSERT_EQ(4, position); +} + +TEST_F(TestReadableFile, NonExistentFile) { + ASSERT_RAISES(IOError, ReadableFile::Open("0xDEADBEEF.txt", &file_)); +} + +class MyMemoryPool : public MemoryPool { + public: + MyMemoryPool() : num_allocations_(0) {} + + Status Allocate(int64_t size, uint8_t** out) override { + *out = reinterpret_cast(std::malloc(size)); + ++num_allocations_; + return Status::OK(); + } + + void Free(uint8_t* buffer, int64_t size) override { std::free(buffer); } + + int64_t bytes_allocated() const override { return -1; } + + int64_t num_allocations() const { return num_allocations_; } + + private: + int64_t num_allocations_; +}; + +TEST_F(TestReadableFile, CustomMemoryPool) { + MakeTestFile(); + + MyMemoryPool pool; + ASSERT_OK(ReadableFile::Open(path_, &pool, &file_)); + + std::shared_ptr buffer; + ASSERT_OK(file_->ReadAt(0, 4, &buffer)); + ASSERT_OK(file_->ReadAt(4, 8, &buffer)); + + ASSERT_EQ(2, pool.num_allocations()); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index 0b805abf94c..f256c31b4f4 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -33,8 +33,8 @@ #ifndef _WIN32 #include #else -#include #include +#include // TODO(wesm): address when/if we add windows support // #include diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 51601a0a626..6989d732ca7 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -94,7 +94,7 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { Status WriteInternal(const uint8_t* data, int64_t nbytes); // Hide the internal details of this class for now - class MemoryMappedFileImpl; + class ARROW_NO_EXPORT MemoryMappedFileImpl; std::unique_ptr impl_; }; diff --git a/cpp/src/arrow/io/mman.h b/cpp/src/arrow/io/mman.h new file mode 100644 index 00000000000..00d1f93601d --- /dev/null +++ b/cpp/src/arrow/io/mman.h @@ -0,0 +1,189 @@ +// Copyright https://code.google.com/p/mman-win32/ +// +// Licensed under the MIT License; +// You may obtain a copy of the License at +// +// https://opensource.org/licenses/MIT + +#ifndef _MMAN_WIN32_H +#define _MMAN_WIN32_H + +// Allow 
use of features specific to Windows XP or later. +#ifndef _WIN32_WINNT +// Change this to the appropriate value to target other versions of Windows. +#define _WIN32_WINNT 0x0501 + +#endif + +#include +#include +#include +#include + +#define PROT_NONE 0 +#define PROT_READ 1 +#define PROT_WRITE 2 +#define PROT_EXEC 4 + +#define MAP_FILE 0 +#define MAP_SHARED 1 +#define MAP_PRIVATE 2 +#define MAP_TYPE 0xf +#define MAP_FIXED 0x10 +#define MAP_ANONYMOUS 0x20 +#define MAP_ANON MAP_ANONYMOUS + +#define MAP_FAILED ((void*)-1) + +/* Flags for msync. */ +#define MS_ASYNC 1 +#define MS_SYNC 2 +#define MS_INVALIDATE 4 + +#ifndef FILE_MAP_EXECUTE +#define FILE_MAP_EXECUTE 0x0020 +#endif + +static int __map_mman_error(const DWORD err, const int deferr) { + if (err == 0) return 0; + // TODO: implement + return err; +} + +static DWORD __map_mmap_prot_page(const int prot) { + DWORD protect = 0; + + if (prot == PROT_NONE) return protect; + + if ((prot & PROT_EXEC) != 0) { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; + } else { + protect = ((prot & PROT_WRITE) != 0) ? PAGE_READWRITE : PAGE_READONLY; + } + + return protect; +} + +static DWORD __map_mmap_prot_file(const int prot) { + DWORD desiredAccess = 0; + + if (prot == PROT_NONE) return desiredAccess; + + if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ; + if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE; + if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE; + + return desiredAccess; +} + +void* mmap(void* addr, size_t len, int prot, int flags, int fildes, off_t off) { + HANDLE fm, h; + + void* map = MAP_FAILED; + +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4293) +#endif + + const DWORD dwFileOffsetLow = + (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); + const DWORD dwFileOffsetHigh = + (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); + const DWORD protect = __map_mmap_prot_page(prot); + const DWORD desiredAccess = __map_mmap_prot_file(prot); + + const off_t maxSize = off + (off_t)len; + + const DWORD dwMaxSizeLow = + (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); + const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) + ? (DWORD)0 + : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + errno = 0; + + if (len == 0 + /* Unsupported flag combinations */ + || (flags & MAP_FIXED) != 0 + /* Usupported protection combinations */ + || prot == PROT_EXEC) { + errno = EINVAL; + return MAP_FAILED; + } + + h = ((flags & MAP_ANONYMOUS) == 0) ? 
(HANDLE)_get_osfhandle(fildes) + : INVALID_HANDLE_VALUE; + + if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { + errno = EBADF; + return MAP_FAILED; + } + + fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); + + if (fm == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); + + CloseHandle(fm); + + if (map == NULL) { + errno = __map_mman_error(GetLastError(), EPERM); + return MAP_FAILED; + } + + return map; +} + +int munmap(void* addr, size_t len) { + if (UnmapViewOfFile(addr)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mprotect(void* addr, size_t len, int prot) { + DWORD newProtect = __map_mmap_prot_page(prot); + DWORD oldProtect = 0; + + if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int msync(void* addr, size_t len, int flags) { + if (FlushViewOfFile(addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int mlock(const void* addr, size_t len) { + if (VirtualLock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +int munlock(const void* addr, size_t len) { + if (VirtualUnlock((LPVOID)addr, len)) return 0; + + errno = __map_mman_error(GetLastError(), EPERM); + + return -1; +} + +#endif diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 89b7fb987c6..99974a4a4c7 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -23,12 +23,12 @@ #include #include "arrow/array.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" #include "arrow/ipc/Message_generated.h" #include "arrow/ipc/metadata-internal.h" #include "arrow/ipc/metadata.h" #include "arrow/ipc/util.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" #include "arrow/schema.h" #include "arrow/table.h" #include "arrow/type.h" diff --git a/cpp/src/arrow/ipc/file.cc b/cpp/src/arrow/ipc/file.cc index 2bf10dde266..c68244d5025 100644 --- a/cpp/src/arrow/ipc/file.cc +++ b/cpp/src/arrow/ipc/file.cc @@ -22,10 +22,10 @@ #include #include +#include "arrow/io/interfaces.h" #include "arrow/ipc/adapter.h" #include "arrow/ipc/metadata.h" #include "arrow/ipc/util.h" -#include "arrow/io/interfaces.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 87eb0fe3a8b..ffebb9269bd 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -238,7 +238,8 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types Primitives; + PInt32, PInt64, PFloat, PDouble> + Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index d320d6adb7c..b22f07dd634 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -117,10 +117,10 @@ class CerrLog { // return so we create a new class to give it a hint. 
class FatalLog : public CerrLog { public: - FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} + explicit FatalLog(int /* severity */) // NOLINT + : CerrLog(ARROW_FATAL){} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } diff --git a/cpp/src/arrow/util/memory-pool.cc b/cpp/src/arrow/util/memory-pool.cc index fed149bc359..9f83afe4cb2 100644 --- a/cpp/src/arrow/util/memory-pool.cc +++ b/cpp/src/arrow/util/memory-pool.cc @@ -17,13 +17,13 @@ #include "arrow/util/memory-pool.h" -#include #include #include #include +#include -#include "arrow/util/status.h" #include "arrow/util/logging.h" +#include "arrow/util/status.h" namespace arrow { diff --git a/cpp/src/arrow/util/status-test.cc b/cpp/src/arrow/util/status-test.cc index 45e0ff361ac..e0ff20fea12 100644 --- a/cpp/src/arrow/util/status-test.cc +++ b/cpp/src/arrow/util/status-test.cc @@ -17,8 +17,8 @@ #include "gtest/gtest.h" -#include "arrow/util/status.h" #include "arrow/test-util.h" +#include "arrow/util/status.h" namespace arrow { From 03134b11ffd4f63bda2f3cb448713600df6d8fdb Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 27 Sep 2016 09:45:32 -0700 Subject: [PATCH 144/210] ARROW-270: Define more generic Interval logical type Author: Julien Le Dem Closes #144 from julienledem/interval and squashes the following commits: eb76fed [Julien Le Dem] ARROW-270: Define more generic Interval logical type --- format/Message.fbs | 10 ++++----- .../src/main/codegen/data/ArrowTypes.tdd | 8 ++----- .../templates/NullableValueVectors.java | 4 ++-- .../arrow/vector/schema/TypeLayout.java | 21 +++++++++++-------- .../org/apache/arrow/vector/types/Types.java | 14 ++++++------- 5 files changed, 27 insertions(+), 30 deletions(-) diff --git a/format/Message.fbs b/format/Message.fbs index 657904a7032..07da862c32d 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -63,10 +63,9 @@ table Timestamp { timezone: string; } -table IntervalDay { -} - -table IntervalYear { +enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} +table Interval { + unit: IntervalUnit; } table JSONScalar { @@ -88,8 +87,7 @@ union Type { Date, Time, Timestamp, - IntervalDay, - IntervalYear, + Interval, List, Struct_, Union, diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 5cb43bed2b6..9f81f0e3800 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -69,12 +69,8 @@ fields: [{name: "timezone", type: "String"}] }, { - name: "IntervalDay", - fields: [] - }, - { - name: "IntervalYear", - fields: [] + name: "Interval", + fields: [{name: "unit", type: short}] } ] } diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 486cfeefc7a..8f325afad39 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -105,9 +105,9 @@ public final class ${className} extends BaseDataValueVector implements <#if type <#elseif minor.class == "TimeStamp"> field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(""), null); <#elseif minor.class == "IntervalDay"> - field = new Field(name, true, new IntervalDay(), null); + field = new Field(name, true, new Interval(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME), null); <#elseif minor.class == "IntervalYear"> - field 
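/* ARROW-270 collapses the two interval vectors onto a single logical type
 * parameterized by unit; the two flavors are now constructed as in the
 * replacement lines nearby (a hedged restatement, not new API):
 *
 *   new Interval(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME)
 *   new Interval(org.apache.arrow.flatbuf.IntervalUnit.YEAR_MONTH)
 */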
= new Field(name, true, new IntervalYear(), null); + field = new Field(name, true, new Interval(org.apache.arrow.flatbuf.IntervalUnit.YEAR_MONTH), null); <#elseif minor.class == "VarChar"> field = new Field(name, true, new Utf8(), null); <#elseif minor.class == "VarBinary"> diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 885ac2ac3d7..072385a2155 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -31,6 +31,7 @@ import java.util.Collections; import java.util.List; +import org.apache.arrow.flatbuf.IntervalUnit; import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; @@ -40,12 +41,11 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; -import org.apache.arrow.vector.types.pojo.ArrowType.IntervalDay; -import org.apache.arrow.vector.types.pojo.ArrowType.IntervalYear; +import org.apache.arrow.vector.types.pojo.ArrowType.Interval; import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; -import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; @@ -167,14 +167,17 @@ public TypeLayout visit(Time type) { } @Override - public TypeLayout visit(IntervalDay type) { // TODO: check size - return newFixedWidthTypeLayout(dataVector(64)); + public TypeLayout visit(Interval type) { // TODO: check size + switch (type.getUnit()) { + case IntervalUnit.DAY_TIME: + return newFixedWidthTypeLayout(dataVector(64)); + case IntervalUnit.YEAR_MONTH: + return newFixedWidthTypeLayout(dataVector(64)); + default: + throw new UnsupportedOperationException("Unknown unit " + type.getUnit()); + } } - @Override - public TypeLayout visit(IntervalYear type) { // TODO: check size - return newFixedWidthTypeLayout(dataVector(64)); - } }); return layout; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 66ef7562ced..181d8353682 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -20,6 +20,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.arrow.flatbuf.IntervalUnit; import org.apache.arrow.flatbuf.Precision; import org.apache.arrow.flatbuf.Type; import org.apache.arrow.flatbuf.UnionMode; @@ -78,13 +79,12 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Date; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; -import org.apache.arrow.vector.types.pojo.ArrowType.IntervalDay; -import org.apache.arrow.vector.types.pojo.ArrowType.IntervalYear; +import org.apache.arrow.vector.types.pojo.ArrowType.Interval; import org.apache.arrow.vector.types.pojo.ArrowType.List; import org.apache.arrow.vector.types.pojo.ArrowType.Null; +import 
org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Time; import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; -import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; @@ -104,8 +104,8 @@ public class Types { public static final Field DATE_FIELD = new Field("", true, Date.INSTANCE, null); public static final Field TIME_FIELD = new Field("", true, Time.INSTANCE, null); public static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); - public static final Field INTERVALDAY_FIELD = new Field("", true, IntervalDay.INSTANCE, null); - public static final Field INTERVALYEAR_FIELD = new Field("", true, IntervalYear.INSTANCE, null); + public static final Field INTERVALDAY_FIELD = new Field("", true, new Interval(IntervalUnit.DAY_TIME), null); + public static final Field INTERVALYEAR_FIELD = new Field("", true, new Interval(IntervalUnit.YEAR_MONTH), null); public static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(Precision.SINGLE), null); public static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(Precision.DOUBLE), null); public static final Field LIST_FIELD = new Field("", true, List.INSTANCE, null); @@ -260,7 +260,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new TimeStampWriterImpl((NullableTimeStampVector) vector); } }, - INTERVALDAY(IntervalDay.INSTANCE) { + INTERVALDAY(new Interval(IntervalUnit.DAY_TIME)) { @Override public Field getField() { return INTERVALDAY_FIELD; @@ -276,7 +276,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new IntervalDayWriterImpl((NullableIntervalDayVector) vector); } }, - INTERVALYEAR(IntervalYear.INSTANCE) { + INTERVALYEAR(new Interval(IntervalUnit.YEAR_MONTH)) { @Override public Field getField() { return INTERVALYEAR_FIELD; From bae33d622421e6377ab3e9c81dd054c796ab48a3 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 27 Sep 2016 10:39:09 -0700 Subject: [PATCH 145/210] ARROW-304: NullableMapReaderImpl.isSet() always returns true Author: Julien Le Dem Closes #147 from julienledem/isSet and squashes the following commits: c06e048 [Julien Le Dem] review feedback 5a33785 [Julien Le Dem] review feedback af5d613 [Julien Le Dem] ARROW-304: NullableMapReaderImpl.isSet() always returns true --- .../complex/impl/NullableMapReaderImpl.java | 5 ++ .../vector/complex/impl/UnionListReader.java | 2 +- .../complex/writer/TestComplexWriter.java | 57 ++++++++++++++++--- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java index 18b35c194a1..7c389e61ae2 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/NullableMapReaderImpl.java @@ -42,4 +42,9 @@ public void copyAsField(String name, MapWriter writer){ NullableMapWriter impl = (NullableMapWriter) writer.map(name); impl.container.copyFromSafe(idx(), impl.idx(), nullableMapVector); } + + @Override + public boolean isSet(){ + return !nullableMapVector.getAccessor().isNull(idx()); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java 
b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java index 39cf0042115..6c7c230226e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/UnionListReader.java @@ -41,7 +41,7 @@ public UnionListReader(ListVector vector) { @Override public boolean isSet() { - return true; + return !vector.getAccessor().isNull(idx()); } private int currentOffset; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index fa710dae5ee..c1da104da57 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -17,6 +17,14 @@ */ package org.apache.arrow.vector.complex.writer; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import java.util.List; + import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.complex.ListVector; @@ -77,28 +85,33 @@ public void nullableMap() { MapVector parent = new MapVector("parent", allocator, null); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); - MapWriter mapWriter = rootWriter.map("map"); - BigIntWriter nested = mapWriter.bigInt("nested"); for (int i = 0; i < COUNT; i++) { + rootWriter.setPosition(i); + rootWriter.start(); if (i % 2 == 0) { + MapWriter mapWriter = rootWriter.map("map"); mapWriter.setPosition(i); mapWriter.start(); - nested.writeBigInt(i); + mapWriter.bigInt("nested").writeBigInt(i); mapWriter.end(); } + rootWriter.end(); } writer.setValueCount(COUNT); MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); for (int i = 0; i < COUNT; i++) { rootReader.setPosition(i); + assertTrue("index is set: " + i, rootReader.isSet()); + FieldReader map = rootReader.reader("map"); if (i % 2 == 0) { - Assert.assertNotNull(rootReader.reader("map").readObject()); - Assert.assertEquals(i, rootReader.reader("map").reader("nested").readLong().longValue()); + assertTrue("index is set: " + i, map.isSet()); + assertNotNull("index is set: " + i, map.readObject()); + assertEquals(i, map.reader("nested").readLong().longValue()); } else { - Assert.assertNull(rootReader.reader("map").readObject()); + assertFalse("index is not set: " + i, map.isSet()); + assertNull("index is not set: " + i, map.readObject()); } } - parent.close(); } @@ -121,11 +134,39 @@ public void listScalarType() { listReader.setPosition(i); for (int j = 0; j < i % 7; j++) { listReader.next(); - Assert.assertEquals(j, listReader.reader().readInteger().intValue()); + assertEquals(j, listReader.reader().readInteger().intValue()); } } } + @Test + public void listScalarTypeNullable() { + ListVector listVector = new ListVector("list", allocator, null); + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + if (i % 2 == 0) { + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + listWriter.writeInt(j); + } + listWriter.endList(); + } + } + listWriter.setValueCount(COUNT); + UnionListReader 
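/* ARROW-304 in a nutshell: isSet() now reflects the validity of the
 * underlying vector instead of returning true unconditionally. A hedged
 * consumer-side sketch:
 *
 *   reader.setPosition(row);
 *   if (reader.isSet()) {
 *     Object value = reader.readObject();   // row is non-null
 *   } else {
 *     // row is null; readObject() would return null here
 *   }
 */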
listReader = new UnionListReader(listVector); + for (int i = 0; i < COUNT; i++) { + listReader.setPosition(i); + if (i % 2 == 0) { + assertTrue("index is set: " + i, listReader.isSet()); + assertEquals("correct length at: " + i, i % 7, ((List)listReader.readObject()).size()); + } else { + assertFalse("index is not set: " + i, listReader.isSet()); + assertNull("index is not set: " + i, listReader.readObject()); + } + } + } @Test public void listMapType() { From 768c7d0be7dde9942235b5312c1c46ab035af86b Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 27 Sep 2016 11:54:35 -0700 Subject: [PATCH 146/210] ARROW-257: Add a typeids Vector to Union type Author: Julien Le Dem Closes #143 from julienledem/union and squashes the following commits: cd1b711 [Julien Le Dem] ARROW-257: Add a typeids Vector to Union type --- format/Message.fbs | 5 +++ .../src/main/codegen/data/ArrowTypes.tdd | 2 +- .../src/main/codegen/templates/ArrowType.java | 38 +++++++++++++++---- .../main/codegen/templates/UnionVector.java | 7 +++- .../org/apache/arrow/vector/types/Types.java | 2 +- .../apache/arrow/vector/pojo/TestConvert.java | 5 ++- 6 files changed, 45 insertions(+), 14 deletions(-) diff --git a/format/Message.fbs b/format/Message.fbs index 07da862c32d..288f5a1b6b2 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -23,8 +23,13 @@ table List { enum UnionMode:short { Sparse, Dense } +/// A union is a complex type with children in Field +/// By default ids in the type vector refer to the offsets in the children +/// optionally typeIds provides an indirection between the child offset and the type id +/// for each child typeIds[offset] is the id used in the type vector table Union { mode: UnionMode; + typeIds: [ int ]; // optional, describes typeid of each child. } table Int { diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 9f81f0e3800..9624fecf6aa 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -30,7 +30,7 @@ }, { name: "Union", - fields: [{name: "mode", type: short}] + fields: [{name: "mode", type: short}, {name: "typeIds", type: "int[]"}] }, { name: "Int", diff --git a/java/vector/src/main/codegen/templates/ArrowType.java b/java/vector/src/main/codegen/templates/ArrowType.java index 29dee20040a..30f2c68efe0 100644 --- a/java/vector/src/main/codegen/templates/ArrowType.java +++ b/java/vector/src/main/codegen/templates/ArrowType.java @@ -33,12 +33,23 @@ import java.util.Objects; +/** + * Arrow types + **/ public abstract class ArrowType { public abstract byte getTypeType(); public abstract int getType(FlatBufferBuilder builder); public abstract T accept(ArrowTypeVisitor visitor); + /** + * to visit the ArrowTypes + * + * type.accept(new ArrowTypeVisitor() { + * ... 
+ * }); + * + */ public static interface ArrowTypeVisitor { <#list arrowTypes.types as type> T visit(${type.name} type); @@ -55,9 +66,7 @@ public static class ${name} extends ArrowType { <#list fields as field> - <#assign fieldName = field.name> - <#assign fieldType = field.type> - ${fieldType} ${fieldName}; + ${field.type} ${field.name}; <#if type.fields?size != 0> @@ -79,6 +88,9 @@ public int getType(FlatBufferBuilder builder) { <#if field.type == "String"> int ${field.name} = builder.createString(this.${field.name}); + <#if field.type == "int[]"> + int ${field.name} = org.apache.arrow.flatbuf.${type.name}.create${field.name?cap_first}Vector(builder, this.${field.name}); + org.apache.arrow.flatbuf.${type.name}.start${type.name}(builder); <#list type.fields as field> @@ -96,7 +108,7 @@ public int getType(FlatBufferBuilder builder) { public String toString() { return "${name}{" <#list fields as field> - + ", " + ${field.name} + + <#if field.type == "int[]">java.util.Arrays.toString(${field.name})<#else>${field.name}<#if field_has_next> + ", " + "}"; } @@ -115,8 +127,7 @@ public boolean equals(Object obj) { return true; <#else> ${type.name} that = (${type.name}) obj; - return - <#list type.fields as field>Objects.equals(this.${field.name}, that.${field.name}) <#if field_has_next>&&<#else>; + return <#list type.fields as field>Objects.deepEquals(this.${field.name}, that.${field.name}) <#if field_has_next>&&<#else>; } @@ -134,9 +145,20 @@ public static org.apache.arrow.vector.types.pojo.ArrowType getTypeForField(org.a <#assign name = type.name> <#assign nameLower = type.name?lower_case> <#assign fields = type.fields> - case Type.${type.name}: + case Type.${type.name}: { org.apache.arrow.flatbuf.${type.name} ${nameLower}Type = (org.apache.arrow.flatbuf.${type.name}) field.type(new org.apache.arrow.flatbuf.${type.name}()); - return new ${type.name}(<#list type.fields as field>${nameLower}Type.${field.name}()<#if field_has_next>, ); + <#list type.fields as field> + <#if field.type == "int[]"> + ${field.type} ${field.name} = new int[${nameLower}Type.${field.name}Length()]; + for (int i = 0; i< ${field.name}.length; ++i) { + ${field.name}[i] = ${nameLower}Type.${field.name}(i); + } + <#else> + ${field.type} ${field.name} = ${nameLower}Type.${field.name}(); + + + return new ${type.name}(<#list type.fields as field>${field.name}<#if field_has_next>, ); + } default: throw new UnsupportedOperationException("Unsupported type: " + field.typeType()); diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 3014bbba9d5..b14314d2b0d 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -232,10 +232,13 @@ public void clear() { @Override public Field getField() { List childFields = new ArrayList<>(); - for (ValueVector v : internalMap.getChildren()) { + List children = internalMap.getChildren(); + int[] typeIds = new int[children.size()]; + for (ValueVector v : children) { + typeIds[childFields.size()] = v.getMinorType().ordinal(); childFields.add(v.getField()); } - return new Field(name, true, new ArrowType.Union(Sparse), childFields); + return new Field(name, true, new ArrowType.Union(Sparse, typeIds), childFields); } @Override diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 181d8353682..6e63ae23278 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -472,7 +472,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new UnionListWriter((ListVector) vector); } }, - UNION(new Union(UnionMode.Sparse)) { + UNION(new Union(UnionMode.Sparse, null)) { @Override public Field getField() { throw new UnsupportedOperationException("Cannot get simple field for Union type"); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index 448117d84dc..ed740cd0f1b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -22,11 +22,12 @@ import static org.junit.Assert.assertEquals; import org.apache.arrow.flatbuf.UnionMode; +import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.List; -import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Struct_; +import org.apache.arrow.vector.types.pojo.ArrowType.Timestamp; import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; @@ -78,7 +79,7 @@ public void nestedSchema() { childrenBuilder.add(new Field("child4", true, new List(), ImmutableList.of( new Field("child4.1", true, Utf8.INSTANCE, null) ))); - childrenBuilder.add(new Field("child5", true, new Union(UnionMode.Sparse), ImmutableList.of( + childrenBuilder.add(new Field("child5", true, new Union(UnionMode.Sparse, new int[] { MinorType.TIMESTAMP.ordinal(), MinorType.FLOAT8.ordinal() } ), ImmutableList.of( new Field("child5.1", true, new Timestamp("UTC"), null), new Field("child5.2", true, new FloatingPoint(DOUBLE), ImmutableList.of()) ))); From bd195e304d82dcf6e2cea266b4d0871bd2b88564 Mon Sep 17 00:00:00 2001 From: adeneche Date: Wed, 28 Sep 2016 07:26:05 -0700 Subject: [PATCH 147/210] ARROW-308: UnionListWriter.setPosition() should not call startList() --- .../codegen/templates/UnionListWriter.java | 1 - .../complex/writer/TestComplexWriter.java | 32 ++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index d502803d716..04531a72128 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -84,7 +84,6 @@ public void close() throws Exception { @Override public void setPosition(int index) { super.setPosition(index); - startList(); } <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index c1da104da57..398aea915b3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -52,7 +52,7 @@ public class TestComplexWriter { - static final BufferAllocator allocator = 
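/* Context for the tests below: after ARROW-308, setPosition() no longer
 * calls startList() implicitly, so writers drive the list lifecycle
 * explicitly for each row (hedged sketch mirroring the tests in this file):
 *
 *   listWriter.setPosition(row);
 *   listWriter.startList();
 *   listWriter.writeInt(42);
 *   listWriter.endList();
 */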
new RootAllocator(Integer.MAX_VALUE); + private static final BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); private static final int COUNT = 100; @@ -115,6 +115,36 @@ public void nullableMap() { parent.close(); } + @Test + public void listOfLists() { + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + + rootWriter.start(); + rootWriter.bigInt("int").writeBigInt(0); + rootWriter.list("list").startList(); + rootWriter.list("list").bigInt().writeBigInt(0); + rootWriter.list("list").endList(); + rootWriter.end(); + + rootWriter.setPosition(1); + rootWriter.start(); + rootWriter.bigInt("int").writeBigInt(1); + rootWriter.end(); + + writer.setValueCount(2); + + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + + rootReader.setPosition(0); + assertTrue("row 0 list is not set", rootReader.reader("list").isSet()); + assertEquals(Long.valueOf(0), rootReader.reader("list").reader().readLong()); + + rootReader.setPosition(1); + assertFalse("row 1 list is set", rootReader.reader("list").isSet()); + } + @Test public void listScalarType() { ListVector listVector = new ListVector("list", allocator, null); From bf30235fa3672936013db82ed9dd8949433d802e Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 28 Sep 2016 21:44:37 -0400 Subject: [PATCH 148/210] ARROW-306: Add option to pass cmake arguments via environment variable Author: Uwe L. Korn Closes #149 from xhochy/arrow-306 and squashes the following commits: 11a3e66 [Uwe L. Korn] ARROW-306: Add option to pass cmake arguments via environment variable --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index a5db2b025e6..d1be122888e 100644 --- a/python/setup.py +++ b/python/setup.py @@ -95,7 +95,7 @@ def run(self): def initialize_options(self): _build_ext.initialize_options(self) - self.extra_cmake_args = '' + self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') CYTHON_MODULE_NAMES = [ 'array', From 30f60832a5f4bd3063699061796d2107fb7a9738 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 28 Sep 2016 21:45:46 -0400 Subject: [PATCH 149/210] ARROW-305: Add compression and use_dictionary options to Parquet Author: Uwe L. Korn Closes #148 from xhochy/arrow-305 and squashes the following commits: 93d653b [Uwe L. 
Korn] ARROW-305: Add compression and use_dictionary options to Parquet interface --- python/pyarrow/includes/parquet.pxd | 12 +++++++ python/pyarrow/parquet.pyx | 49 +++++++++++++++++++++++++++- python/pyarrow/tests/test_parquet.py | 40 +++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index 9085b0bb298..754eeccecc8 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -37,6 +37,13 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: PARQUET_1_0" parquet::ParquetVersion::PARQUET_1_0" PARQUET_2_0" parquet::ParquetVersion::PARQUET_2_0" + enum Compression" parquet::Compression::type": + UNCOMPRESSED" parquet::Compression::UNCOMPRESSED" + SNAPPY" parquet::Compression::SNAPPY" + GZIP" parquet::Compression::GZIP" + LZO" parquet::Compression::LZO" + BROTLI" parquet::Compression::BROTLI" + cdef cppclass SchemaDescriptor: shared_ptr[Node] schema() GroupNode* group() @@ -90,6 +97,11 @@ cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: cdef cppclass WriterProperties: cppclass Builder: Builder* version(ParquetVersion version) + Builder* compression(Compression codec) + Builder* compression(const c_string& path, Compression codec) + Builder* disable_dictionary() + Builder* enable_dictionary() + Builder* enable_dictionary(const c_string& path) shared_ptr[WriterProperties] build() diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index fb36b2967c0..099e148abc1 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -90,7 +90,8 @@ def read_table(source, columns=None): return reader.read_all() -def write_table(table, filename, chunk_size=None, version=None): +def write_table(table, filename, chunk_size=None, version=None, + use_dictionary=True, compression=None): """ Write a Table to Parquet format @@ -102,6 +103,11 @@ def write_table(table, filename, chunk_size=None, version=None): The maximum number of rows in each Parquet RowGroup version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 + use_dictionary : bool or list + Specify if we should use dictionary encoding in general or only for + some columns. + compression : str or dict + Specify the compression codec, either on a general basis or per-column. 
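    Hedged usage sketch of these two options (the file name and column names
    are assumptions):

        import pyarrow.parquet as pq
        pq.write_table(table, 'example.parquet', version='2.0',
                       use_dictionary=['a'],    # dictionary-encode only column 'a'
                       compression={'a': 'SNAPPY', 'b': 'GZIP'})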
""" cdef Table table_ = table cdef CTable* ctable_ = table_.table @@ -121,6 +127,47 @@ def write_table(table, filename, chunk_size=None, version=None): else: raise ArrowException("Unsupported Parquet format version") + if isinstance(use_dictionary, bool): + if use_dictionary: + properties_builder.enable_dictionary() + else: + properties_builder.disable_dictionary() + else: + # Deactivate dictionary encoding by default + properties_builder.disable_dictionary() + for column in use_dictionary: + properties_builder.enable_dictionary(column) + + if isinstance(compression, basestring): + if compression == "NONE": + properties_builder.compression(UNCOMPRESSED) + elif compression == "SNAPPY": + properties_builder.compression(SNAPPY) + elif compression == "GZIP": + properties_builder.compression(GZIP) + elif compression == "LZO": + properties_builder.compression(LZO) + elif compression == "BROTLI": + properties_builder.compression(BROTLI) + else: + raise ArrowException("Unsupport compression codec") + elif compression is not None: + # Deactivate dictionary encoding by default + properties_builder.disable_dictionary() + for column, codec in compression.iteritems(): + if codec == "NONE": + properties_builder.compression(column, UNCOMPRESSED) + elif codec == "SNAPPY": + properties_builder.compression(column, SNAPPY) + elif codec == "GZIP": + properties_builder.compression(column, GZIP) + elif codec == "LZO": + properties_builder.compression(column, LZO) + elif codec == "BROTLI": + properties_builder.compression(column, BROTLI) + else: + raise ArrowException("Unsupport compression codec") + sink.reset(new LocalFileOutputStream(tobytes(filename))) with nogil: check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 8a2d8cab572..0f9f2e40813 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -110,3 +110,43 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): df['uint32'] = df['uint32'].values.astype(np.int64) pdt.assert_frame_equal(df, df_read) + +@parquet +def test_pandas_parquet_configuration_options(tmpdir): + size = 10000 + np.random.seed(0) + df = pd.DataFrame({ + 'uint8': np.arange(size, dtype=np.uint8), + 'uint16': np.arange(size, dtype=np.uint16), + 'uint32': np.arange(size, dtype=np.uint32), + 'uint64': np.arange(size, dtype=np.uint64), + 'int8': np.arange(size, dtype=np.int16), + 'int16': np.arange(size, dtype=np.int16), + 'int32': np.arange(size, dtype=np.int32), + 'int64': np.arange(size, dtype=np.int64), + 'float32': np.arange(size, dtype=np.float32), + 'float64': np.arange(size, dtype=np.float64), + 'bool': np.random.randn(size) > 0 + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + + for use_dictionary in [True, False]: + A.parquet.write_table( + arrow_table, + filename.strpath, + version="2.0", + use_dictionary=use_dictionary) + table_read = pq.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) + + for compression in ['NONE', 'SNAPPY', 'GZIP']: + A.parquet.write_table( + arrow_table, + filename.strpath, + version="2.0", + compression=compression) + table_read = pq.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) From 391ab64d05fc9c5ea89fcc9a9938604954047ada Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 30 Sep 2016 08:53:52 -0700 Subject: [PATCH 150/210] ARROW-309: 
Types.getMinorTypeForArrowType() does not work for Union type Author: Julien Le Dem Closes #151 from julienledem/fix_union and squashes the following commits: 01bea42 [Julien Le Dem] fix union --- .../org/apache/arrow/vector/types/Types.java | 145 +++++++++++++----- 1 file changed, 107 insertions(+), 38 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 6e63ae23278..2ff93d4b98d 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -17,12 +17,8 @@ */ package org.apache.arrow.vector.types; -import java.util.HashMap; -import java.util.Map; - import org.apache.arrow.flatbuf.IntervalUnit; import org.apache.arrow.flatbuf.Precision; -import org.apache.arrow.flatbuf.Type; import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; @@ -74,9 +70,11 @@ import org.apache.arrow.vector.complex.impl.VarCharWriterImpl; import org.apache.arrow.vector.complex.writer.FieldWriter; import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.ArrowType.ArrowTypeVisitor; import org.apache.arrow.vector.types.pojo.ArrowType.Binary; import org.apache.arrow.vector.types.pojo.ArrowType.Bool; import org.apache.arrow.vector.types.pojo.ArrowType.Date; +import org.apache.arrow.vector.types.pojo.ArrowType.Decimal; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; @@ -92,26 +90,25 @@ public class Types { - public static final Field NULL_FIELD = new Field("", true, Null.INSTANCE, null); - public static final Field TINYINT_FIELD = new Field("", true, new Int(8, true), null); - public static final Field SMALLINT_FIELD = new Field("", true, new Int(16, true), null); - public static final Field INT_FIELD = new Field("", true, new Int(32, true), null); - public static final Field BIGINT_FIELD = new Field("", true, new Int(64, true), null); - public static final Field UINT1_FIELD = new Field("", true, new Int(8, false), null); - public static final Field UINT2_FIELD = new Field("", true, new Int(16, false), null); - public static final Field UINT4_FIELD = new Field("", true, new Int(32, false), null); - public static final Field UINT8_FIELD = new Field("", true, new Int(64, false), null); - public static final Field DATE_FIELD = new Field("", true, Date.INSTANCE, null); - public static final Field TIME_FIELD = new Field("", true, Time.INSTANCE, null); - public static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); - public static final Field INTERVALDAY_FIELD = new Field("", true, new Interval(IntervalUnit.DAY_TIME), null); - public static final Field INTERVALYEAR_FIELD = new Field("", true, new Interval(IntervalUnit.YEAR_MONTH), null); - public static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(Precision.SINGLE), null); - public static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(Precision.DOUBLE), null); - public static final Field LIST_FIELD = new Field("", true, List.INSTANCE, null); - public static final Field VARCHAR_FIELD = new Field("", true, Utf8.INSTANCE, null); - public static final Field VARBINARY_FIELD = new Field("", true, Binary.INSTANCE, null); - public static final Field BIT_FIELD = new 
Field("", true, Bool.INSTANCE, null); + private static final Field NULL_FIELD = new Field("", true, Null.INSTANCE, null); + private static final Field TINYINT_FIELD = new Field("", true, new Int(8, true), null); + private static final Field SMALLINT_FIELD = new Field("", true, new Int(16, true), null); + private static final Field INT_FIELD = new Field("", true, new Int(32, true), null); + private static final Field BIGINT_FIELD = new Field("", true, new Int(64, true), null); + private static final Field UINT1_FIELD = new Field("", true, new Int(8, false), null); + private static final Field UINT2_FIELD = new Field("", true, new Int(16, false), null); + private static final Field UINT4_FIELD = new Field("", true, new Int(32, false), null); + private static final Field UINT8_FIELD = new Field("", true, new Int(64, false), null); + private static final Field DATE_FIELD = new Field("", true, Date.INSTANCE, null); + private static final Field TIME_FIELD = new Field("", true, Time.INSTANCE, null); + private static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); + private static final Field INTERVALDAY_FIELD = new Field("", true, new Interval(IntervalUnit.DAY_TIME), null); + private static final Field INTERVALYEAR_FIELD = new Field("", true, new Interval(IntervalUnit.YEAR_MONTH), null); + private static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(Precision.SINGLE), null); + private static final Field FLOAT8_FIELD = new Field("", true, new FloatingPoint(Precision.DOUBLE), null); + private static final Field VARCHAR_FIELD = new Field("", true, Utf8.INSTANCE, null); + private static final Field VARBINARY_FIELD = new Field("", true, Binary.INSTANCE, null); + private static final Field BIT_FIELD = new Field("", true, Bool.INSTANCE, null); public enum MinorType { @@ -427,7 +424,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { UINT4(new Int(32, false)) { @Override public Field getField() { - return UINT8_FIELD; + return UINT4_FIELD; } @Override @@ -506,22 +503,94 @@ public ArrowType getType() { public abstract FieldWriter getNewFieldWriter(ValueVector vector); } - private static final Map ARROW_TYPE_MINOR_TYPE_MAP; - public static MinorType getMinorTypeForArrowType(ArrowType arrowType) { - if (arrowType.getTypeType() == Type.Decimal) { - return MinorType.DECIMAL; - } - return ARROW_TYPE_MINOR_TYPE_MAP.get(arrowType); - } + return arrowType.accept(new ArrowTypeVisitor() { + @Override public MinorType visit(Null type) { + return MinorType.NULL; + } - static { - ARROW_TYPE_MINOR_TYPE_MAP = new HashMap<>(); - for (MinorType minorType : MinorType.values()) { - if (minorType != MinorType.DECIMAL) { - ARROW_TYPE_MINOR_TYPE_MAP.put(minorType.getType(), minorType); + @Override public MinorType visit(Struct_ type) { + return MinorType.MAP; } - } + + @Override public MinorType visit(List type) { + return MinorType.LIST; + } + + @Override public MinorType visit(Union type) { + return MinorType.UNION; + } + + @Override + public MinorType visit(Int type) { + switch (type.getBitWidth()) { + case 8: + return type.getIsSigned() ? MinorType.TINYINT : MinorType.UINT1; + case 16: + return type.getIsSigned() ? MinorType.SMALLINT : MinorType.UINT2; + case 32: + return type.getIsSigned() ? MinorType.INT : MinorType.UINT4; + case 64: + return type.getIsSigned() ? 
MinorType.BIGINT : MinorType.UINT8; + default: + throw new IllegalArgumentException("only 8, 16, 32, 64 supported: " + type); + } + } + + @Override + public MinorType visit(FloatingPoint type) { + switch (type.getPrecision()) { + case Precision.HALF: + throw new UnsupportedOperationException("NYI: " + type); + case Precision.SINGLE: + return MinorType.FLOAT4; + case Precision.DOUBLE: + return MinorType.FLOAT8; + default: + throw new IllegalArgumentException("unknown precision: " + type); + } + } + + @Override public MinorType visit(Utf8 type) { + return MinorType.VARCHAR; + } + + @Override public MinorType visit(Binary type) { + return MinorType.VARBINARY; + } + + @Override public MinorType visit(Bool type) { + return MinorType.BIT; + } + + @Override public MinorType visit(Decimal type) { + return MinorType.DECIMAL; + } + + @Override public MinorType visit(Date type) { + return MinorType.DATE; + } + + @Override public MinorType visit(Time type) { + return MinorType.TIME; + } + + @Override public MinorType visit(Timestamp type) { + return MinorType.TIMESTAMP; + } + + @Override + public MinorType visit(Interval type) { + switch (type.getUnit()) { + case IntervalUnit.DAY_TIME: + return MinorType.INTERVALDAY; + case IntervalUnit.YEAR_MONTH: + return MinorType.INTERVALYEAR; + default: + throw new IllegalArgumentException("unknown unit: " + type); + } + } + }); } } From c7b0480f5c8dadb78b9586dc4e40f3964929d8ef Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Mon, 3 Oct 2016 14:54:15 -0700 Subject: [PATCH 151/210] ARROW-314: JSONScalar is unnecessary and unused Author: Julien Le Dem Closes #153 from julienledem/jsonscalar and squashes the following commits: 905027c [Julien Le Dem] ARROW-314: JSONScalar is unnecessary and unused --- format/Message.fbs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/format/Message.fbs b/format/Message.fbs index 288f5a1b6b2..e1758bf3638 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -73,10 +73,6 @@ table Interval { unit: IntervalUnit; } -table JSONScalar { - dense:bool=true; -} - /// ---------------------------------------------------------------------- /// Top-level Type value, enabling extensible type-specific metadata. We can /// add new logical types to Type without breaking backwards compatibility @@ -95,8 +91,7 @@ union Type { Interval, List, Struct_, - Union, - JSONScalar + Union } /// ---------------------------------------------------------------------- From c3930a062b2d71e3d277d4db1785e24e9183276f Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Mon, 3 Oct 2016 15:17:32 -0700 Subject: [PATCH 152/210] ARROW-301: Add user field metadata to IPC schemas Author: Julien Le Dem Closes #154 from julienledem/custom and squashes the following commits: 47a02b7 [Julien Le Dem] ARROW-301: Add user field metadata to IPC schemas --- format/Message.fbs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/format/Message.fbs b/format/Message.fbs index e1758bf3638..3d877a2f234 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -119,6 +119,16 @@ table VectorLayout { type: VectorType; } + +/// ---------------------------------------------------------------------- +/// user defined key value pairs to add custom metadata to arrow +/// key namespacing is the responsibility of the user + +table KeyValue { + key: string; + value: [ubyte]; +} + /// ---------------------------------------------------------------------- /// A field represents a named column in a record / row batch or child of a /// nested type. 
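A hedged sketch of building one of these pairs from C++, assuming the usual
flatc-generated helpers for the KeyValue table above (names and payload are
illustrative):

flatbuffers::FlatBufferBuilder fbb;
auto key = fbb.CreateString("org.example.origin");  // user-namespaced key
std::vector<uint8_t> payload = {1, 2, 3};
auto value = fbb.CreateVector(payload);
auto kv = org::apache::arrow::flatbuf::CreateKeyValue(fbb, key, value);
// kv can then be placed in the custom_metadata of a Field or Schema.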
@@ -141,6 +151,8 @@ table Field {
   /// does not include children
   /// each recordbatch will return instances of those Buffers.
   layout: [ VectorLayout ];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
 }

 /// ----------------------------------------------------------------------
@@ -159,6 +171,8 @@ table Schema {
   endianness: Endianness=Little;

   fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
 }

 /// ----------------------------------------------------------------------

From c7e6a0716308766766aaaf4faa2effc5445640c6 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 3 Oct 2016 23:14:41 -0400
Subject: [PATCH 153/210] ARROW-302: [C++/Python] Implement C++ IO interfaces for interacting with Python file and bytes objects

This will enable code (such as arrow IPC or Parquet) that only knows about Arrow's IO subsystem to interact with Python objects in various ways. In other words, when we have in C++:

```
std::shared_ptr<arrow::io::ReadableFileInterface> handle = ...;
handle->Read(nbytes, &out);
```

then the C++ file handle could be invoking the `read` method of a Python object. Same goes for `arrow::io::OutputStream` and `write` methods. There is some data copying overhead in places because of the rigid memory ownership semantics of the `PyBytes` type, but this can't be avoided here.

Another nice thing is that if we have some data in a Python bytes object that we want to expose to some other C++ component, we can wrap it in the `PyBytesReader` which provides zero-copy read access to the underlying data.

Author: Wes McKinney

Closes #152 from wesm/ARROW-302 and squashes the following commits:

2de9f97 [Wes McKinney] Fix compiler warning / bug from OS X
316b845 [Wes McKinney] Code review comments
e791893 [Wes McKinney] Python 2.7 fix
0fc4cf1 [Wes McKinney] cpplint
e9b8c60 [Wes McKinney] Test the size() method and fix bug with missing whence
6481e91 [Wes McKinney] Add a zero-copy reader for PyBytes
7e357eb [Wes McKinney] Get basic Python file read/write working
d470133 [Wes McKinney] Share default implementations of ReadAt, add Buffer-based Read API
737a8db [Wes McKinney] Refactoring, more code sharing with native file interfaces
8be433f [Wes McKinney] Draft PyReadableFile implementation, not yet tested
20a3f28 [Wes McKinney] Draft API for Arrow IO wrappers for Python files
---
 cpp/CMakeLists.txt                      |   2 +
 cpp/src/arrow/io/CMakeLists.txt         |   1 +
 cpp/src/arrow/io/file.cc                |  10 +-
 cpp/src/arrow/io/file.h                 |   6 +-
 cpp/src/arrow/io/hdfs.cc                |  46 ++++-
 cpp/src/arrow/io/hdfs.h                 |  13 +-
 cpp/src/arrow/io/interfaces.cc          |  48 ++++++
 cpp/src/arrow/io/interfaces.h           |  26 +--
 cpp/src/arrow/io/memory.cc              |  40 ++---
 cpp/src/arrow/io/memory.h               |  21 ++-
 python/CMakeLists.txt                   |   1 +
 python/pyarrow/__init__.py              |   5 +-
 python/pyarrow/array.pyx                |  31 ----
 python/pyarrow/error.pxd                |   4 +-
 python/pyarrow/error.pyx                |   2 +-
 python/pyarrow/includes/libarrow_io.pxd |  29 ++++
 python/pyarrow/includes/pyarrow.pxd     |  34 +++-
 python/pyarrow/io.pxd                   |  13 +-
 python/pyarrow/io.pyx                   | 136 ++++++++++-----
 python/pyarrow/parquet.pyx              |   8 +-
 python/pyarrow/table.pyx                |  37 +++-
 python/pyarrow/tests/test_hdfs.py       | 128 ++++++++++++++
 python/pyarrow/tests/test_io.py         | 121 ++++++-------
 python/src/pyarrow/adapters/pandas.cc   |   2 +-
 python/src/pyarrow/common.cc            |  15 ++
 python/src/pyarrow/common.h             |  30 +++-
 python/src/pyarrow/io.cc                | 215 ++++++++++++++++++++++++
 python/src/pyarrow/io.h                 |  97 +++++++++++
 28 files changed, 878 insertions(+), 243 deletions(-)
 create mode 100644 cpp/src/arrow/io/interfaces.cc
 create mode 100644 python/pyarrow/tests/test_hdfs.py
 create mode
100644 python/src/pyarrow/io.cc create mode 100644 python/src/pyarrow/io.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d65c7153196..f70c8ab4bcc 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -166,6 +166,8 @@ else() message(FATAL_ERROR "Unknown build type: ${CMAKE_BUILD_TYPE}") endif () +message(STATUS "Build Type: ${CMAKE_BUILD_TYPE}") + # Add common flags set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}") diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index d2e3491b75f..47bb0893863 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -39,6 +39,7 @@ set(ARROW_IO_TEST_LINK_LIBS set(ARROW_IO_SRCS file.cc + interfaces.cc memory.cc ) diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 87bae7f3928..93f0ad91ee8 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -413,15 +413,7 @@ Status ReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { return impl_->Read(nbytes, bytes_read, out); } -Status ReadableFile::ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { - RETURN_NOT_OK(Seek(position)); - return impl_->Read(nbytes, bytes_read, out); -} - -Status ReadableFile::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { - RETURN_NOT_OK(Seek(position)); +Status ReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { return impl_->ReadBuffer(nbytes, out); } diff --git a/cpp/src/arrow/io/file.h b/cpp/src/arrow/io/file.h index 5e714ea9667..10fe16e5112 100644 --- a/cpp/src/arrow/io/file.h +++ b/cpp/src/arrow/io/file.h @@ -71,11 +71,9 @@ class ARROW_EXPORT ReadableFile : public ReadableFileInterface { Status Close() override; Status Tell(int64_t* position) override; - Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status GetSize(int64_t* size) override; Status Seek(int64_t position) override; diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index a6b4b2f3846..b74f84604f1 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -22,6 +22,8 @@ #include #include "arrow/io/hdfs.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { @@ -89,7 +91,7 @@ class HdfsAnyFileImpl { // Private implementation for read-only files class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { public: - HdfsReadableFileImpl() {} + explicit HdfsReadableFileImpl(MemoryPool* pool) : pool_(pool) {} Status Close() { if (is_open_) { @@ -108,6 +110,19 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) { + auto buffer = std::make_shared(pool_); + RETURN_NOT_OK(buffer->Resize(nbytes)); + + int64_t bytes_read = 0; + RETURN_NOT_OK(ReadAt(position, nbytes, &bytes_read, buffer->mutable_data())); + + if (bytes_read < nbytes) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + + *out = buffer; + return Status::OK(); + } + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { tSize ret = hdfsRead(fs_, file_, reinterpret_cast(buffer), nbytes); RETURN_NOT_OK(CheckReadResult(ret)); @@ -115,6 +130,19 @@ class 
HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { return Status::OK(); } + Status Read(int64_t nbytes, std::shared_ptr* out) { + auto buffer = std::make_shared(pool_); + RETURN_NOT_OK(buffer->Resize(nbytes)); + + int64_t bytes_read = 0; + RETURN_NOT_OK(Read(nbytes, &bytes_read, buffer->mutable_data())); + + if (bytes_read < nbytes) { RETURN_NOT_OK(buffer->Resize(bytes_read)); } + + *out = buffer; + return Status::OK(); + } + Status GetSize(int64_t* size) { hdfsFileInfo* entry = hdfsGetPathInfo(fs_, path_.c_str()); if (entry == nullptr) { return Status::IOError("HDFS: GetPathInfo failed"); } @@ -123,10 +151,16 @@ class HdfsReadableFile::HdfsReadableFileImpl : public HdfsAnyFileImpl { hdfsFreeFileInfo(entry, 1); return Status::OK(); } + + void set_memory_pool(MemoryPool* pool) { pool_ = pool; } + + private: + MemoryPool* pool_; }; -HdfsReadableFile::HdfsReadableFile() { - impl_.reset(new HdfsReadableFileImpl()); +HdfsReadableFile::HdfsReadableFile(MemoryPool* pool) { + if (pool == nullptr) { pool = default_memory_pool(); } + impl_.reset(new HdfsReadableFileImpl(pool)); } HdfsReadableFile::~HdfsReadableFile() { @@ -144,7 +178,7 @@ Status HdfsReadableFile::ReadAt( Status HdfsReadableFile::ReadAt( int64_t position, int64_t nbytes, std::shared_ptr* out) { - return Status::NotImplemented("Not yet implemented"); + return impl_->ReadAt(position, nbytes, out); } bool HdfsReadableFile::supports_zero_copy() const { @@ -155,6 +189,10 @@ Status HdfsReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buff return impl_->Read(nbytes, bytes_read, buffer); } +Status HdfsReadableFile::Read(int64_t nbytes, std::shared_ptr* buffer) { + return impl_->Read(nbytes, buffer); +} + Status HdfsReadableFile::GetSize(int64_t* size) { return impl_->GetSize(size); } diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 39720cc17e4..4a4e3ec5f51 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -164,6 +164,12 @@ class ARROW_EXPORT HdfsReadableFile : public ReadableFileInterface { Status GetSize(int64_t* size) override; + // NOTE: If you wish to read a particular range of a file in a multithreaded + // context, you may prefer to use ReadAt to avoid locking issues + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + + Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status ReadAt( int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; @@ -174,17 +180,16 @@ class ARROW_EXPORT HdfsReadableFile : public ReadableFileInterface { Status Seek(int64_t position) override; Status Tell(int64_t* position) override; - // NOTE: If you wish to read a particular range of a file in a multithreaded - // context, you may prefer to use ReadAt to avoid locking issues - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + void set_memory_pool(MemoryPool* pool); private: + explicit HdfsReadableFile(MemoryPool* pool = nullptr); + class ARROW_NO_EXPORT HdfsReadableFileImpl; std::unique_ptr impl_; friend class HdfsClient::HdfsClientImpl; - HdfsReadableFile(); DISALLOW_COPY_AND_ASSIGN(HdfsReadableFile); }; diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc new file mode 100644 index 00000000000..44986cee1af --- /dev/null +++ b/cpp/src/arrow/io/interfaces.cc @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/io/interfaces.h" + +#include +#include + +#include "arrow/util/buffer.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace io { + +FileInterface::~FileInterface() {} + +ReadableFileInterface::ReadableFileInterface() { + set_mode(FileMode::READ); +} + +Status ReadableFileInterface::ReadAt( + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, bytes_read, out); +} + +Status ReadableFileInterface::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, out); +} + +} // namespace io +} // namespace arrow diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h index fa34b43b2c9..db0c059c6e2 100644 --- a/cpp/src/arrow/io/interfaces.h +++ b/cpp/src/arrow/io/interfaces.h @@ -22,10 +22,12 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/visibility.h" namespace arrow { class Buffer; +class MemoryPool; class Status; namespace io { @@ -43,9 +45,9 @@ class FileSystemClient { virtual ~FileSystemClient() {} }; -class FileInterface { +class ARROW_EXPORT FileInterface { public: - virtual ~FileInterface() {} + virtual ~FileInterface() = 0; virtual Status Close() = 0; virtual Status Tell(int64_t* position) = 0; @@ -54,7 +56,6 @@ class FileInterface { protected: FileInterface() {} FileMode::type mode_; - void set_mode(FileMode::type mode) { mode_ = mode; } private: @@ -74,6 +75,9 @@ class Writeable { class Readable { public: virtual Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; + + // Does not copy if not necessary + virtual Status Read(int64_t nbytes, std::shared_ptr* out) = 0; }; class OutputStream : public FileInterface, public Writeable { @@ -86,21 +90,21 @@ class InputStream : public FileInterface, public Readable { InputStream() {} }; -class ReadableFileInterface : public InputStream, public Seekable { +class ARROW_EXPORT ReadableFileInterface : public InputStream, public Seekable { public: - virtual Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) = 0; - virtual Status GetSize(int64_t* size) = 0; - // Does not copy if not necessary + virtual bool supports_zero_copy() const = 0; + + // Read at position, provide default implementations using Read(...), but can + // be overridden virtual Status ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) = 0; + int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out); - virtual bool supports_zero_copy() const = 0; + virtual Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out); protected: - ReadableFileInterface() { set_mode(FileMode::READ); } + ReadableFileInterface(); }; class WriteableFileInterface : public OutputStream, public Seekable { diff --git 
a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index c168c91c5f8..7d6e02e25b4 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -123,6 +123,8 @@ MemoryMappedFile::MemoryMappedFile(FileMode::type mode) { ReadableFileInterface::set_mode(mode); } +MemoryMappedFile::~MemoryMappedFile() {} + Status MemoryMappedFile::Open(const std::string& path, FileMode::type mode, std::shared_ptr* out) { std::shared_ptr result(new MemoryMappedFile(mode)); @@ -161,16 +163,8 @@ Status MemoryMappedFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) return Status::OK(); } -Status MemoryMappedFile::ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) { - RETURN_NOT_OK(impl_->Seek(position)); - return Read(nbytes, bytes_read, out); -} - -Status MemoryMappedFile::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { - nbytes = std::min(nbytes, impl_->size() - position); - RETURN_NOT_OK(impl_->Seek(position)); +Status MemoryMappedFile::Read(int64_t nbytes, std::shared_ptr* out) { + nbytes = std::min(nbytes, impl_->size() - impl_->position()); *out = std::make_shared(impl_->head(), nbytes); impl_->advance(nbytes); return Status::OK(); @@ -246,6 +240,11 @@ Status BufferOutputStream::Reserve(int64_t nbytes) { // ---------------------------------------------------------------------- // In-memory buffer reader +BufferReader::BufferReader(const uint8_t* buffer, int buffer_size) + : buffer_(buffer), buffer_size_(buffer_size), position_(0) {} + +BufferReader::~BufferReader() {} + Status BufferReader::Close() { // no-op return Status::OK(); @@ -256,20 +255,6 @@ Status BufferReader::Tell(int64_t* position) { return Status::OK(); } -Status BufferReader::ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) { - RETURN_NOT_OK(Seek(position)); - return Read(nbytes, bytes_read, buffer); -} - -Status BufferReader::ReadAt( - int64_t position, int64_t nbytes, std::shared_ptr* out) { - int64_t size = std::min(nbytes, buffer_size_ - position_); - *out = std::make_shared(buffer_ + position, size); - position_ += nbytes; - return Status::OK(); -} - bool BufferReader::supports_zero_copy() const { return true; } @@ -281,6 +266,13 @@ Status BufferReader::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) return Status::OK(); } +Status BufferReader::Read(int64_t nbytes, std::shared_ptr* out) { + int64_t size = std::min(nbytes, buffer_size_ - position_); + *out = std::make_shared(buffer_ + position_, size); + position_ += nbytes; + return Status::OK(); +} + Status BufferReader::GetSize(int64_t* size) { *size = buffer_size_; return Status::OK(); diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h index 6989d732ca7..df2fe8d6efb 100644 --- a/cpp/src/arrow/io/memory.h +++ b/cpp/src/arrow/io/memory.h @@ -61,6 +61,8 @@ class ARROW_EXPORT BufferOutputStream : public OutputStream { // A memory source that uses memory-mapped files for memory interactions class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { public: + ~MemoryMappedFile(); + static Status Open(const std::string& path, FileMode::type mode, std::shared_ptr* out); @@ -73,11 +75,8 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { // Required by ReadableFileInterface, copies memory into out Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; - Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; - - // Read into a buffer, zero copy if possible - Status 
ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + // Zero copy read + Status Read(int64_t nbytes, std::shared_ptr* out) override; bool supports_zero_copy() const override; @@ -100,17 +99,17 @@ class ARROW_EXPORT MemoryMappedFile : public ReadWriteFileInterface { class ARROW_EXPORT BufferReader : public ReadableFileInterface { public: - BufferReader(const uint8_t* buffer, int buffer_size) - : buffer_(buffer), buffer_size_(buffer_size), position_(0) {} + BufferReader(const uint8_t* buffer, int buffer_size); + ~BufferReader(); Status Close() override; Status Tell(int64_t* position) override; - Status ReadAt( - int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; - Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; - Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) override; + + // Zero copy read + Status Read(int64_t nbytes, std::shared_ptr* out) override; + Status GetSize(int64_t* size) override; Status Seek(int64_t position) override; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 6357e3c1725..77a771ab21c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -432,6 +432,7 @@ set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc src/pyarrow/helpers.cc + src/pyarrow/io.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 40a09c2feae..7561f6d46df 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -41,6 +41,5 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch, from_pandas_dataframe - -from pyarrow.table import Column, Table +from pyarrow.array import RowBatch +from pyarrow.table import Column, Table, from_pandas_dataframe diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 5229b429f58..cdbe73ad21f 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -35,7 +35,6 @@ from pyarrow.scalar import NA from pyarrow.schema cimport Schema import pyarrow.schema as schema -from pyarrow.table cimport Table def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() @@ -254,35 +253,6 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False): return box_arrow_array(out) -def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False): - """ - Convert pandas.DataFrame to an Arrow Table - - Parameters - ---------- - df: pandas.DataFrame - - name: str - - timestamps_to_ms: bool - Convert datetime columns to ms resolution. This is needed for - compability with other functionality like Parquet I/O which - only supports milliseconds. - """ - cdef: - list names = [] - list arrays = [] - - for name in df.columns: - col = df[name] - arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms) - - names.append(name) - arrays.append(arr) - - return Table.from_arrays(names, arrays, name=name) - - cdef object series_as_ndarray(object obj): import pandas as pd @@ -324,4 +294,3 @@ cdef class RowBatch: def __getitem__(self, i): return self.arrays[i] - diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index 1fb6fad396a..891d1ac1c7e 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -16,7 +16,7 @@ # under the License. 
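A usage sketch of the relocated helper: per the __init__.py and array.pyx hunks above, from_pandas_dataframe moved out of pyarrow.array and is now imported from pyarrow.table (its relocated definition appears further down in this patch), with a top-level re-export. Column names and values below are illustrative only:

    import pandas as pd
    import pyarrow

    df = pd.DataFrame({'ints': [1, 2, 3], 'floats': [0.1, 0.2, 0.3]})

    # Re-exported from pyarrow.table via pyarrow/__init__.py
    table = pyarrow.from_pandas_dataframe(df, name='example')

    # Equivalent spelling through the classmethod that now delegates to it
    table = pyarrow.Table.from_pandas(df, name='example')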
from pyarrow.includes.libarrow cimport CStatus -from pyarrow.includes.pyarrow cimport * +from pyarrow.includes.pyarrow cimport PyStatus cdef int check_cstatus(const CStatus& status) nogil except -1 -cdef int check_status(const Status& status) nogil except -1 +cdef int check_status(const PyStatus& status) nogil except -1 diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 244019321a7..a2c53fed8c6 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -30,7 +30,7 @@ cdef int check_cstatus(const CStatus& status) nogil except -1: with gil: raise ArrowException(frombytes(c_message)) -cdef int check_status(const Status& status) nogil except -1: +cdef int check_status(const PyStatus& status) nogil except -1: if status.ok(): return 0 diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index f338a436814..56d8d4cf614 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -18,6 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport MemoryPool cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: enum FileMode" arrow::io::FileMode::type": @@ -35,6 +36,7 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: FileMode mode() cdef cppclass Readable: + CStatus ReadB" Read"(int64_t nbytes, shared_ptr[Buffer]* out) CStatus Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) cdef cppclass Seekable: @@ -66,6 +68,24 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: pass +cdef extern from "arrow/io/file.h" namespace "arrow::io" nogil: + cdef cppclass FileOutputStream(OutputStream): + @staticmethod + CStatus Open(const c_string& path, shared_ptr[FileOutputStream]* file) + + int file_descriptor() + + cdef cppclass ReadableFile(ReadableFileInterface): + @staticmethod + CStatus Open(const c_string& path, shared_ptr[ReadableFile]* file) + + @staticmethod + CStatus Open(const c_string& path, MemoryPool* memory_pool, + shared_ptr[ReadableFile]* file) + + int file_descriptor() + + cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: CStatus ConnectLibHdfs() @@ -120,3 +140,12 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: int32_t buffer_size, int16_t replication, int64_t default_block_size, shared_ptr[HdfsOutputStream]* handle) + + +cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: + cdef cppclass BufferReader(ReadableFileInterface): + BufferReader(const uint8_t* data, int64_t nbytes) + + cdef cppclass BufferOutputStream(OutputStream): + # TODO(wesm) + pass diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 92c814706fd..4c971665ff6 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,15 +18,18 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, +from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CStatus, Type, MemoryPool) +cimport pyarrow.includes.libarrow_io as arrow_io + + cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: # We can later add more of the common status factory methods as needed - cdef Status Status_OK "Status::OK"() + cdef PyStatus PyStatus_OK "Status::OK"() - cdef cppclass Status: - Status() + cdef cppclass PyStatus "pyarrow::Status": + PyStatus() c_string ToString() @@ -40,12 +43,25 @@ cdef extern from 
"pyarrow/api.h" namespace "pyarrow" nogil: c_bool IsArrowError() shared_ptr[CDataType] GetPrimitiveType(Type type) - Status ConvertPySequence(object obj, shared_ptr[CArray]* out) + PyStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) - Status PandasToArrow(MemoryPool* pool, object ao, shared_ptr[CArray]* out) - Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, - shared_ptr[CArray]* out) + PyStatus PandasToArrow(MemoryPool* pool, object ao, + shared_ptr[CArray]* out) + PyStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, + shared_ptr[CArray]* out) - Status ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) + PyStatus ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, + PyObject** out) MemoryPool* GetMemoryPool() + + +cdef extern from "pyarrow/io.h" namespace "pyarrow" nogil: + cdef cppclass PyReadableFile(arrow_io.ReadableFileInterface): + PyReadableFile(object fo) + + cdef cppclass PyOutputStream(arrow_io.OutputStream): + PyOutputStream(object fo) + + cdef cppclass PyBytesReader(arrow_io.BufferReader): + PyBytesReader(object fo) diff --git a/python/pyarrow/io.pxd b/python/pyarrow/io.pxd index f55fc0ab53a..1dbb3fd76bb 100644 --- a/python/pyarrow/io.pxd +++ b/python/pyarrow/io.pxd @@ -23,11 +23,16 @@ from pyarrow.includes.libarrow_io cimport (ReadableFileInterface, OutputStream) -cdef class NativeFileInterface: +cdef class NativeFile: + cdef: + shared_ptr[ReadableFileInterface] rd_file + shared_ptr[OutputStream] wr_file + bint is_readonly + bint is_open # By implementing these "virtual" functions (all functions in Cython - # extension classes are technically virtual in the C++ sense)m we can - # expose the arrow::io abstract file interfaces to other components - # throughout the suite of Arrow C++ libraries + # extension classes are technically virtual in the C++ sense) we can expose + # the arrow::io abstract file interfaces to other components throughout the + # suite of Arrow C++ libraries cdef read_handle(self, shared_ptr[ReadableFileInterface]* file) cdef write_handle(self, shared_ptr[OutputStream]* file) diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index f2eee260c33..e6e2b625e87 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -242,6 +242,9 @@ cdef class HdfsClient: cdef int16_t c_replication = replication or 0 cdef int64_t c_default_block_size = default_block_size or 0 + cdef shared_ptr[HdfsOutputStream] wr_handle + cdef shared_ptr[HdfsReadableFile] rd_handle + if mode in ('wb', 'ab'): if mode == 'ab': append = True @@ -251,13 +254,17 @@ cdef class HdfsClient: self.client.get() .OpenWriteable(c_path, append, c_buffer_size, c_replication, c_default_block_size, - &out.wr_file)) + &wr_handle)) + + out.wr_file = wr_handle out.is_readonly = False else: with nogil: check_cstatus(self.client.get() - .OpenReadable(c_path, &out.rd_file)) + .OpenReadable(c_path, &rd_handle)) + + out.rd_file = rd_handle out.is_readonly = True if c_buffer_size == 0: @@ -314,25 +321,8 @@ cdef class HdfsClient: f = self.open(path, 'rb', buffer_size=buffer_size) f.download(stream) -cdef class NativeFileInterface: - - cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): - raise NotImplementedError - - cdef write_handle(self, shared_ptr[OutputStream]* file): - raise NotImplementedError - -cdef class HdfsFile(NativeFileInterface): - cdef: - shared_ptr[HdfsReadableFile] rd_file - shared_ptr[HdfsOutputStream] wr_file - bint is_readonly - bint is_open - object parent - cdef readonly: - int32_t 
buffer_size - object mode +cdef class NativeFile: def __cinit__(self): self.is_open = False @@ -356,14 +346,6 @@ cdef class HdfsFile(NativeFileInterface): check_cstatus(self.wr_file.get().Close()) self.is_open = False - cdef _assert_readable(self): - if not self.is_readonly: - raise IOError("only valid on readonly files") - - cdef _assert_writeable(self): - if self.is_readonly: - raise IOError("only valid on writeonly files") - cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): self._assert_readable() file[0] = self.rd_file @@ -372,6 +354,14 @@ cdef class HdfsFile(NativeFileInterface): self._assert_writeable() file[0] = self.wr_file + def _assert_readable(self): + if not self.is_readonly: + raise IOError("only valid on readonly files") + + def _assert_writeable(self): + if self.is_readonly: + raise IOError("only valid on writeonly files") + def size(self): cdef int64_t size self._assert_readable() @@ -393,6 +383,83 @@ cdef class HdfsFile(NativeFileInterface): with nogil: check_cstatus(self.rd_file.get().Seek(position)) + def write(self, data): + """ + Write bytes-like (unicode, encoded to UTF-8) to file + """ + self._assert_writeable() + + data = tobytes(data) + + cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) + cdef int64_t bufsize = len(data) + with nogil: + check_cstatus(self.wr_file.get().Write(buf, bufsize)) + + def read(self, int nbytes): + cdef: + int64_t bytes_read = 0 + uint8_t* buf + shared_ptr[Buffer] out + + self._assert_readable() + + with nogil: + check_cstatus(self.rd_file.get() + .ReadB(nbytes, &out)) + + result = cp.PyBytes_FromStringAndSize( + out.get().data(), out.get().size()) + + return result + + +# ---------------------------------------------------------------------- +# Python file-like objects + +cdef class PythonFileInterface(NativeFile): + cdef: + object handle + + def __cinit__(self, handle, mode='w'): + self.handle = handle + + if mode.startswith('w'): + self.wr_file.reset(new pyarrow.PyOutputStream(handle)) + self.is_readonly = 0 + elif mode.startswith('r'): + self.rd_file.reset(new pyarrow.PyReadableFile(handle)) + self.is_readonly = 1 + else: + raise ValueError('Invalid file mode: {0}'.format(mode)) + + self.is_open = True + + +cdef class BytesReader(NativeFile): + cdef: + object obj + + def __cinit__(self, obj): + if not isinstance(obj, bytes): + raise ValueError('Must pass bytes object') + + self.obj = obj + self.is_readonly = 1 + self.is_open = True + + self.rd_file.reset(new pyarrow.PyBytesReader(obj)) + +# ---------------------------------------------------------------------- +# Specialization for HDFS + + +cdef class HdfsFile(NativeFile): + cdef readonly: + int32_t buffer_size + object mode + object parent + def read(self, int nbytes): """ Read indicated number of bytes from the file, up to EOF @@ -504,16 +571,3 @@ cdef class HdfsFile(NativeFileInterface): writer_thread.join() if exc_info is not None: raise exc_info[0], exc_info[1], exc_info[2] - - def write(self, data): - """ - Write bytes-like (unicode, encoded to UTF-8) to file - """ - self._assert_writeable() - - data = tobytes(data) - - cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) - cdef int64_t bufsize = len(data) - with nogil: - check_cstatus(self.wr_file.get().Write(buf, bufsize)) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 099e148abc1..ca0176a7c04 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -27,10 +27,10 @@ cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.compat import tobytes from pyarrow.error 
import ArrowException from pyarrow.error cimport check_cstatus -from pyarrow.io import NativeFileInterface +from pyarrow.io import NativeFile from pyarrow.table cimport Table -from pyarrow.io cimport NativeFileInterface +from pyarrow.io cimport NativeFile import six @@ -54,7 +54,7 @@ cdef class ParquetReader: new FileReader(default_memory_pool(), ParquetFileReader.OpenFile(path))) - cdef open_native_file(self, NativeFileInterface file): + cdef open_native_file(self, NativeFile file): cdef shared_ptr[ReadableFileInterface] cpp_handle file.read_handle(&cpp_handle) @@ -84,7 +84,7 @@ def read_table(source, columns=None): if isinstance(source, six.string_types): reader.open_local_file(source) - elif isinstance(source, NativeFileInterface): + elif isinstance(source, NativeFile): reader.open_native_file(source) return reader.read_all() diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index f02d36f520b..ade82aa6761 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -25,10 +25,12 @@ cimport pyarrow.includes.pyarrow as pyarrow import pyarrow.config from pyarrow.array cimport Array, box_arrow_array -from pyarrow.compat import frombytes, tobytes from pyarrow.error cimport check_status from pyarrow.schema cimport box_data_type, box_schema +from pyarrow.compat import frombytes, tobytes + + cdef class ChunkedArray: ''' Do not call this class's constructor directly. @@ -161,7 +163,7 @@ cdef class Table: @staticmethod def from_pandas(df, name=None): - pass + return from_pandas_dataframe(df, name=name) @staticmethod def from_arrays(names, arrays, name=None): @@ -264,3 +266,34 @@ cdef class Table: def __get__(self): return (self.num_rows, self.num_columns) + + +def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False): + """ + Convert pandas.DataFrame to an Arrow Table + + Parameters + ---------- + df: pandas.DataFrame + + name: str + + timestamps_to_ms: bool + Convert datetime columns to ms resolution. This is needed for + compability with other functionality like Parquet I/O which + only supports milliseconds. + """ + from pyarrow.array import from_pandas_series + + cdef: + list names = [] + list arrays = [] + + for name in df.columns: + col = df[name] + arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms) + + names.append(name) + arrays.append(arr) + + return Table.from_arrays(names, arrays, name=name) diff --git a/python/pyarrow/tests/test_hdfs.py b/python/pyarrow/tests/test_hdfs.py new file mode 100644 index 00000000000..ed8d41994cd --- /dev/null +++ b/python/pyarrow/tests/test_hdfs.py @@ -0,0 +1,128 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
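Since read_table() in the parquet.pyx hunk above now dispatches on NativeFile rather than the old NativeFileInterface, any of the wrappers defined in io.pyx can feed the Parquet reader. A minimal sketch, assuming pyarrow was built with the Parquet extension and that a Parquet file exists at the illustrative path 'example.parquet':

    import pyarrow.io as io
    import pyarrow.parquet as pq

    # Path input: read_table() calls open_local_file() internally.
    table = pq.read_table('example.parquet')

    # NativeFile input: read_table() grabs the underlying C++ handle via
    # read_handle(); BytesReader wraps the bytes without copying them.
    with open('example.parquet', 'rb') as f:
        data = f.read()
    table = pq.read_table(io.BytesReader(data))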
+ +from io import BytesIO +from os.path import join as pjoin +import os +import random + +import pytest + +import pyarrow.io as io + +# ---------------------------------------------------------------------- +# HDFS tests + + +def hdfs_test_client(): + host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') + user = os.environ['ARROW_HDFS_TEST_USER'] + try: + port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500)) + except ValueError: + raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' + 'an integer') + + return io.HdfsClient.connect(host, port, user) + + +libhdfs = pytest.mark.skipif(not io.have_libhdfs(), + reason='No libhdfs available on system') + + +HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + + +@pytest.fixture(scope='session') +def hdfs(request): + fixture = hdfs_test_client() + + def teardown(): + fixture.delete(HDFS_TMP_PATH, recursive=True) + fixture.close() + request.addfinalizer(teardown) + return fixture + + +@libhdfs +def test_hdfs_close(): + client = hdfs_test_client() + assert client.is_open + client.close() + assert not client.is_open + + with pytest.raises(Exception): + client.ls('/') + + +@libhdfs +def test_hdfs_mkdir(hdfs): + path = pjoin(HDFS_TMP_PATH, 'test-dir/test-dir') + parent_path = pjoin(HDFS_TMP_PATH, 'test-dir') + + hdfs.mkdir(path) + assert hdfs.exists(path) + + hdfs.delete(parent_path, recursive=True) + assert not hdfs.exists(path) + + +@libhdfs +def test_hdfs_ls(hdfs): + base_path = pjoin(HDFS_TMP_PATH, 'ls-test') + hdfs.mkdir(base_path) + + dir_path = pjoin(base_path, 'a-dir') + f1_path = pjoin(base_path, 'a-file-1') + + hdfs.mkdir(dir_path) + + f = hdfs.open(f1_path, 'wb') + f.write('a' * 10) + + contents = sorted(hdfs.ls(base_path, False)) + assert contents == [dir_path, f1_path] + + +@libhdfs +def test_hdfs_download_upload(hdfs): + base_path = pjoin(HDFS_TMP_PATH, 'upload-test') + + data = b'foobarbaz' + buf = BytesIO(data) + buf.seek(0) + + hdfs.upload(base_path, buf) + + out_buf = BytesIO() + hdfs.download(base_path, out_buf) + out_buf.seek(0) + assert out_buf.getvalue() == data + + +@libhdfs +def test_hdfs_file_context_manager(hdfs): + path = pjoin(HDFS_TMP_PATH, 'ctx-manager') + + data = b'foo' + with hdfs.open(path, 'wb') as f: + f.write(data) + + with hdfs.open(path, 'rb') as f: + assert f.size() == 3 + result = f.read(10) + assert result == data diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index eb92e8ea93a..9a41ebe3e8c 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -16,112 +16,85 @@ # under the License. 
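The new test_hdfs.py module above doubles as usage documentation for the client. A condensed sketch of the same round trip outside pytest (the connection settings are placeholders; the tests read theirs from the ARROW_HDFS_TEST_* environment variables):

    import pyarrow.io as io

    client = io.HdfsClient.connect('localhost', 20500, 'hadoop-user')

    path = '/tmp/pyarrow-demo'
    with client.open(path, 'wb') as f:
        f.write(b'foobarbaz')

    with client.open(path, 'rb') as f:
        assert f.read(100) == b'foobarbaz'   # read() stops at EOF

    client.delete(path, recursive=False)
    client.close()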
from io import BytesIO -from os.path import join as pjoin -import os -import random - import pytest +from pyarrow.compat import u import pyarrow.io as io -#---------------------------------------------------------------------- -# HDFS tests +# ---------------------------------------------------------------------- +# Python file-like objects -def hdfs_test_client(): - host = os.environ.get('ARROW_HDFS_TEST_HOST', 'localhost') - user = os.environ['ARROW_HDFS_TEST_USER'] - try: - port = int(os.environ.get('ARROW_HDFS_TEST_PORT', 20500)) - except ValueError: - raise ValueError('Env variable ARROW_HDFS_TEST_PORT was not ' - 'an integer') +def test_python_file_write(): + buf = BytesIO() - return io.HdfsClient.connect(host, port, user) + f = io.PythonFileInterface(buf) + assert f.tell() == 0 -libhdfs = pytest.mark.skipif(not io.have_libhdfs(), - reason='No libhdfs available on system') + s1 = b'enga\xc3\xb1ado' + s2 = b'foobar' + f.write(s1.decode('utf8')) + assert f.tell() == len(s1) -HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + f.write(s2) + expected = s1 + s2 -@pytest.fixture(scope='session') -def hdfs(request): - fixture = hdfs_test_client() - def teardown(): - fixture.delete(HDFS_TMP_PATH, recursive=True) - fixture.close() - request.addfinalizer(teardown) - return fixture + result = buf.getvalue() + assert result == expected + f.close() -@libhdfs -def test_hdfs_close(): - client = hdfs_test_client() - assert client.is_open - client.close() - assert not client.is_open - with pytest.raises(Exception): - client.ls('/') +def test_python_file_read(): + data = b'some sample data' + buf = BytesIO(data) + f = io.PythonFileInterface(buf, mode='r') -@libhdfs -def test_hdfs_mkdir(hdfs): - path = pjoin(HDFS_TMP_PATH, 'test-dir/test-dir') - parent_path = pjoin(HDFS_TMP_PATH, 'test-dir') + assert f.size() == len(data) - hdfs.mkdir(path) - assert hdfs.exists(path) + assert f.tell() == 0 - hdfs.delete(parent_path, recursive=True) - assert not hdfs.exists(path) + assert f.read(4) == b'some' + assert f.tell() == 4 + f.seek(0) + assert f.tell() == 0 -@libhdfs -def test_hdfs_ls(hdfs): - base_path = pjoin(HDFS_TMP_PATH, 'ls-test') - hdfs.mkdir(base_path) + f.seek(5) + assert f.tell() == 5 - dir_path = pjoin(base_path, 'a-dir') - f1_path = pjoin(base_path, 'a-file-1') + assert f.read(50) == b'sample data' - hdfs.mkdir(dir_path) + f.close() - f = hdfs.open(f1_path, 'wb') - f.write('a' * 10) - contents = sorted(hdfs.ls(base_path, False)) - assert contents == [dir_path, f1_path] +def test_bytes_reader(): + # Like a BytesIO, but zero-copy underneath for C++ consumers + data = b'some sample data' + f = io.BytesReader(data) + assert f.tell() == 0 -@libhdfs -def test_hdfs_download_upload(hdfs): - base_path = pjoin(HDFS_TMP_PATH, 'upload-test') + assert f.size() == len(data) - data = b'foobarbaz' - buf = BytesIO(data) - buf.seek(0) + assert f.read(4) == b'some' + assert f.tell() == 4 - hdfs.upload(base_path, buf) + f.seek(0) + assert f.tell() == 0 - out_buf = BytesIO() - hdfs.download(base_path, out_buf) - out_buf.seek(0) - assert out_buf.getvalue() == data + f.seek(5) + assert f.tell() == 5 + assert f.read(50) == b'sample data' -@libhdfs -def test_hdfs_file_context_manager(hdfs): - path = pjoin(HDFS_TMP_PATH, 'ctx-manager') + f.close() - data = b'foo' - with hdfs.open(path, 'wb') as f: - f.write(data) - with hdfs.open(path, 'rb') as f: - assert f.size() == 3 - result = f.read(10) - assert result == data +def test_bytes_reader_non_bytes(): + with pytest.raises(ValueError): + 
io.BytesReader(u('some sample data')) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index a4e7fb6f3bb..d224074d652 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -618,7 +618,7 @@ class ArrowDeserializer { Status OutputFromData(int type, void* data) { // Zero-Copy. We can pass the data pointer directly to NumPy. Py_INCREF(py_ref_); - OwnedRef py_ref(py_ref); + OwnedRef py_ref(py_ref_); npy_intp dims[1] = {col_->length()}; out_ = reinterpret_cast(PyArray_SimpleNewFromData(1, dims, type, data)); diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index a2748f99b67..82b14fdf401 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -68,4 +68,19 @@ arrow::MemoryPool* GetMemoryPool() { return &memory_pool; } +// ---------------------------------------------------------------------- +// PyBytesBuffer + +PyBytesBuffer::PyBytesBuffer(PyObject* obj) + : Buffer(reinterpret_cast(PyBytes_AS_STRING(obj)), + PyBytes_GET_SIZE(obj)), + obj_(obj) { + Py_INCREF(obj_); +} + +PyBytesBuffer::~PyBytesBuffer() { + PyGILGuard lock; + Py_DECREF(obj_); +} + } // namespace pyarrow diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index fb0ba3e4822..bc599f84fab 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -19,9 +19,8 @@ #define PYARROW_COMMON_H #include "pyarrow/config.h" - #include "arrow/util/buffer.h" - +#include "arrow/util/macros.h" #include "pyarrow/visibility.h" namespace arrow { class MemoryPool; } @@ -83,6 +82,20 @@ struct PyObjectStringify { } }; +class PyGILGuard { + public: + PyGILGuard() { + state_ = PyGILState_Ensure(); + } + + ~PyGILGuard() { + PyGILState_Release(state_); + } + private: + PyGILState_STATE state_; + DISALLOW_COPY_AND_ASSIGN(PyGILGuard); +}; + // TODO(wesm): We can just let errors pass through. To be explored later #define RETURN_IF_PYERROR() \ if (PyErr_Occurred()) { \ @@ -100,8 +113,8 @@ PYARROW_EXPORT arrow::MemoryPool* GetMemoryPool(); class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { public: - NumPyBuffer(PyArrayObject* arr) : - Buffer(nullptr, 0) { + NumPyBuffer(PyArrayObject* arr) + : Buffer(nullptr, 0) { arr_ = arr; Py_INCREF(arr); @@ -117,6 +130,15 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { PyArrayObject* arr_; }; +class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer { + public: + PyBytesBuffer(PyObject* obj); + ~PyBytesBuffer(); + + private: + PyObject* obj_; +}; + } // namespace pyarrow #endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc new file mode 100644 index 00000000000..35054e9025a --- /dev/null +++ b/python/src/pyarrow/io.cc @@ -0,0 +1,215 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/io.h" + +#include +#include + +#include +#include +#include + +#include "pyarrow/common.h" +#include "pyarrow/status.h" + +namespace pyarrow { + +// ---------------------------------------------------------------------- +// Python file + +PythonFile::PythonFile(PyObject* file) + : file_(file) { + Py_INCREF(file_); +} + +PythonFile::~PythonFile() { + Py_DECREF(file_); +} + +static arrow::Status CheckPyError() { + if (PyErr_Occurred()) { + PyObject *exc_type, *exc_value, *traceback; + PyErr_Fetch(&exc_type, &exc_value, &traceback); + PyObjectStringify stringified(exc_value); + std::string message(stringified.bytes); + Py_DECREF(exc_type); + Py_DECREF(exc_value); + Py_DECREF(traceback); + PyErr_Clear(); + return arrow::Status::IOError(message); + } + return arrow::Status::OK(); +} + +arrow::Status PythonFile::Close() { + // whence: 0 for relative to start of file, 2 for end of file + PyObject* result = PyObject_CallMethod(file_, "close", "()"); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return arrow::Status::OK(); +} + +arrow::Status PythonFile::Seek(int64_t position, int whence) { + // whence: 0 for relative to start of file, 2 for end of file + PyObject* result = PyObject_CallMethod(file_, "seek", "(ii)", position, whence); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return arrow::Status::OK(); +} + +arrow::Status PythonFile::Read(int64_t nbytes, PyObject** out) { + PyObject* result = PyObject_CallMethod(file_, "read", "(i)", nbytes); + ARROW_RETURN_NOT_OK(CheckPyError()); + *out = result; + return arrow::Status::OK(); +} + +arrow::Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { + PyObject* py_data = PyBytes_FromStringAndSize( + reinterpret_cast(data), nbytes); + ARROW_RETURN_NOT_OK(CheckPyError()); + + PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data); + Py_DECREF(py_data); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return arrow::Status::OK(); +} + +arrow::Status PythonFile::Tell(int64_t* position) { + PyObject* result = PyObject_CallMethod(file_, "tell", "()"); + ARROW_RETURN_NOT_OK(CheckPyError()); + + *position = PyLong_AsLongLong(result); + Py_DECREF(result); + + // PyLong_AsLongLong can raise OverflowError + ARROW_RETURN_NOT_OK(CheckPyError()); + + return arrow::Status::OK(); +} + +// ---------------------------------------------------------------------- +// Seekable input stream + +PyReadableFile::PyReadableFile(PyObject* file) { + file_.reset(new PythonFile(file)); +} + +PyReadableFile::~PyReadableFile() {} + +arrow::Status PyReadableFile::Close() { + PyGILGuard lock; + return file_->Close(); +} + +arrow::Status PyReadableFile::Seek(int64_t position) { + PyGILGuard lock; + return file_->Seek(position, 0); +} + +arrow::Status PyReadableFile::Tell(int64_t* position) { + PyGILGuard lock; + return file_->Tell(position); +} + +arrow::Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + PyGILGuard lock; + PyObject* bytes_obj; + ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); + + *bytes_read = PyBytes_GET_SIZE(bytes_obj); + std::memcpy(out, PyBytes_AS_STRING(bytes_obj), *bytes_read); + Py_DECREF(bytes_obj); + + return arrow::Status::OK(); +} + +arrow::Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { + PyGILGuard lock; + + PyObject* bytes_obj; + ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); + + *out = 
std::make_shared(bytes_obj); + Py_DECREF(bytes_obj); + + return arrow::Status::OK(); +} + +arrow::Status PyReadableFile::GetSize(int64_t* size) { + PyGILGuard lock; + + int64_t current_position;; + ARROW_RETURN_NOT_OK(file_->Tell(¤t_position)); + + ARROW_RETURN_NOT_OK(file_->Seek(0, 2)); + + int64_t file_size; + ARROW_RETURN_NOT_OK(file_->Tell(&file_size)); + + // Restore previous file position + ARROW_RETURN_NOT_OK(file_->Seek(current_position, 0)); + + *size = file_size; + return arrow::Status::OK(); +} + +bool PyReadableFile::supports_zero_copy() const { + return false; +} + +// ---------------------------------------------------------------------- +// Output stream + +PyOutputStream::PyOutputStream(PyObject* file) { + file_.reset(new PythonFile(file)); +} + +PyOutputStream::~PyOutputStream() {} + +arrow::Status PyOutputStream::Close() { + PyGILGuard lock; + return file_->Close(); +} + +arrow::Status PyOutputStream::Tell(int64_t* position) { + PyGILGuard lock; + return file_->Tell(position); +} + +arrow::Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { + PyGILGuard lock; + return file_->Write(data, nbytes); +} + +// ---------------------------------------------------------------------- +// A readable file that is backed by a PyBytes + +PyBytesReader::PyBytesReader(PyObject* obj) + : arrow::io::BufferReader(reinterpret_cast(PyBytes_AS_STRING(obj)), + PyBytes_GET_SIZE(obj)), + obj_(obj) { + Py_INCREF(obj_); +} + +PyBytesReader::~PyBytesReader() { + Py_DECREF(obj_); +} + +} // namespace pyarrow diff --git a/python/src/pyarrow/io.h b/python/src/pyarrow/io.h new file mode 100644 index 00000000000..e14aa8cfb27 --- /dev/null +++ b/python/src/pyarrow/io.h @@ -0,0 +1,97 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_IO_H +#define PYARROW_IO_H + +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" + +#include "pyarrow/config.h" +#include "pyarrow/visibility.h" + +namespace arrow { class MemoryPool; } + +namespace pyarrow { + +// A common interface to a Python file-like object. 
Must acquire GIL before +// calling any methods +class PythonFile { + public: + PythonFile(PyObject* file); + ~PythonFile(); + + arrow::Status Close(); + arrow::Status Seek(int64_t position, int whence); + arrow::Status Read(int64_t nbytes, PyObject** out); + arrow::Status Tell(int64_t* position); + arrow::Status Write(const uint8_t* data, int64_t nbytes); + + private: + PyObject* file_; +}; + +class PYARROW_EXPORT PyReadableFile : public arrow::io::ReadableFileInterface { + public: + explicit PyReadableFile(PyObject* file); + virtual ~PyReadableFile(); + + arrow::Status Close() override; + + arrow::Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + arrow::Status Read(int64_t nbytes, std::shared_ptr* out) override; + + arrow::Status GetSize(int64_t* size) override; + + arrow::Status Seek(int64_t position) override; + + arrow::Status Tell(int64_t* position) override; + + bool supports_zero_copy() const override; + + private: + std::unique_ptr file_; +}; + +class PYARROW_EXPORT PyOutputStream : public arrow::io::OutputStream { + public: + explicit PyOutputStream(PyObject* file); + virtual ~PyOutputStream(); + + arrow::Status Close() override; + arrow::Status Tell(int64_t* position) override; + arrow::Status Write(const uint8_t* data, int64_t nbytes) override; + + private: + std::unique_ptr file_; +}; + +// A zero-copy reader backed by a PyBytes object +class PYARROW_EXPORT PyBytesReader : public arrow::io::BufferReader { + public: + explicit PyBytesReader(PyObject* obj); + virtual ~PyBytesReader(); + + private: + PyObject* obj_; +}; + +// TODO(wesm): seekable output files + +} // namespace pyarrow + +#endif // PYARROW_IO_H From c3cfa3d3b3ce017776508f42fe9410bfb99cd94f Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Tue, 4 Oct 2016 14:07:59 -0700 Subject: [PATCH 154/210] ARROW-313: Build on any version of XCode Author: Christopher C. Aycock Closes #155 from chrisaycock/ARROW-313 and squashes the following commits: e47cc01 [Christopher C. Aycock] ARROW-313: Build on any version of XCode --- cpp/cmake_modules/CompilerInfo.cmake | 4 ++-- python/cmake_modules/CompilerInfo.cmake | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/cmake_modules/CompilerInfo.cmake b/cpp/cmake_modules/CompilerInfo.cmake index e1c821cca5d..02f6fd46997 100644 --- a/cpp/cmake_modules/CompilerInfo.cmake +++ b/cpp/cmake_modules/CompilerInfo.cmake @@ -29,9 +29,9 @@ elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*") string(REGEX REPLACE ".*based on LLVM ([0-9]+\\.[0.9]+).*" "\\1" COMPILER_VERSION "${COMPILER_VERSION_FULL}") -# clang on Mac OS X, XCode 7. No version replacement is done +# clang on Mac OS X, XCode 7+. No version replacement is done # because Apple no longer advertises the upstream LLVM version. -elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-70[0-9]\\..*") +elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-.*") set(COMPILER_FAMILY "clang") # gcc diff --git a/python/cmake_modules/CompilerInfo.cmake b/python/cmake_modules/CompilerInfo.cmake index 55f989a1a6c..8e85bdea96e 100644 --- a/python/cmake_modules/CompilerInfo.cmake +++ b/python/cmake_modules/CompilerInfo.cmake @@ -32,9 +32,9 @@ elseif("${COMPILER_VERSION_FULL}" MATCHES ".*based on LLVM.*") string(REGEX REPLACE ".*based on LLVM ([0-9]+\\.[0.9]+).*" "\\1" COMPILER_VERSION "${COMPILER_VERSION_FULL}") -# clang on Mac OS X, XCode 7. No version replacement is done +# clang on Mac OS X, XCode 7+. 
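The CompilerInfo.cmake change in this patch is a pure regex loosening, which a quick interpreter session makes concrete (the version strings below are representative Apple clang banners, not exhaustive; re.search mirrors CMake's unanchored MATCHES behavior):

    import re

    old = r'clang-70[0-9]\..*'   # matched only the XCode 7 builds
    new = r'clang-.*'            # matches any Apple clang build number

    xcode7 = 'Apple LLVM version 7.0.2 (clang-700.1.81)'
    xcode8 = 'Apple LLVM version 8.0.0 (clang-800.0.38)'

    assert re.search(old, xcode7) and re.search(new, xcode7)
    assert re.search(old, xcode8) is None    # old pattern rejects XCode 8
    assert re.search(new, xcode8)            # new pattern accepts it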
No version replacement is done # because Apple no longer advertises the upstream LLVM version. -elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-70[0-9]\\..*") +elseif("${COMPILER_VERSION_FULL}" MATCHES "clang-.*") set(COMPILER_FAMILY "clang") # gcc From 7fb4d24a35269db99fa112c0512d4a32c372dd74 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 4 Oct 2016 15:11:56 -0700 Subject: [PATCH 155/210] ARROW-315: finalize timestamp Author: Julien Le Dem Closes #156 from julienledem/timestamp and squashes the following commits: 0ee017f [Julien Le Dem] review feedback 86cae98 [Julien Le Dem] ARROW-315: finalize timestamp --- format/Message.fbs | 5 +- .../src/main/codegen/data/ArrowTypes.tdd | 2 +- .../templates/NullableValueVectors.java | 2 +- .../org/apache/arrow/vector/types/Types.java | 46 +++++++++++-------- .../apache/arrow/vector/pojo/TestConvert.java | 3 +- 5 files changed, 34 insertions(+), 24 deletions(-) diff --git a/format/Message.fbs b/format/Message.fbs index 3d877a2f234..d8fa65006c2 100644 --- a/format/Message.fbs +++ b/format/Message.fbs @@ -64,8 +64,11 @@ table Date { table Time { } +enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND } + +/// time from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. table Timestamp { - timezone: string; + unit: TimeUnit; } enum IntervalUnit: short { YEAR_MONTH, DAY_TIME} diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 9624fecf6aa..11ac99af424 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -66,7 +66,7 @@ }, { name: "Timestamp", - fields: [{name: "timezone", type: "String"}] + fields: [{name: "unit", type: short}] }, { name: "Interval", diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index 8f325afad39..bafa3176020 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -103,7 +103,7 @@ public final class ${className} extends BaseDataValueVector implements <#if type <#elseif minor.class == "Float8"> field = new Field(name, true, new FloatingPoint(Precision.DOUBLE), null); <#elseif minor.class == "TimeStamp"> - field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(""), null); + field = new Field(name, true, new org.apache.arrow.vector.types.pojo.ArrowType.Timestamp(org.apache.arrow.flatbuf.TimeUnit.MILLISECOND), null); <#elseif minor.class == "IntervalDay"> field = new Field(name, true, new Interval(org.apache.arrow.flatbuf.IntervalUnit.DAY_TIME), null); <#elseif minor.class == "IntervalYear"> diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java index 2ff93d4b98d..d9593673156 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/Types.java @@ -19,6 +19,7 @@ import org.apache.arrow.flatbuf.IntervalUnit; import org.apache.arrow.flatbuf.Precision; +import org.apache.arrow.flatbuf.TimeUnit; import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.FieldVector; @@ -101,7 +102,7 @@ public class Types { private static final Field UINT8_FIELD = new Field("", true, new Int(64, false), null); private static final Field DATE_FIELD 
= new Field("", true, Date.INSTANCE, null); private static final Field TIME_FIELD = new Field("", true, Time.INSTANCE, null); - private static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(""), null); + private static final Field TIMESTAMP_FIELD = new Field("", true, new Timestamp(org.apache.arrow.flatbuf.TimeUnit.MILLISECOND), null); private static final Field INTERVALDAY_FIELD = new Field("", true, new Interval(IntervalUnit.DAY_TIME), null); private static final Field INTERVALYEAR_FIELD = new Field("", true, new Interval(IntervalUnit.YEAR_MONTH), null); private static final Field FLOAT4_FIELD = new Field("", true, new FloatingPoint(Precision.SINGLE), null); @@ -143,8 +144,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new NullableMapWriter((NullableMapVector) vector); } - }, // an empty map column. Useful for conceptual setup. Children listed within here - + }, TINYINT(new Int(8, true)) { @Override public Field getField() { @@ -160,7 +160,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new TinyIntWriterImpl((NullableTinyIntVector) vector); } - }, // single byte signed integer + }, SMALLINT(new Int(16, true)) { @Override public Field getField() { @@ -176,7 +176,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new SmallIntWriterImpl((NullableSmallIntVector) vector); } - }, // two byte signed integer + }, INT(new Int(32, true)) { @Override public Field getField() { @@ -192,7 +192,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new IntWriterImpl((NullableIntVector) vector); } - }, // four byte signed integer + }, BIGINT(new Int(64, true)) { @Override public Field getField() { @@ -208,7 +208,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new BigIntWriterImpl((NullableBigIntVector) vector); } - }, // eight byte signed integer + }, DATE(Date.INSTANCE) { @Override public Field getField() { @@ -224,7 +224,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new DateWriterImpl((NullableDateVector) vector); } - }, // days since 4713bc + }, TIME(Time.INSTANCE) { @Override public Field getField() { @@ -240,8 +240,9 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new TimeWriterImpl((NullableTimeVector) vector); } - }, // time in micros before or after 2000/1/1 - TIMESTAMP(new Timestamp("")) { + }, + // time in millis from the Unix epoch, 00:00:00.000 on 1 January 1970, UTC. 
+ TIMESTAMP(new Timestamp(org.apache.arrow.flatbuf.TimeUnit.MILLISECOND)) { @Override public Field getField() { return TIMESTAMP_FIELD; @@ -289,6 +290,7 @@ public FieldWriter getNewFieldWriter(ValueVector vector) { return new IntervalYearWriterImpl((NullableIntervalYearVector) vector); } }, + // 4 byte ieee 754 FLOAT4(new FloatingPoint(Precision.SINGLE)) { @Override public Field getField() { @@ -304,7 +306,8 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new Float4WriterImpl((NullableFloat4Vector) vector); } - }, // 4 byte ieee 754 + }, + // 8 byte ieee 754 FLOAT8(new FloatingPoint(Precision.DOUBLE)) { @Override public Field getField() { @@ -320,7 +323,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new Float8WriterImpl((NullableFloat8Vector) vector); } - }, // 8 byte ieee 754 + }, BIT(Bool.INSTANCE) { @Override public Field getField() { @@ -336,7 +339,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new BitWriterImpl((NullableBitVector) vector); } - }, // single bit value (boolean) + }, VARCHAR(Utf8.INSTANCE) { @Override public Field getField() { @@ -352,7 +355,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarCharWriterImpl((NullableVarCharVector) vector); } - }, // utf8 variable length string + }, VARBINARY(Binary.INSTANCE) { @Override public Field getField() { @@ -368,7 +371,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new VarBinaryWriterImpl((NullableVarBinaryVector) vector); } - }, // variable length binary + }, DECIMAL(null) { @Override public ArrowType getType() { @@ -388,7 +391,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new DecimalWriterImpl((NullableDecimalVector) vector); } - }, // variable length binary + }, UINT1(new Int(8, false)) { @Override public Field getField() { @@ -404,7 +407,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new UInt1WriterImpl((NullableUInt1Vector) vector); } - }, // unsigned 1 byte integer + }, UINT2(new Int(16, false)) { @Override public Field getField() { @@ -420,7 +423,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new UInt2WriterImpl((NullableUInt2Vector) vector); } - }, // unsigned 2 byte integer + }, UINT4(new Int(32, false)) { @Override public Field getField() { @@ -436,7 +439,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new UInt4WriterImpl((NullableUInt4Vector) vector); } - }, // unsigned 4 byte integer + }, UINT8(new Int(64, false)) { @Override public Field getField() { @@ -452,7 +455,7 @@ public FieldVector getNewVector(String name, BufferAllocator allocator, CallBack public FieldWriter getNewFieldWriter(ValueVector vector) { return new UInt8WriterImpl((NullableUInt8Vector) vector); } - }, // unsigned 8 byte 
integer + }, LIST(List.INSTANCE) { @Override public Field getField() { @@ -576,6 +579,9 @@ public MinorType visit(FloatingPoint type) { } @Override public MinorType visit(Timestamp type) { + if (type.getUnit() != TimeUnit.MILLISECOND) { + throw new UnsupportedOperationException("Only milliseconds supported: " + type); + } return MinorType.TIMESTAMP; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java index ed740cd0f1b..3da8db298b4 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/pojo/TestConvert.java @@ -21,6 +21,7 @@ import static org.apache.arrow.flatbuf.Precision.SINGLE; import static org.junit.Assert.assertEquals; +import org.apache.arrow.flatbuf.TimeUnit; import org.apache.arrow.flatbuf.UnionMode; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.ArrowType.FloatingPoint; @@ -80,7 +81,7 @@ public void nestedSchema() { new Field("child4.1", true, Utf8.INSTANCE, null) ))); childrenBuilder.add(new Field("child5", true, new Union(UnionMode.Sparse, new int[] { MinorType.TIMESTAMP.ordinal(), MinorType.FLOAT8.ordinal() } ), ImmutableList.of( - new Field("child5.1", true, new Timestamp("UTC"), null), + new Field("child5.1", true, new Timestamp(TimeUnit.MILLISECOND), null), new Field("child5.2", true, new FloatingPoint(DOUBLE), ImmutableList.of()) ))); Schema initialSchema = new Schema(childrenBuilder.build()); From dd1b95b90e73c3b0b69bfd6284e329eea41f689d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 5 Oct 2016 16:15:16 -0700 Subject: [PATCH 156/210] ARROW-318: Revise python/README.md given recent changes in codebase Also removes a redundant LICENSE.txt from cpp/ Author: Wes McKinney Closes #157 from wesm/ARROW-318 and squashes the following commits: 9e802f2 [Wes McKinney] Update python/README.md. Remove redundant LICENSE.txt from cpp/ --- cpp/LICENSE.txt | 202 ----------------------------------------------- python/README.md | 36 ++++----- 2 files changed, 14 insertions(+), 224 deletions(-) delete mode 100644 cpp/LICENSE.txt diff --git a/cpp/LICENSE.txt b/cpp/LICENSE.txt deleted file mode 100644 index d6456956733..00000000000 --- a/cpp/LICENSE.txt +++ /dev/null @@ -1,202 +0,0 @@ - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/python/README.md b/python/README.md index bafe71b05ec..3235d18377d 100644 --- a/python/README.md +++ b/python/README.md @@ -19,25 +19,17 @@ These are the various projects that PyArrow depends on. 1. **g++ and gcc Version >= 4.8** 2. **cmake > 2.8.6** 3. **boost** -4. **Parquet-cpp** - - The preferred way to install parquet-cpp is to use conda. 
- You need to set the ``PARQUET_HOME`` environment variable to where parquet-cpp is installed. - ```bash - conda install -y --channel apache/channel/dev parquet-cpp - ``` -5. **Arrow-cpp and its dependencies*** - - The Arrow C++ library must be built with all options enabled and installed with ``ARROW_HOME`` environment variable set to - the installation location. Look at (https://github.com/apache/arrow/blob/master/cpp/README.md) for - instructions. Alternatively you could just install arrow-cpp - from conda. - ```bash - conda install arrow-cpp -c apache/channel/dev - ``` -6. **Python dependencies: numpy, pandas, cython, pytest** - -#### Install pyarrow - ```bash - python setup.py build_ext --inplace - ``` +4. **Arrow-cpp and its dependencies*** + +The Arrow C++ library must be built with all options enabled and installed with +``ARROW_HOME`` environment variable set to the installation location. Look at +(https://github.com/apache/arrow/blob/master/cpp/README.md) for instructions. + +5. **Python dependencies: numpy, pandas, cython, pytest** + +#### Build pyarrow and run the unit tests + +```bash +python setup.py build_ext --inplace +py.test pyarrow +``` From 04cf8746f3588d7bfadcc0b9c8dbe71707bdd196 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Wed, 5 Oct 2016 16:20:31 -0700 Subject: [PATCH 157/210] ARROW-321: fix arrow licenses Author: Julien Le Dem Closes #159 from julienledem/fix_licenses and squashes the following commits: 0c97810 [Julien Le Dem] fix NOTICE 1489289 [Julien Le Dem] more licenses 0eb2aeb [Julien Le Dem] more licenses 9ac1159 [Julien Le Dem] more licenses eafa0e1 [Julien Le Dem] more licenses 30b0fa1 [Julien Le Dem] more licenses bcfc75f [Julien Le Dem] add ci 51db31b [Julien Le Dem] ARROW-321: fix arrow licenses --- NOTICE.txt | 20 +++++++++++ README.md | 16 ++++++++- ci/travis_before_script_cpp.sh | 13 ++++++++ ci/travis_conda_build.sh | 12 +++++++ ci/travis_install_conda.sh | 12 +++++++ ci/travis_script_cpp.sh | 12 +++++++ ci/travis_script_java.sh | 12 +++++++ ci/travis_script_python.sh | 12 +++++++ cpp/README.md | 14 ++++++++ cpp/cmake_modules/FindGPerf.cmake | 12 +++++++ cpp/cmake_modules/san-config.cmake | 12 +++++++ cpp/conda.recipe/build.sh | 12 +++++++ cpp/conda.recipe/meta.yaml | 12 +++++++ cpp/doc/HDFS.md | 16 ++++++++- cpp/doc/Parquet.md | 16 ++++++++- cpp/setup_build_env.sh | 12 +++++++ cpp/src/arrow/io/symbols.map | 12 +++++++ cpp/src/arrow/ipc/symbols.map | 12 +++++++ cpp/src/arrow/symbols.map | 12 +++++++ cpp/thirdparty/build_thirdparty.sh | 12 +++++++ cpp/thirdparty/download_thirdparty.sh | 12 +++++++ cpp/thirdparty/set_thirdparty_env.sh | 12 +++++++ cpp/thirdparty/versions.sh | 12 +++++++ dev/release/02-source.sh | 33 ++++++++++++++++++- dev/release/README | 12 ++++++- format/File.fbs | 17 ++++++++++ format/Guidelines.md | 13 ++++++++ format/IPC.md | 14 ++++++++ format/Layout.md | 14 ++++++++ format/Message.fbs | 17 ++++++++++ format/Metadata.md | 14 ++++++++ format/README.md | 16 ++++++++- java/README.md | 14 ++++++++ java/vector/src/main/codegen/config.fmpp | 22 +++++-------- .../src/main/codegen/data/ArrowTypes.tdd | 22 +++++-------- .../main/codegen/data/ValueVectorTypes.tdd | 22 +++++-------- python/README.md | 14 ++++++++ python/asv.conf.json | 12 +++++++ python/conda.recipe/build.sh | 13 ++++++++ python/conda.recipe/meta.yaml | 12 +++++++ python/doc/Benchmarks.md | 13 ++++++++ python/doc/INSTALL.md | 16 ++++++++- python/pyarrow/config.pyx | 12 +++++++ 43 files changed, 575 insertions(+), 46 deletions(-) diff --git a/NOTICE.txt b/NOTICE.txt 
index ce6e567dcb5..679bb59e6a9 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -18,3 +18,23 @@ https://github.com/wesm/feather This product includes software from the DyND project (BSD 2-clause) https://github.com/libdynd + +This product includes software from the LLVM project + * distributed under the University of Illinois Open Source + +This product includes software from the google-lint project + * Copyright (c) 2009 Google Inc. All rights reserved. + +This product includes software from the mman-win32 project + * Copyright https://code.google.com/p/mman-win32/ + * Licensed under the MIT License; + +This product includes software from the LevelDB project + * Copyright (c) 2011 The LevelDB Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can be + * Moved from Kudu http://github.com/cloudera/kudu + +This product includes software from the CMake project + * Copyright 2001-2009 Kitware, Inc. + * Copyright 2012-2014 Continuum Analytics, Inc. + * All rights reserved. diff --git a/README.md b/README.md index 84bae78cc7f..89114ee39b4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,17 @@ + + ## Apache Arrow
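A quick aside on ARROW-321: the change is mostly mechanical, adding the ASF header to every source file so that the RAT audit wired into the release script later in this series comes back clean. As a rough illustration of how one might spot files the sweep missed — the globs and the probe string below are our own, not part of the patch:

```bash
#!/usr/bin/env bash
# Illustrative sketch (not part of ARROW-321): list shell scripts that still
# lack the ASF header text this commit adds everywhere. The globs are
# assumptions based on the repo layout shown in the diffstat above.
shopt -s nullglob
for f in ci/*.sh cpp/thirdparty/*.sh dev/release/*.sh; do
  grep -q "Licensed under the Apache License" "$f" \
    || echo "missing license header: $f"
done
```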
@@ -42,4 +56,4 @@ integrations in other projects, we'd be happy to have you involved: [1]: mailto:dev-subscribe@arrow.apache.org [2]: https://github.com/apache/arrow/tree/master/format -[3]: https://issues.apache.org/jira/browse/ARROW \ No newline at end of file +[3]: https://issues.apache.org/jira/browse/ARROW diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 2f02ef247af..acd820bbed2 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -1,5 +1,18 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + + set -ex source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh index a787df79a55..17a33ae9717 100755 --- a/ci/travis_conda_build.sh +++ b/ci/travis_conda_build.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -ex source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index e9225259e6d..ffa017cbaf5 100644 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -e if [ $TRAVIS_OS_NAME == "linux" ]; then diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index a3585507f0a..c3bd3b5f207 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -e : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} diff --git a/ci/travis_script_java.sh b/ci/travis_script_java.sh index 2d11eaeb4c5..4679f9c6daf 100755 --- a/ci/travis_script_java.sh +++ b/ci/travis_script_java.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -e JAVA_DIR=${TRAVIS_BUILD_DIR}/java diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 61c8e444361..a75ff0778bc 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -1,5 +1,17 @@ #!/usr/bin/env bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python diff --git a/cpp/README.md b/cpp/README.md index 129c5f15b15..a1c3ef28447 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -1,3 +1,17 @@ + + # Arrow C++ ## Setup Build Environment diff --git a/cpp/cmake_modules/FindGPerf.cmake b/cpp/cmake_modules/FindGPerf.cmake index e8310799c36..e90d4d00395 100644 --- a/cpp/cmake_modules/FindGPerf.cmake +++ b/cpp/cmake_modules/FindGPerf.cmake @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + # -*- cmake -*- # - Find Google perftools diff --git a/cpp/cmake_modules/san-config.cmake b/cpp/cmake_modules/san-config.cmake index b847c96657a..fe52fef12ea 100644 --- a/cpp/cmake_modules/san-config.cmake +++ b/cpp/cmake_modules/san-config.cmake @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + # Clang does not support using ASAN and TSAN simultaneously. if ("${ARROW_USE_ASAN}" AND "${ARROW_USE_TSAN}") message(SEND_ERROR "Can only enable one of ASAN or TSAN at a time") diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh index 2f2b7482667..6d7454e9272 100644 --- a/cpp/conda.recipe/build.sh +++ b/cpp/conda.recipe/build.sh @@ -1,5 +1,17 @@ #!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -e set -x diff --git a/cpp/conda.recipe/meta.yaml b/cpp/conda.recipe/meta.yaml index 75f3a8ba3d9..31f150c1f0b 100644 --- a/cpp/conda.recipe/meta.yaml +++ b/cpp/conda.recipe/meta.yaml @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + package: name: arrow-cpp version: "0.1" diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md index e0d5dfda21d..83311db2d2d 100644 --- a/cpp/doc/HDFS.md +++ b/cpp/doc/HDFS.md @@ -1,3 +1,17 @@ + + ## Using Arrow's HDFS (Apache Hadoop Distributed File System) interface ### Build requirements @@ -36,4 +50,4 @@ will set it automatically for you: ```shell export JAVA_HOME=$(/usr/libexec/java_home) -``` \ No newline at end of file +``` diff --git a/cpp/doc/Parquet.md b/cpp/doc/Parquet.md index 96471d94835..34b83e78d0a 100644 --- a/cpp/doc/Parquet.md +++ b/cpp/doc/Parquet.md @@ -1,3 +1,17 @@ + + ## Building Arrow-Parquet integration To use Arrow C++ with Parquet, you must first build the Arrow C++ libraries and @@ -16,4 +30,4 @@ make -j4 make install ``` -[1]: https://github.com/apache/parquet-cpp \ No newline at end of file +[1]: https://github.com/apache/parquet-cpp diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh index fa779fdd5c2..546216753b3 100755 --- a/cpp/setup_build_env.sh +++ b/cpp/setup_build_env.sh @@ -1,5 +1,17 @@ #!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. 
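Consolidating the cpp/doc/Parquet.md flow touched above: build and install the Arrow C++ libraries first, then build parquet-cpp against them. In the sketch below only the clone URL and the final make steps come from the document; the cmake invocation and the install prefix are illustrative assumptions:

```bash
#!/usr/bin/env bash
# Sketch of the Arrow/Parquet integration build described in cpp/doc/Parquet.md.
set -e
export ARROW_HOME=$HOME/local                    # assumed arrow-cpp install prefix
git clone https://github.com/apache/parquet-cpp  # URL from the doc
cd parquet-cpp
cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .       # cmake step is our assumption
make -j4                                         # final steps shown in the doc
make install
```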
+ SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) ./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } diff --git a/cpp/src/arrow/io/symbols.map b/cpp/src/arrow/io/symbols.map index b4ad98cd7f2..1e87caef9c8 100644 --- a/cpp/src/arrow/io/symbols.map +++ b/cpp/src/arrow/io/symbols.map @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + { # Symbols marked as 'local' are not exported by the DSO and thus may not # be used by client applications. diff --git a/cpp/src/arrow/ipc/symbols.map b/cpp/src/arrow/ipc/symbols.map index b4ad98cd7f2..1e87caef9c8 100644 --- a/cpp/src/arrow/ipc/symbols.map +++ b/cpp/src/arrow/ipc/symbols.map @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + { # Symbols marked as 'local' are not exported by the DSO and thus may not # be used by client applications. diff --git a/cpp/src/arrow/symbols.map b/cpp/src/arrow/symbols.map index 2ca8d730610..cc8c9ba3c94 100644 --- a/cpp/src/arrow/symbols.map +++ b/cpp/src/arrow/symbols.map @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + { # Symbols marked as 'local' are not exported by the DSO and thus may not # be used by client applications. diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh index 6cc776d0904..5011e29c01a 100755 --- a/cpp/thirdparty/build_thirdparty.sh +++ b/cpp/thirdparty/build_thirdparty.sh @@ -1,5 +1,17 @@ #!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -x set -e TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh index d299afc1522..b50e7bc06a1 100755 --- a/cpp/thirdparty/download_thirdparty.sh +++ b/cpp/thirdparty/download_thirdparty.sh @@ -1,5 +1,17 @@ #!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -x set -e diff --git a/cpp/thirdparty/set_thirdparty_env.sh b/cpp/thirdparty/set_thirdparty_env.sh index 7e9531cd508..135972ee9bd 100755 --- a/cpp/thirdparty/set_thirdparty_env.sh +++ b/cpp/thirdparty/set_thirdparty_env.sh @@ -1,5 +1,17 @@ #!/usr/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) source $SOURCE_DIR/versions.sh diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh index cb455b4eadd..a7b21e19fcc 100755 --- a/cpp/thirdparty/versions.sh +++ b/cpp/thirdparty/versions.sh @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. 
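Worth noting about the thirdparty scripts above: cpp/setup_build_env.sh and cpp/thirdparty/set_thirdparty_env.sh both open with the same self-locating idiom, so versions.sh can be sourced no matter what the caller's working directory is. A minimal standalone sketch of that idiom:

```bash
#!/usr/bin/env bash
# The self-locating idiom used by the cpp/thirdparty scripts above: resolve
# the directory containing this script, then address sibling files through it.
SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd)
echo "script directory: ${SOURCE_DIR}"
# set_thirdparty_env.sh then does:  source "${SOURCE_DIR}/versions.sh"
```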
+
 GTEST_VERSION=1.7.0
 GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz"
 GTEST_BASEDIR=googletest-release-$GTEST_VERSION
diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh
index f44692d5e9d..1bbe2e92753 100644
--- a/dev/release/02-source.sh
+++ b/dev/release/02-source.sh
@@ -56,12 +56,43 @@ tarball=$tag.tar.gz
 # archive (identical hashes) using the scm tag
 git archive $release_hash --prefix $tag/ -o $tarball
 
+# download apache rat
+curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.12/apache-rat-0.12.jar > apache-rat-0.12.jar
+
+RAT="java -jar apache-rat-0.12.jar -d "
+
+# generate the rat report
+$RAT $tarball \
+  -e ".*" \
+  -e mman.h \
+  -e "*_generated.h" \
+  -e random.h \
+  -e status.cc \
+  -e status.h \
+  -e asan_symbolize.py \
+  -e cpplint.py \
+  -e FindPythonLibsNew.cmake \
+  -e pax_global_header \
+  -e MANIFEST.in \
+  -e __init__.pxd \
+  -e __init__.py \
+  -e requirements.txt \
+  > rat.txt
+UNAPPROVED=`cat rat.txt | grep "Unknown Licenses" | head -n 1 | cut -d " " -f 1`
+
+if [ "0" -eq "${UNAPPROVED}" ]; then
+  echo "No unapproved licenses"
+else
+  echo "${UNAPPROVED} unapproved licenses. Check rat report: rat.txt"
+  exit 1
+fi
+
 # sign the archive
 gpg --armor --output ${tarball}.asc --detach-sig $tarball
 gpg --print-md MD5 $tarball > ${tarball}.md5
 shasum $tarball > ${tarball}.sha
 
-# check out the parquet RC folder
+# check out the arrow RC folder
 svn co --depth=empty https://dist.apache.org/repos/dist/dev/arrow tmp
 
 # add the release candidate for the tag
diff --git a/dev/release/README b/dev/release/README
index 4fcc5d9728c..07402030bf6 100644
--- a/dev/release/README
+++ b/dev/release/README
@@ -3,6 +3,9 @@ requirements:
 - a gpg key to sign the artifacts
 
 to release, run the following (replace 0.1.0 with version to release):
+
+# create a release branch
+git co -b release-0_1_0
 # prepare release v 0.1.0 (run tests, sign artifacts). Next version will be 0.1.1-SNAPSHOT
 dev/release/00-prepare.sh 0.1.0 0.1.1
 # tag and push to maven repo (repo will have to be finalized separately)
@@ -11,5 +14,12 @@ dev/release/01-perform.sh
 dev/release/02-source.sh 0.1.0 0
 
 useful commands:
-to set the mvn version in the poms
+- to set the mvn version in the poms
 mvn versions:set -DnewVersion=0.1-SNAPSHOT
+- reset your workspace
+git reset --hard
+- set up the gpg agent
+eval $(gpg-agent --daemon --allow-preset-passphrase)
+gpg --use-agent -s LICENSE.txt
+- delete the tag locally
+git tag -d apache-arrow-0.1.0
diff --git a/format/File.fbs b/format/File.fbs
index a29bbc694bc..f28dc204d58 100644
--- a/format/File.fbs
+++ b/format/File.fbs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
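Stepping back to the 02-source.sh change above: the release script now fails whenever RAT reports unknown licenses. A self-contained sketch of that gate's logic, runnable without downloading RAT — the one-line report below is fabricated for the demo, but the grep/cut extraction mirrors the script:

```bash
#!/usr/bin/env bash
# Standalone sketch of the license gate added to dev/release/02-source.sh:
# take the leading count from RAT's "N Unknown Licenses" summary line and
# fail when it is non-zero. The sample rat.txt content is fabricated.
printf '2 Unknown Licenses\n' > rat.txt

UNAPPROVED=$(grep "Unknown Licenses" rat.txt | head -n 1 | cut -d " " -f 1)

if [ "${UNAPPROVED}" -eq 0 ]; then
  echo "No unapproved licenses"
else
  echo "${UNAPPROVED} unapproved licenses. Check rat report: rat.txt"
  exit 1
fi
```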
+
 include "Message.fbs";
 
 namespace org.apache.arrow.flatbuf;
diff --git a/format/Guidelines.md b/format/Guidelines.md
index 14f10578504..c75da9f98be 100644
--- a/format/Guidelines.md
+++ b/format/Guidelines.md
@@ -1,3 +1,16 @@
+
 # Implementation guidelines
 
 An execution engine (or framework, or UDF executor, or storage engine, etc.) can implement only a subset of the Arrow spec and/or extend it, given the following constraints:
diff --git a/format/IPC.md b/format/IPC.md
index 1f39e762ab7..3f78126ef55 100644
--- a/format/IPC.md
+++ b/format/IPC.md
@@ -1,3 +1,17 @@
+
+
 # Interprocess messaging / communication (IPC)
 
 ## File format
diff --git a/format/Layout.md b/format/Layout.md
index a953930e172..251af9dd8a1 100644
--- a/format/Layout.md
+++ b/format/Layout.md
@@ -1,3 +1,17 @@
+
+
 # Arrow: Physical memory layout
 
 ## Definitions / Terminology
diff --git a/format/Message.fbs b/format/Message.fbs
index d8fa65006c2..2ec9fd1817b 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -1,3 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
 namespace org.apache.arrow.flatbuf;
 
 enum MetadataVersion:short {
diff --git a/format/Metadata.md b/format/Metadata.md
index e227b8d4afd..fa5f623ac97 100644
--- a/format/Metadata.md
+++ b/format/Metadata.md
@@ -1,3 +1,17 @@
+
+
 # Metadata: Logical types, schemas, data headers
 
 This is documentation for the Arrow metadata specification, which enables
diff --git a/format/README.md b/format/README.md
index 78e15207ee9..048badb1221 100644
--- a/format/README.md
+++ b/format/README.md
@@ -1,3 +1,17 @@
+
+
 ## Arrow specification documents
 
 > **Work-in-progress specification documents**. These are discussion documents
@@ -21,4 +35,4 @@ couple related pieces of information:
 schema, and enable a system to send and receive Arrow row batches in a form
 that can be precisely disassembled or reconstructed.
 
-[1]: http://github.com/google/flatbuffers
\ No newline at end of file
+[1]: http://github.com/google/flatbuffers
diff --git a/java/README.md b/java/README.md
index 5e1d30d9fd2..a57e35afbbd 100644
--- a/java/README.md
+++ b/java/README.md
@@ -1,3 +1,17 @@
+
+
 # Arrow Java
 
 ## Setup Build Environment
diff --git a/java/vector/src/main/codegen/config.fmpp b/java/vector/src/main/codegen/config.fmpp
index 6d92ba830ee..92881dc914f 100644
--- a/java/vector/src/main/codegen/config.fmpp
+++ b/java/vector/src/main/codegen/config.fmpp
@@ -1,18 +1,14 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.
You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http:# www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. data: { # TODO: Rename to ~valueVectorModesAndTypes for clarity. diff --git a/java/vector/src/main/codegen/data/ArrowTypes.tdd b/java/vector/src/main/codegen/data/ArrowTypes.tdd index 11ac99af424..c0b942bc359 100644 --- a/java/vector/src/main/codegen/data/ArrowTypes.tdd +++ b/java/vector/src/main/codegen/data/ArrowTypes.tdd @@ -1,18 +1,14 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http:# www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. { types: [ diff --git a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd index 421dd7ef92e..f7790bb3d6d 100644 --- a/java/vector/src/main/codegen/data/ValueVectorTypes.tdd +++ b/java/vector/src/main/codegen/data/ValueVectorTypes.tdd @@ -1,18 +1,14 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # -# http:# www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. { modes: [ diff --git a/python/README.md b/python/README.md index 3235d18377d..6febcbcbcbf 100644 --- a/python/README.md +++ b/python/README.md @@ -1,3 +1,17 @@ + + ## Python library for Apache Arrow This library provides a Pythonic API wrapper for the reference Arrow C++ diff --git a/python/asv.conf.json b/python/asv.conf.json index 96beba64c2e..0c059fd79c1 100644 --- a/python/asv.conf.json +++ b/python/asv.conf.json @@ -1,3 +1,15 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. See accompanying LICENSE file. + { // The version of the config file format. Do not change, unless // you know what you are doing. diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh index f32710073c7..fafe71e7adb 100644 --- a/python/conda.recipe/build.sh +++ b/python/conda.recipe/build.sh @@ -1,4 +1,17 @@ #!/bin/bash + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + set -ex # Build dependency diff --git a/python/conda.recipe/meta.yaml b/python/conda.recipe/meta.yaml index 98ae4141e3b..b37dfde0a0d 100644 --- a/python/conda.recipe/meta.yaml +++ b/python/conda.recipe/meta.yaml @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + package: name: pyarrow version: "0.1" diff --git a/python/doc/Benchmarks.md b/python/doc/Benchmarks.md index 8edfb6209e4..1c368018582 100644 --- a/python/doc/Benchmarks.md +++ b/python/doc/Benchmarks.md @@ -1,3 +1,16 @@ + ## Benchmark Requirements The benchmarks are run using [asv][1] which is also their only requirement. diff --git a/python/doc/INSTALL.md b/python/doc/INSTALL.md index d30a03046ed..81eed565d91 100644 --- a/python/doc/INSTALL.md +++ b/python/doc/INSTALL.md @@ -1,3 +1,17 @@ + + ## Building pyarrow (Apache Arrow Python library) First, clone the master git repository: @@ -84,4 +98,4 @@ Out[2]: ] ``` -[1]: https://cmake.org/ \ No newline at end of file +[1]: https://cmake.org/ diff --git a/python/pyarrow/config.pyx b/python/pyarrow/config.pyx index 1047a472fe3..778c15a5e65 100644 --- a/python/pyarrow/config.pyx +++ b/python/pyarrow/config.pyx @@ -1,3 +1,15 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. 
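Tying together the python/README.md and python/doc/INSTALL.md changes above, the documented pyarrow build boils down to a few commands. A sketch, where the install prefix and checkout path are illustrative assumptions and the build/test commands come from the docs:

```bash
#!/usr/bin/env bash
# pyarrow build-and-test flow per the README/INSTALL docs in this series.
set -e
export ARROW_HOME=$HOME/local   # assumed prefix where arrow-cpp was installed
cd arrow/python                 # illustrative checkout location
python setup.py build_ext --inplace
py.test pyarrow
```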
+
 # cython: profile=False
 # distutils: language = c++
 # cython: embedsignature = True

From f1a4bd176bc2139ba785522200d7630408328911 Mon Sep 17 00:00:00 2001
From: adeneche
Date: Wed, 5 Oct 2016 20:19:42 -0700
Subject: [PATCH 158/210] ARROW-320: ComplexCopier.copy(FieldReader,
 FieldWriter) should not start a list if reader is not set

Author: adeneche

Closes #160 from adeneche/ARROW-320 and squashes the following commits:

5c6ebc5 [adeneche] ARROW-320: ComplexCopier.copy(FieldReader, FieldWriter) should not start a list if reader is not set
---
 .../main/codegen/templates/ComplexCopier.java | 14 +--
 .../apache/arrow/vector/TestListVector.java   | 87 +++++++++++++++++++
 2 files changed, 95 insertions(+), 6 deletions(-)
 create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java

diff --git a/java/vector/src/main/codegen/templates/ComplexCopier.java b/java/vector/src/main/codegen/templates/ComplexCopier.java
index a5756a47ad7..0dffe5e30be 100644
--- a/java/vector/src/main/codegen/templates/ComplexCopier.java
+++ b/java/vector/src/main/codegen/templates/ComplexCopier.java
@@ -47,23 +47,25 @@ private static void writeValue(FieldReader reader, FieldWriter writer) {
   switch (mt) {
 
   case LIST:
-    writer.startList();
-    while (reader.next()) {
-      writeValue(reader.reader(), getListWriterForReader(reader.reader(), writer));
+    if (reader.isSet()) {
+      writer.startList();
+      while (reader.next()) {
+        writeValue(reader.reader(), getListWriterForReader(reader.reader(), writer));
+      }
+      writer.endList();
     }
-    writer.endList();
     break;
   case MAP:
-    writer.start();
     if (reader.isSet()) {
+      writer.start();
       for(String name : reader){
         FieldReader childReader = reader.reader(name);
         if(childReader.isSet()){
           writeValue(childReader, getMapWriterForReader(childReader, writer, name));
         }
       }
+      writer.end();
     }
-    writer.end();
     break;
 <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first />
 <#assign fields = minor.fields!type.fields />
diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
new file mode 100644
index 00000000000..bb710336555
--- /dev/null
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +package org.apache.arrow.vector; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.impl.ComplexCopier; +import org.apache.arrow.vector.complex.impl.UnionListReader; +import org.apache.arrow.vector.complex.impl.UnionListWriter; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.types.pojo.Field; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class TestListVector { + private final static String EMPTY_SCHEMA_PATH = ""; + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new DirtyRootAllocator(Long.MAX_VALUE, (byte) 100); + } + + @After + public void terminate() throws Exception { + allocator.close(); + } + + @Test + public void testCopyFrom() throws Exception { + try (ListVector inVector = new ListVector("input", allocator, null); + ListVector outVector = new ListVector("output", allocator, null)) { + UnionListWriter writer = inVector.getWriter(); + writer.allocate(); + + // populate input vector with the following records + // [1, 2, 3] + // null + // [] + writer.setPosition(0); // optional + writer.startList(); + writer.bigInt().writeBigInt(1); + writer.bigInt().writeBigInt(2); + writer.bigInt().writeBigInt(3); + writer.endList(); + + writer.setPosition(2); + writer.startList(); + writer.endList(); + + writer.setValueCount(3); + + // copy values from input to output + outVector.allocateNew(); + for (int i = 0; i < 3; i++) { + outVector.copyFrom(i, i, inVector); + } + outVector.getMutator().setValueCount(3); + + // assert the output vector is correct + FieldReader reader = outVector.getReader(); + Assert.assertTrue("shouldn't be null", reader.isSet()); + reader.setPosition(1); + Assert.assertFalse("should be null", reader.isSet()); + reader.setPosition(2); + Assert.assertTrue("shouldn't be null", reader.isSet()); + } + } +} From 3f85cee51e45165c4be8d251849d2b3765b9b4dd Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 7 Oct 2016 12:12:58 -0700 Subject: [PATCH 159/210] ARROW-324: Update arrow metadata diagram Author: Julien Le Dem Closes #161 from julienledem/diagram and squashes the following commits: f018cf5 [Julien Le Dem] ARROW-324: Update arrow metadata diagram --- format/Arrow.graffle | Bin 3646 -> 4142 bytes format/Arrow.png | Bin 86598 -> 112671 bytes format/Metadata.md | 10 ++++++++++ 3 files changed, 10 insertions(+) diff --git a/format/Arrow.graffle b/format/Arrow.graffle index 453e85025d8d324310e27daa3692a98e3bceb332..f4eead9223160f6e16f97bb8e9d6ed6a05e290e6 100644 GIT binary patch literal 4142 zcmV+}5Yg`+iwFP!000030PS6EcbhmC{yg~=Y`^U8nbeVl0Fmi*j}xbpbkZafr|q`m zIXyDSHd_oX;HH_j|9vH}4aPWjQsX#SZcqHel@Q`S&wY`Qp8e-b-&H=dAas21*C((B zo+!+-eB1H5zdm_=_2Mac@}K8B&;HRkKfHQ(aik1fCybPf*9WI3hsu+udwcc4z-4=T zjjM)oaeC6cQg|JEdq-zalqbC?8tm`wU0+|1+>#~nf`2VB$APa zx>!UcOXYBste=JjQcWsE7KB9=WjM6pXu|NQocLvdv(|8dNZwpv#pAyn*(a`Dkky) zqwaH+t6%(KEXuzKeQwZbra=fm{V5<%fu=xxUnTnhD?bDNnrWS=#1Hc2Uk+V|dCDnM z8myl$({RFpOK+MMb!Sct6~|M4Rm5Aq*bB#HS2u&1B9!AIFB|J(!?%Wg=0$nHPI?c` zbl*saL%}Hp0H0RM9Fj6n(jcg5CdLN8p#UQU@vfq(Y7JvkBL;znfeFzC*a+lDz%64?tsbfV176Tinw%=ywZ2r_kyb*j|*S=QLN!vt;wDe1=n#ys(%?G71#=TfZ!4eVC*MbiQQ3n7Z%bRymm!tomc;!PM;@n*&% z7{?+Sh(&m5@utIAL^rsIsMiTR8Vv*Lwmw8Z4#4`@Se)`@7F zTfLDv6@SJ*EoJ%+{XP%LzbFxRu)_Tk&EI@{V&Pk3119JMj~t!4DbXp>xgvC41Ww-( znXdBjxn=IDdLQ>xCk5C(52Bvm<+^`!%9I|b_=@CX-?o4hApm4ofdmu~ zVXlIFbe(kAQ<&|rrQK)^bN)3Es%i5)XWiuflL^VK$7ek63VGH)=2Qzj<&qima;h8P 
zhwH`){qWe%n;_0mqc}d(R%hpn@oL=N7MG)WGHK}g zX6QN5zVhymMqKj95GimR5QS^vl2NARiKKXVb$K4) zL{!sE{vz1G5J6!60I#TWbY>e^Lp3F9l)xI;+>W{7x?VHY1rCO$$@NmUf=Y~)u3Km( znj_Xl(~&SpB~K5>a=ny_PY-|Xar)bc`Qd%vX8THWAo})OgL&aYK3o>JNG!2-Vu$+* zhrpkIJ#=dx{29`OepEQ2pVFUB!MdcMYST}XyLVQUkbuOOIeGePP59CzzN(EcvFU|G zSGod%rmb(WK`hr7*@E>&!ie?<8?@SB^XTyWtnsb&PojN0tDl`G-`BR_-~s>|YP!Hr z`O&Lff~4w8kTA>XQ+FdQ+#ukM5t~xQ@nUNy6nkOq#~(KWd2o%3|6nAMS8e1S_`WM0 z%L%%4vc^R_uw1mG>RH*SLnOLVbZN#K(FG*BDsHZI^9IZbVZ-jTmWy(m*w?NoAu^j__B1 zN0g1gVB3k_I(F2H(<92i$S%}(RkW+)f46p=@I&ZyJ!bDKZ5~)M>d75aHfZLas;_Am z-pbE?D|;8-7mn*PyW!Bj@7a%MM_x`SHKG#(A=M-R0n_AaKulo@b>orm|EU%YvI}~- zbC2i76UQ*jPhlNV2dH5kVU8%>goX;RX{s46NeW;35fz1V6DsF$=#1WMG!CT?@~}Qg z>Va52$F&3I$<0bvX2rpMtoYhiBS3_3{x)Y;O)Px^{vF*7XqYuK^CF`OSaZaes=Y$a z$}P^s15VgKgc{d!v~=yyq+%&+IeBqbY!t9oNgb4^^Lh#4#mv*aBWE(rW}8 zI)Mg(I`?-4+90d}#5o#6gt>CWX*En9dn|!g6-z)FOUPK_5ylc7hq-nrBZ*Z-5(wWX zl7MT<2dWy}Rk*%O7yJGx!AIyXBUP=bnHN_Qt_ktLn2M+LN7uPCjr6jeX*ngN>O
[base85-encoded binary patch payload omitted: not human-readable]
zaC5G1@2une2oQ4HW|SDy?Umn@2VS2kj$AhK1fQse@ljE)Kl;29wJPCXUSuj0+-JPu z#pnL@4H`JWYXhZ=Iq|i=7nms7ntAwM+Ms;0b%f_*x^XpsGtWpgful0oU>-fIuu9N(0(dc6$QE9*Q#}eB^2=7c4 zu-tYo5(X zcF8)En=0(rc_zhq;(^HV2(_mS%m0|%p!wKGs|#(9{^vevF~rfj%DKw#|5(pcFoF_y ztQY+Enf2uG%#Y^HuAWZxCQK)DHDT1rrX`cBJGDp@+?INnWT30YtiM}v^5!J)?L7i2 z4a9TuiL9##J}CBD9%CvnXBfQwFF0?TUwwdXP+~M1x<#MnTz*k8I^^9D4{D|Ibmj_R z2Pn@}qA5w;k8OaYkuVM@NnY9f@Il6X?xkA-2TkJC+_rbNPGu(8J~ZO-&#q`9o`s~+ z1JVcL!e0ck1eU!`y74ZYg?TFjF044P=t1?Gq-Fhek;$-X_iqTUo%U%I2A+@QgP$j1 z&SOZ(<{QR0^WLt;9YAWlV2~V;F7mv{0PfWH{e1}`L$OxGC_5n2)SAva!%ytLnkfF%$*HN1SJ~&zz*2tfb(0!NF z3v2~skM#u9#32P_3q+3`ufh|j|M6=lX+dKf4(E<t@P!++G@4i0Bc9Wmw~)+U@Me4K0OfzoRwNEt21* z18F$`F&o77&NqBh5R1crLR#<7=bV>!!0bl|oSl$g1i?gc)4T<6%Z`&Xi&qx~R1)l|+sf6AX@9#HW z-&hi?csSBFEMUCSA#^bZ3YC`Ivc#>XsB}ZzxeY4ZREVGYQv&L{e8hA zM~{nc$e>8%P|>i zsB!riqW71HP8iBl&13ltmCMT`*f(BF=mU$QcD)}3E1nRyDL$MUCWzhp9^OM_Ut#{g zUXdS0VkER`9A%N`_ULA!SH)&{mWRF`O0s=xHu7-q{qo#s#{&4SMw!|=L}(qKfnfo>zM$;q6X?pM--DmrY{`{5 zVd)85bPr*w-dEmi0v)ETVa*eE^|{V5rc7pBBA)}aQbQ>8qKGFd&Y*@a5v`qhxVZ>h z1vZEu2X8SSW)quq>#0FFhMiKZmU2xW09k6ee$5L}Jw~Nt6Kz=3qm&+NTyMd?a76+A z^t2Il5pdneR-k6h1xrf3Ov2%4_&~x=*#JH5C7K%7Q~KE*@*FQ6;(#oYCP+0!7OiMh z^2c=9d(CyU>?vojMMhZ=scoM4q($-OG%ejs| ztTJyVmyIs{-p{?`xa?Gx^-Lt{cdg~;cR~vc{>K?%W79i$0P)p;W_i9<+-7~pe)?f=I>4w}spKqc${mm%^tfY~wyzdYTX;j!4 zKsu7`GqR>L?;CYtQ$nXof&1QIjJ$(G8=!+aHs5~x`^<^Er*IN@-d&sac>=VSF$m@R;kb^C{;D7( zk}(GuXIx_W-l`h}$-i?2QfWfg3UG{dJ3?rXcWxq)fV9?b^yUn#>wI}-Ge{dTs6_B` z58`lEr6eui7pv^>4W*oA`v6A3J2l3U%HR1sPk#)(qbfTj<;%f&JahX%Z-%4w4c}In zOK6;LhGLi8#`hL0e%u4v5PI(!EP4Iw?` zKu%XS-A8RbXUW_1YrNv_60-tFM-JHig!gg$d>XKQe`on1m+YKx?2~VA1(E696p#aTe2fQ0{G|^Xy?>`}`sf3^vRJvm&)=Z@SmN>b5j+0k3Jy}$TalJq^+J~fH>A_5^}u=&-Y z@7k+4>?fw(r7S|Dlls&rKixHdtKz^h&_AS zvVL$0&VN}EQlY&JIis_w75^V-Z-pXMA;EueNiVez)LyrT}Q-;Puv8wLN>?fDaxjkn9*eNH?%AQLrrY5vR+o0^Ru z`CKscn#VMpy|429P_`jrAmIQHs44P!cGBOy^Lbf__osLh{i+Y=1)6QJ8aG{lczuO}?ux3fHcsdm8ef;*%&-fu>cD)&8c_W54DnvE3!UK(G;`t?kgBkx{% zJJQS7{;BszL2Z!-CoB6bq%?SW8iA~rFofMFm*0mkKKaQ~-4ebqlMW{VvQboC^2PaI zGZTN-<=r`xYsvA~fJx3Lfa_12+cIFsNOaV3Ui$Uhm~teDDW{Zh^w*WLi8lC*c|_B> zIO9af)$7-Me^vAuNXCGdCPypk9##xD#AC~h!eW8dosa!eULHHYel07$Nk=KWgiTX`&A0#0sSR`3q#tS@XJ8SnEswpY1U^+>`_#xAns zPTEoSHxQ#h|0m+d9E5%dic8c|4s)SW_>C9?0!vuQZ7&OFY%1twN?O3`3newUcElkL za}^xf&(;n^I}TgE=Jm*A+p#JukGJ?CeXn$_*P($;s-D%eELS#j*U7%`O$O-PaP_F@ zN8i2+dyXL99DAJTAFy^`9nkXa~&KuB+l}fWgz~XXKeMmvmBxx5AUI9 zeCup|A*npN>2ySMC9^H&clH&wghWPe*0Dy-V}-{SL8mMK9HrjpMqL@bbTJ z3dHsW91?IjWH$LqeG_OwnVL}p#69SU)ON7&chw|ldPm4=k|jG!*orym2ZzHc%2pX1 zV?vTqGa`fjZUuTMMXhNH?Ebqwh>h|P_JXs|+Z^GR3 zWT(R0xxav|xhXSkI|~o?xdnclh*#W(J#I`U@L{H*H#zR~mwEi3WcIcoe2wz@P9qk& zrAX~g_U1Wty|3G4gLEWYd{twkKdTMkIFJdI>mf z%j45VJJfyNl_5elbUvImWliuag9LnV@QG;~{*F1O2#5n-GxVQlW&S3hqS3{vhj0qn z=|EzTxUNTtZ(o6}2>IQsN^nfuvFlIETeu+V!2E(wh%4G_L9BN`=fHfE)0obkfy@&Q zw`lDZL8Qw8U3u4key%Ctq7b!8;BVC_un*_q_^~gC03NQrRGorerA-K;m!bUsuPdL! 
zt1V0tN0RYHzc>K6%r)n{Mh1T%czg$K1O!9o zHW;%vgy|t?mFx(HJ;J5$4Y^HdK+iG&J>b+>v^Y`VXNI!EHg`j>Bd`m&oG$8UhqyTS z#In4S0p&eCzc3g2GsB>E>_+M168gq-ic`7Pfc9R3ToVr5Iw@Mw40$zPg}Y0NQzUp*Dt4#@06csk5)FYCmd8yCbBdv=r$e1DO7a`(6Q6$Mc= zOsoe)moWl^loeUvs)rIB!i>C1(M_(+rT7W?PHAxo4FZwoubBxNp=F;DF=@0IyUb4RiXyYI_A5O<#(894^U7zvZ&DxQOq2gGE znJzbWIdi%!Sbx>fo7aOcOp`YSK9O5hVHINz@6fu2W+AxptHC)V#Jt?(`Tu-9T*_uU0WJxyx{(q(tMFysa6j~mBG(z&}bOaCP3U|#fw zbB{5Pm&kK3zE^S=1T(;r|NSxZLcZ+X%HnR)Q1!CEJz!WIt6R9pWZFhaQWaR6nl{nB zILE=4ASFw*3}L#O(NPN4*2XGyJV&Mp@u}rp>4)+34c@tP*~W83cH0CH#^4*GaLEZiBVdvfRlF1f5KZM|CASy@AD?a^#T&F$*`- zbRrx_#d;VO!ye#Vxtj~{(aXFiPfyC>`hUMRUZ!fah++bIKcEORW11lVp2fYr+=}d{N^N z`vA$_WJemoPnrbDdx~fgA-z{T!<&?knQzvb0vAVsPcnV#H&9r~Bl5Ig4?-F@Jf1_E z@Cna+Q$;2Xa}NdOtQDlFyZ;#K@!c751Sdi5K|a~$2JtkoHO*PYsfr`;4a>JqNFjKeGL5{`<%`kc|5-l_DVfe>OD>-IvI$Pgo=cPBOjz{yWnd6~9|t+T#}*lZinR zrghB%Q7_UBfJR#Udtm7_QImkx4_tR~(-q;umC6g}Z+1dG)a@w^D$Tav}h^7 zy*-2c5;_}7yzCY%i-8jY%8UGU52Wi51K6^fTi@8-7EL(#WdVI1a7G%#X;^GFju|Kp zSpbuYRdpVMG{%<==UT)6zWWv`_5|Mnss@3cQ21DnD_RcLa64>@5gbJH3+mkq`kaiT0MZvkzALCjNOj@ z2g3CY!*fdB zX6F5ao~;Jr4UI=sZ^;8VmX0pcnl7WaSYR)Rc%eQCZt9=xQYG%yhx}0Es@!>t@0H_~WWE3rYWo-U5AA(B5^;qwHeokC(f;m^CJ60bZ zPGmECIo!qc0IN0h;Vg6wLt(6En5GqEczEA*Shd zc6j|i=a-D#qDOmqQ-Rpo8YIW9Ocf2+l0j^$r61YBnTvPKD};a#(xk1Few)p_W+sb$ zU7^P5c&!jN8dO7~%MeHgq}8wBA&Br`u9(MCa(A9bgV}V&QnHY=tItcJa6wBao+Px1 z{lmRhiKPn}?oVT@@r9j0p=hV;dv$OZ!Nw#mI=CKZE&0w(RPP22Lcbl?J$v);nMt%VDc#6v(&kB8 z(<$$M8#>0aO?>R%P~~58)!#EkT-$Ip-i&{-!vCqOb%4V5m%aDz1*zin!8%2=^)R&r z9p{5%ZSlFo|J3Xm;VdHf}q;U&~~WX~p!t|J{Jo`*+1^>|2?>Q5!M_h!NM*g6i!}4(A-UcJV5q}H>iUS1>9SW+VQ>OG37qZnwbAIA} z=shA3Nn1S{nT%k64m&%=>;#EC%?jVsfrnhvg&^u8kx!T>Uc;{eqto{9utXgL4#^Ig z9P%=!v*30$9wAB)0G3eUL%6k5T$=uOqoMQ*=FNyeEeo+}9+B5x&;mIKz7djG`@SpX z8N$`oT+mLd>%~r3hSkUG*y$enel+LqWb(em2(yZ7Z&NXxrl=}lv0bV1jAJ;tY-s@P*x7m}z)4EWA(iy9TPe)OC{0**i< z7~Y6XsqC_#-nC4 z@6RSTd*Q4R{{ad8EZ!Vxx?l)Cx8WzU=XDzg4M$rJAM z!#&-*ym5;cNu41-PbQv`_=8O?vS{%FJhF;Grn<^%p@r-|T_nMkDi8FG96`PldkQOk&B{q?GD_dj1a{?%m! zq>-3VKV?Ct9+1{UY>{`{RzLl(aqIzqnB?PSJ?p=Zd`B5Iqy5R^3#K`k6Ay>mx<@I? zHkl-AOz&iJ0rG>EgL{h!F0!x4Y;bZf80%nbtPB|~)4s8%%f4yx!Jc8Tr4~yB<7jk; zcV-LzT{Gq|=|}lSS$vY70F_rIw}sefOixmP)4YpgH@-BKFZ&cAhS*2N{&Jik4tNLXI)W&$KRqLVQa8CC zTNrJadCGz^J#4p{#@fqWQY5H1rdkP5XuZjCWobGGaZN=M?CG3-8S4m7;GYE*3Zm?r0?h*dj@mnCvq3^dY<%|i-fl;rMKHnVLFao^7q*<3hocb6_jaHAC z>9Fs?6C8=`W4o<0%nqR6X3sNCs)3(SeWb%z+7|$(`RlIepAiPimml!q;Yr;c&BZXh z&*B%aR%dTp3QS8rrz}5LfQ)oV0{sYpnQily{YC)~^Kz|G0R$4CzTXhW6hKK0%1;KT z4#AmyxrH1eXXA10T^}Omy+=^HXA^cvgRR|8dyS44nKi)Vh^Ydy^|qP@tYI+n*(Kc4 z2=NWxx39h1^Lo6kL2ZTv_w@;qDou@B?S$m)WYHaj-7J$sWiTdvHhqZ(~_cfMze9o!k0WRvC%m@YLH#o?(MrV?^+h;51$3h=YUp-2M%$|KTu&E; z&rRoSZA8K?Hv?%qk6b_&Prp@M_8)&rgh=XsA64)kB@M~T_H?Gt>YsE)JG0RxqG+97 zI~)X_n?-p;({&D>j$7dTG*jI$QJs6H#~!vl?G5y$;R5Z?l5*AA!Hn;Qa%dSGAo8&6 zvZwOMdlOWAKR)0+&SyC%*R((^H=RRONL*;09>pPEbwRrqHRl9>#$)z1-u7XQJhgG8S(4RS91-}9#(K&moxLwHBD)jYyWEN3k`l=6VbW2@f5 z;t2XomHNFLqC7p2LDRuJC`TJwFHiCOJ3I@^B$Lp-*L=MAc4#mt4fbGqTDa9R2(6e! 
z-n`96jg-rQcXvNx*a-WpeVkS0a&Aa z3udo55rI#e8T$Gf0BCh7?Ip*exb?m6t~g&+8&)UxWbmF;tNY| zu0h0qKre{0DNm8@&%H!r#0_@3pj75nIW_gLlFnt>teBnqbRY zp8I@z;#(=|2yY`^vbF6c{$GOI!uX;&CLKNzEow2)EsH4|ybuJ(u6V&s9<& zGG=shYx~64M%ltkoqFyI8y4me>h}E3@K=umRg-z)=Diy)OvM7hGhM6ob;JF%r&ehy zUJ)b8Kq>r(-HAAZRk${5&jN$lMI0WdD&ux+n_cwuZw87@H4Ek3{Q_rl@ZIz=c}D+0 z$k?BKI|ef~M>ZecjX6j_&_HZcxoXnh8HsfHBixUAu5M?MZC=Ko)y{HP(;&%ggnEbc z^3E4QDInx^L(A!uv(YUjSkK>p8-SF#Z`qpqXlXm4nT2^@(#ivmmKg4p{ynwLLxpgR zwOSQx;g@|=nfSb>>@XlWuL~|YNFRE^6F9j&>s>0gej>Wvz_Qv8xU0C>1*W%nBSYKj z&$fmyqjkZ-_SNt|HDClHEPZ-T`8)sy!04E%S+guBqVxvbk-fxi1nxw$5Ghz9sE5At zY!AdxWsPH8(M&f`RJ@vzKaApul1=xO>w4!1K0cUtE+J!Mk7Cy?2tr+^KAljpwLgKl z^@624KnEDPZ8Z|=n<9R8YlAU|FDs}mmQURhQ4!GZ6-Woq5Qy|*SY9}MG|3eTLO}DB z9YJxt7QXHa+)msgoEaf2q&-{n#l`x9-XHwsywW?DLpF71xMwfMNp@!{`~f`%b(A+- zniErQ)s}BSC(WsLk$+@r(+Q$$O% zK~aQwRobt|0j(oIJ1~um0Vr`mNZ#UNSx0g*XrJ$dJKk4>5Jc#v0%8tQ|EGpa4xEH$ zGD`g#>=@oXtABSxJbGhdxjhu_ps18zWcy&AoaL_)IOqYfKsGZ+b~9x{ZSTGe$BfyQ zAiRSpd+Yo$^Aqw5td=+2QMF`*ZrQNxU1=ZuAdsjzMsva0`SaoE!p2+&pSy>7y~dD8 z3onFZ+XI3-`pi1&Gw!uP=oHvC4k9|#CiEyH$e5km4EHvYT=OX;m}?_D@^yzpff{>V zVPO_+ytOy4^c#e>1sQvWWL~?j2?oUmj{o|z-4^Sr0hES~eiF`70{v(-;*II)sU(^* zXh<8|`hdWjarpoEeR;cl(f8dT!Sz4#>OXuFo^)Un(Q(zB)-?EpK*P>mJFo6pei@1J zZ5PYe{GQkJK|ttVvpzl`Dc_A9X>IKWj=*?=NZQc(8bd&eOA~TB(Y;y{?b#e=*ACyb zMmg9`+)n?D49v<;_z$JL#s^1d)dbh~7U$T--8~z!#neoFPji2#%gUgGkXwCU8k>-^ z*UK{0B}ZaqK>d>@*|R>uN^4S!h~}cMEWt4Am$JZiqj@?qNyj{1;pUbq%uhF#8VK+n-v|I9QgEQ? z&5M~)!zpoX`^ziiG{Xr$OV}`G4ct;~2s?mPdhIMr3$umvykXnVH^KHA%6^x;<_%{E(&D+a zdl0yjXkdiA>5do3K)XpS0%T}`HJ{9}J0oYFgSjLR_^K|h9IhDqQxHZ`a$lHsr!$}~ zZw{w@#^D!epzj7zdhdFwRJ6MX^$x+d6rfs}UAx}-YelPx>4TFJ1+RoUqr218(z4M{$`;q#ytO7R~XrprY{u_ADcGT^Zg1$VL%{>gdW9N@O2;ZL^QMrP-z5t%R zYUxLB&a+2-%slC0@tt~Ir>01mtKVw`~e`l8vYv(?J@S*S>nZ5Rwep1FZ=5g zEB?m$8Vu*iVwQ_f-*&zYL^xf4DWvV3g9hzwRjR|-HQ+;zn#@au!xrTy8p*-si}z`# zFRe*j@N5bh$Pj}>|Gbc)A z6DSNtk@k`xcJTEz?(;6qyYK-G zTe}8(THWbfA~W~Ek~u%yJ?1&Owr`y?pXcnbN~-8aMvz6b>)L{KS5ilpAjb)Adm?h4 zTV^=BQ|q)&g#4=$+dv>2IzV@78P*OCWOW$ZrLX&}QE=!L zbSik&aO;KCePNAZ$;#`su$W2h54`2dhAxSA<)2RQ^a*3PPjb)GyU%y|Dw8KK3R9oF`HHwHtytQujwpDKl zhJi`=LXCH*a}NV8{>G*uoZ#7zbFOl!>aakZQ)Bt+mrLBDZ6k||?;#5m(X5RjFR+s| zFwd@SCJz+#y$BGKOZl8d%F**%xq(CArA1#MZ=>Rn>5G+K{zuLU?a}vJr9r>n#V^}= zbT`paNgS?9d&%IJ@3fZ^432fA04|^B0g(k?I0ghTt2a>JCr{Jp&Be`^?7~+)?K;>G z?F3C$m)vU4u<}(39*;^(pJc>A^DUgco?j#KD;w+OWqA?NHzVd=HjvxMHs`XKtp7$) zf5xT$Vj0Ft-K)j%TN$;hwy?aD6@*3?l68LJdB-eh3!}@u zQ)V?eUawJk`wM0C+=sd4Y{k1mZ$7CQ?O)jY;9Xm4^G~(P^*&G$?T8`flN>4s4PtcRzWVw*9Bf_9OOTwIBF$>=!GV zwUsXK?=GoG%h>;Np}aaCmc}xCT!F^e#X3+YG+=YKpk!{hcJ!i-UW_*II7H$VvOJXV zmZ8X{l(s`|xbMs6h1;L!Z0sc!f1?2u__|tTxn-1I?u^D9J7vcmma+-`j308*Z$0#* z%b!tRB=n{}o^|Sey7Y>_#3Y*J1AswM>f-X!sLk+r_}*IRSRlpLX)brXo3K3Vzi%Hq>0+(BTs9ZsM`COmp5HCB}{L(yNXG z@86;9V5&ds30jmRe(;<>tv)!%&9qNPS1*k0xZZNP&N_*=hxU&8RYdHKqMjnD4xF1+ zBJ>GC+E<&$!TWGfN2;tfKh2XbVGK!0Qvh@C|&gLcGqe>GozE5llGY zRwMtn=r(A~bQqKBNNXQ&P@*no?KUk~|E*IY=fx%MZugskHM!PNdWTA^QrEY+yl%=a z$p0~>s+n~+Ti~nc)%-nQG`wlb1Ky;v2M@e2gA5Ew5l3V;9j|oV8v^=`8k`y(IYa{( z3WpzD&%M*BmOD4QfpbskDp`e;gPir7>*Lt_cbABSWvF$mJOF=Lwev*d|I^-?|HZh! 
ze_YcxN~G=3ky0n^3mGX(BhisCo%TJlv`M6sQ_>!#h$N-c%2YFLG*OY_5K2s%LPANQ zO|&ZN`?|I8Ie)?ThvSDH9+vx_xtHsGU9ao;y58owVG*CYHhv?tiA8bj!$<4+Y8}}= z#k+zUtvT2I5_XQ}#E7$egW?P37I`{q$rwLERyNp=?E;Gg49&~ z+GhhoDg!W-aO4~3TpV^TIWaoXnc=Q{EJW~)NbxpKxvE_W?d9R3-q0b9`^h>*vZ{Ux z1v?}Wb(WX4hwutoFw7ph>!SnrWM7}Cg@<_uLQZ`5!R{oDUVGAhk|RaYDx<_X z2DlC`Zb-FUs1XOQxSGjelc2#u5?4@+)^FRjlaChA3QzjJE{{fdPwSzf@iul zM(>LQA<1=b5m8=~e)E!2TI<04@zt-c?RRAIsd>3b_c+px7y1O`Wg4LpF&t~P79A;m zmJ+e}ecpBR4oryiK(0Ml|69}5eatP)_R$<_Wck7}rE;}|$VQT8IHg0MW+9|3v$iX1 z;Thby2T(m@+`0gkeNmDq5)M6zg$B|9BQ7^x)S`B2ddu-Y6KLI)bG~WryIP(7%il_- zOYQ5bP4quWstGuG8vP@*E8pU|prTdg+NGRU?cIlbqvRAtc2t)Hgo^yL^*&gJrXK5`PY(0`PO#+!I*@6bc z*`QTCxfQ1)GuQm%nas`#?h!)qN&hV^KW$9Uv)Uw;Ry3OTjrWLejSYo*p}{pO_=Eku z7JZ@`a_?`i$?A0p%jyzIa!h-5Whc*kiQ`-8`T1!SPJSEe2)S_a z`jIv6z0=F=R3bYoIGg8C&ZLQioA7xi~vCs8X!!r zo+ae-RI;XVlR~apr_-cILZ82_ZpBnBHZpB3Da za@6BsWyt>@{zq666+$;=e8kCa6xi zbXfj1LUc6YBZ1He^It25DR$9)dz*9Q(4sQ+L(*ylmcsf2OfT0kp$pDL^lm-s=`n7a z`UKIE2zd$2>m9T0DD768&|t^=P+<6{(qp!`&D2<#=YEpBNoLgsZ8%%;&l zWO%L#!8^kf@};uh^}Bi=paG@Wd8wFz?tp-3Gn%AG;m~JOf+7J(6~Psiu)P$TX(4l8 zRs;r4wno|8eY&APqf7V}Sj{+8J^=w3sZr`2CH8I(L`nX4#-j;1pV79^!uLQjsN4MP)tno%9E7V0HRH8qo4OolDX3#O=^90JMC=yW)`9ah@(S+^ka zgP6@lI|eDd*%wF$9AbZGezekkETlQ6KT z6CCdcZYbd@=8UsVj3u*r)d=|3VO*sG%Q~oC^#DlVgQjuh+}n&DP=5&UHJLY@`)t8Z zw7JcLs5?TSL!DDJk%bZ?8Jd||a0y;RGBRrza9Z833SvIMic zJd@7L8P1i5s?hdor;xm+>Aa?E`}gZH@P5YnyP(ADvXR zUGShKRm7qN(k1qjbj}4Ihp_OU>QQ*^SDDG6-AvyNPhDhgBsSsFJjTl2fdZ@NE;?jR zN9jiK)hPvOL5-_9hq(QtsWlQV4IBRcj9H%?rsZ~~_5Yaye@6r zJptJc&kS)r(S%S6;KGns=VK-6L;mcz(DWI)v)N|VYJ33 z5E4olHB$>aS@Er4`aBn&;B&t&&^fpZhF)3sun>|>1$$AkgdFYU8xjNZqo6q;oQU$r$5i3Df7Ly!bb;Qn?XWiXd5UI`U1wsAN;bxgDb|rTeDe-0zAEe%Rp> zl>|)=WRs~RAq1n?b_n|QR{QiZf(r?3^4zJvFA!&2EboPKebQI<*@U+|@CMpZknjeN zmUDhMHTx8CzEdJ^R>5_|B&6A5i?84vdC^6RK^_ckW~Ey+h-i5byYQ8HDPgzLOo%ov zaTTEZ^?k|$`Q|P5&}+v&^QMV}bUO49i$O{9d4LHBQxKbnhj#XUz_6zCqKLw84&v5c zhsevRLz-lLj0jkS-~0E|8I!v0gCt59zw{BO9{IoC+?gG(fp}2E97otubc`oVZykW; z%QWQDjB;xd4PcT&Qr30m!@(LiW3*hrfo6<^Z+Spi7R!=X(pXJj?(c2=BlOEcpEhiJ zmdU&UGn8_7DWk^TN)>ZLA`69f&0tJiV#lQ`-M@S3VOX7%SY6TT<@~;p9}QCMhN?Vh zlIhZqEweGlC<9jg0L-AUxQrJN_{=+O&;Xbs?g3hdSCmm|n50p;y0ZLAv38i9jLOkl zq&KcNo{lI}Bs-Vh2QorT24f8g8m3_lGvdAoqYwdjC=-hj^q`b}Y>c&LxY-+X$t^f( zn2!$AG_lJ7I*(|M6T6M(1iA~LksN?l4VJ*7RQD%P!;iFv|6)AaVCiUn&mo2YaQY)*Z3(QQp*d-%VVQ_JCj^ z&-k_3V`RS1)xk!49l|?p;&UT2doR}0&M4-*+>ao6Y~>p^G?u0Y3?)Jpgm#YW)QsHS zpT5Or_n|j&+t(%Pc<&KfR4Cn0^JjZk_NL8>jxnle`q}sc@b$k;NUlhSjVTVUF^+6% zD7tVqcR8Bvwt8DLaCf3sJ=2mCy+4UH8E>X+o-c4o?nXPbAA4>^v4BEG!&cYAP`>4` zl#0T`*lzeuHFxJ}`Q~5uS7|7|O*dU*8PlgOd)+0v|K%rVJ8{(TL=F=~T_vdblsaL3 z7YnPtBemRcOw70ZxaYINbWdBUgM+`Hl3Gnq_0!1nts`{N)@P;Od$CU-N8Vx^#-GJM z2atfoKI)?(UYO- zm-U_5%vyD4q2?1xSy*8K__F4CmG})Z_*Pkc{A#`jT(Bf-0>l6$8ID8bPGQLf1%zno zN#o{`U1aDK|H(w(emKFw?*q1@t9ugBA>y~lMtgk>lDF^>sWOc?>9M_^?%Vbm44#gs zO-CGIuaiMp6SbYm67?d8M%dZ~(Y3f>Y1)@Q0j}&6Ec!%cv_6zi(w;!~-_;~+*Wxv^ zI>u?>@Er9q5QIFAZC~26>pex@Jx+bPBu}H|l}P#52_!g_Ajh_TpZH*I{+rl~C+XQo zhvl~ysU7Kq?z9dFfi7cl!5c}p9bxiYJ3x~m?GYWELpEKGtuO_=T%2y0Uyvn5@xo(T zQ~d>L(hB8tL_>>{@#sN*Lamh>KFiElMG)k&9G^FTuC8hB0a2Sml#>9g|})itgu3XBxahK18Xw{i(-;(SME?8>@1#0r>7lFZ3B=|!5z-bhKJ zx{rL=eLY}pDKXN={u@S94vsbrREe9iySdSb+p!wrDZR<;C_Me08{EckY*`jQ$1XzL zWw*g#nYpp|IOWl6RrR!w2!!q|eOSA5C0C49-dVbWXPd@jiZHcn{(EQfTk$k#uFniy7(sbbV%@oMT1oR#!7-BOB^nFKvbzKF z9MgQ8Y-sFDy8zN0TH;TqSW~~*hW{p(#WrEyb3ZlTMNhwIRfs?`29ZFih;qcd-TMx< zyIh5YX&(CS!VGr?H8kQqGkoZ_TSSc~Z=*?^G$3{r!${;@ISKhXWJ&cMj{|o#*;AQX z>prV8gA#Yg>iW2~-?xUT(r2@`8FXB)Hu&J16mPvXz}rmR^oxuf>pJ-uDBT+m4W(Y3 zUMIN}FuBX?wrQFWzL!%lnl%2`cqRWSNbHo1DcC=pG=ZIH617{^kFiukMtiC`d{G^- 
zejSo#1+`?#>sxOJdR`3IYQJV}<1}&YFCRRon`*;zt2$aWofcBH6r&h+5|oVP`3(wg z-aO13LsIMkE_;r=L>dV=2{~kw=(FMVYa9PVAycnf?zl7N9qH`zql$m z|575UG2FP_M$trNuFLzIgEp_qvMn|fW1J|jZdC{N6hCE6WNgxPnINguKb>8y31Bgb z=CU~ibIA_%G?{w~Wt?dwpe%~dbt$pSpjwDMS0FWe0`j9m}tqJ>4m-NEq9 zdrl0SU5qh}TM~iJxLlfL--V52FE>ij7H;HXy$|y1P$z1HUs!jYurlYpxmpUGYxZ?IuN&fP?16$XigTp-G%F6vMr?U>~X& zBo9RimQ0M+V)+ky4o80Tdr=@ha!$i>@$(LH*?ctzmjbmY)@TER))#V4T z*k>+a$!0bh947kE{Z9w@)lLw*I=DXxdyWZXV&6C`V^Nu29g04rAzZU)Yq8ah!g6mMdS&BjW|BHbl(l>LLKcR6M;?(DW9+h`d#bMG-M08zFesbUa^d7W$ za}7qp)6DnQogRS2s^NqA-e>(xhIGEpSHD~NuKmDM%hAGXoO|T$lZev|9~WD*mARwl zpN|=^djxXs^q)esQ`%P+1WUX#Wv}<<$u*y2=jQb#hZ*?!fkN*-&ATjCO2h)9v0X3p zaN98(T@&qB&}tL>y7efU>zP`cN)}!$w|e+w`BTGp!2Wwa-IJa`7lJaO|MFnC2g@kr zPkyf)ivx>xcN)TJ>lDJ zCn$)4b&TbR^?s$~ ziH^_#2bYyo$v!~S15;Lu7E3OljEq->?1H>cLw8wA^ur29Ddjss%r7fR+9VI#sWP7e z-(96E%WZ!zVOdy+ct3di9q=Ji2;p@Iskh%*duQSI)uhhnp3kqypZ&;Y4oei3L|R^; z?w%^ah%q>zCO+k`mlA7lGVd$DE;Y5B1&>J0tEWB_oQ%O_dN&_K$&LsJm>%d|R4%`H z?Je2L*GCH1C^b!@K0*IrOm<5jzd=8avy{}S@j*0BONT$WMXY7sx>c#a=1tZnl?R{@ zcs0wVub=(|-%}p^zzBLzYJT;`nfN)gO1k39Bo7(55JK$_sRzFcCQ9H1zWGE6{PW)g z)e|U4Pb7Qs^f9KB4H80xjApN$_;e*Sop5Hs`93{6?thNPvkC04Q*){E|DIwNX&Ye@ zU@7SPPiY_1rw}Gb6JFC2Z}WejJejcWxo!{pZX^Hs0}C%ec@%u$9pmReB);jYt*GDX zQ^-9tF>dN)GMcz1q_#)q|GrvYNeIu2HD=8fA7^7WlU{twvW44wc0 literal 86598 zcmeFZbyQVt*FFjxkVaZcT1vXROS(h4OL~(M8>CyMI|W2akVcRaknWUDrBOf-e)sl$ zp7(pccZ~DjIpdsh#_JfuX6?1^b+0?-yyi8pxuP}H6tFSKFc1(Bu%9T(Y9Sz?SRf!E zwV*-35qDb1XYdQrQ%gY_aeja`82o|mrfBGifPhH=|Br}}nN0#tn0C-HfEuW(2wS;2 zvs+lZTH3JtI=g|h5fDUug~3N>8>j`fuk$k(Phnp%+COIqgU|589JJJbPJud!(Hf{~ zP|LV_*iiGa^Rjc&iepeyQ;T|7+X`#R%Kvpa_)Cn|9tw36=HT%0@nQGjVR!YgHgKo-|fiScv^WlxIrCUU8v#hT3EVzLB(ik;WzsC zzkl@;>R|gncXILk>#@KCa=^dg;9}?G_;=gjQc?I(VHsCvHxC<6Ptd+NpXi@6|JSkq zxzE4O*RXemx`I*gaIkvf0=4k~7eg)Jy%XpD>+1jOGybzJbq@y{@YH{<=KAaEe;)hm zd{GYgga2b7{x!;fj)Gwp#}MWC_mqia)NF7sBOpj3Jdu^s@kRW;?2|vGb6evssUe9{ zTVSdG{T-1Mik#*m-pvU#jqVZCt8%G}3H`v8meW>H3vv*ggTaV=93L z?4KlVO=U$0@(M^B?9H{OJgqzZCjUyDS+(hG>Nxg!4~Z{pZ?z`A(9M zJf#2In$I7v{Y?3#X>9Gs0ArpkEsx!ZvF}=V_s!vQtHN0hyj!mrI6XG|JJepsb2M+h zx;&%`IQ}$V=-Tm{|6|#+(xPR*{qoX6>7c8vGz*2Q_MZ#(w%v?}tw-YTO{`zak>#cc z{c#qAQef!tl9t0DDfFE7WBFxG933~O7c0S3?OxJwpM(%{>E|)?&QK-->!sWuCl&G* z44DRB9@)JXgjQcdGd|wKwhle8$Yq*P9#?UAJh6-d`^9fxQ0>u7oSk6 zYIse!40QH5yqhO{KAk?+Kn|e|)|QVW`9$pJ-nZd8TV>KM?!h>;h;(1H#@4V%ekI@x z`-~?$zia;sYuyl7pCe{^dsF2E71n~`wu0)n;KpJ1`hj6!kfpHyIjSn++APO7db07k z(P1*8@#YczC*)f6E@aji@12@%oYwW~2V7FD1+$!!zTN9ZAN%X#Xq4B4hTrvTHe|?9>nt@nk}62h)o0EE^-Z*3(i&)VpquII zDXn}`0SQfB%APH=~UoROL@@cTI4)$>8En-Qus#|Py7w`G<4G*H153o zu6VA|A|?HPI(TZ2U@7DZ*&hmzNufqs+Z`y=YyA=lyhvn{<3ky%_tfE$+9{2Er_nkN*eD`D`VHFCIrvS6hMU=dvB`gi zlIbCRUIOe0G9pU?KAG=cNm`RXCXs!Hj3D*tTH!%yUeNjp&i(4#pkEH#+GfcJh|7A~ zu_=wlJL7J!F@%tNRff%Rwzj(?nONltl0-=C6lrEbe%oglX}!iBr#dkK(p%vgTq*l+ z=&1ubw>St&bN%<78<+L!)QaRuV`^`IeT(G1)diaigGQ3icO--=@Enh?_>Fqr2t&iX zZPxLw$TRjc^q8|XU~N7JisX-kJ44RGf6(Fd1PO4#7*D#&!-?@&&CWldi zuiM8?X|r3dP%KOwNhy4m-097O=jZ7X1Zk~Be2>i;E;C4YY_SNCF{4<6Z%#gE-^oCN z_@bcin$~01Due%J9yrY7@-MetG(%2_Ru|HCgp8|}eK+EvDw4}O>|z^S5XUMlfA)lC zhJ>TOWM=IqPPq9R5xX#!Mrt!q;#)9sX$~hwUi#{}JkTvrMb}+I{{g>At)!eK1)?T? 
zwYih~oZZ-Q@*TDH^&&8I$KCf;6jZV7kI0<4WVi}>f!BsXQ1GdJ2c7xtct&)dh3?t~ z$YiK%oD+_fs#B>&>yP-kXNs+b5wJI{hGLbDL3f_;E1BH8mEU>tMo2e;1QBVTJBZD* z&>L6f45K2<;np`tjh6nr5e;~R^je)b3h zJZ*xEBaRJ%V0-p7f9|k<}@0?=?&y{{j6oDh-NM$T!#m zUkebB{kmnD?oIL{=da78liqsxCw5eN3ZLzVz1VZHEiXx3*UQ^SKl*sh9^Wr5jvSn^ zIz(kd4*9s|+@IQFTR>!#9C7bK%`rdJ6nEI?k8{tzgz!zn0^xS;B^VHuyI&>yTcmiZ)_(3K%A=5a)%>DQ@d97dMS@)}{Y3jzv^*Rxfe)LcLX0glSZwui zeR>eV-um%!9g~tGTqIata?X3vxp`@GfXFF));7zz4;c}Ya_}X$juCb&H{r&e-p@Sv zlyD+C9!(PFm7eor&v|@dlSvF5pW)8VZ}Qx>Rup(VHCGr)#CIp7%{mgfr&f?n&y&qa ziUvCXUa}4e6{nbngsr)-53)HFoms-`j8^Kj;OuVuCV|9+T)sqA;HBRWPX6J7NJ^y~ zG=Jw}t8gZ&{UWYm%JFU5A$oQSHf%vcZo=p&ba#u;k%Fv6rVN5Vf$}558cHejljVBI z7L#P!UMXyQm2X+LL4lqkn~G^rP&y=u!B_Et_-;&%Mwxk%GmbPBhjEcL1vGnM!d1LtB_CMkJsiI9G5tU(d#6kdd* zgBYr-Z<5kjaOTF~Yr`b3xM~JBhWU^%jk~MS;P=rBwNqG(O#VNL{0V*fpMS4K^_e|t zJHv5x2vMJcEnIu$N~6nL3#y-<=k8p&e!OBJ)k+^rTm`n0>}Sq6_Rp2zySfqKSaakZ z3o?ZpVBL9f`?VOl_rstAVNLgbV@@nlL&Zqcme1oPm%{1?VXH;M+eLuh6ef7IrdB9kc#3R48~%i(d)Z(jh6dodR`QF_~4_<4yNyK1DO; z_NzI!|Cm{G4lqXAc8m7wr-=-6sqkZwt3<4Fl2Kw8BMlW1);(#8BdcLu?|OM#S@>Dj zxL2zVmcd%sEH#ysa;_t3CPM95#@x9G#*GCf?58&Uwp>;@XCI9Po03(e`3C$GWVpdNFl)<~KS8K>}Od$+9=hMxc_u&%RN5R#RbK$A5@47qla?-<{vles8^;Dj})=Xn-AmFq=2F2q!q zP&!3)&pB%WRrHP35-YomL(QNr-!z8(tVTXt>XU|`!of{RgE;O7$5~y7>xVfr2D0XR zr!~rH>|ysvreNFXymn|G+6Ql_Q*}>nWK*WsYB+)AkOs#5QIF50F72wW+!_u&pAM<| z%f6}}w6a;HGn0k)GgqB&-E5eGMuG}sTn~vKYj(OS2|AwhNg~yb{S0|~z1-r}y8pIf z-^3;#UjqG#r7o{5;7Mj81Vg6<#uch05t`a@4)6Ofk}+lhXM)0?DB)j?2hJ`CbWdf> z5m}@`b2prtDX?3!hk~3gufEmil^OS{RKpC8XWjHQI6liFtnDkkWqwN^7@iY<(^**J_d=T)L z*{60oq1gO$)TO#A@BZ1W903UFcpU|bC3`W?4;C*{jaW=30(2DLyK{Nj5`8F>?^_swnt??)QL2Gu^fEw66U9%)Q$HtCtyJi9Y;!a&d0ZcL z-07PI60!YSi=@I4-p#j3Gid}9bH;;Sc~ zD}fi`jik457?$$SI%nD0zUeUv>4BGCQewz@>wyX?4e~@J0R7-RoXd zssP+W2QX`0uhqpfL@Zo-y9+aT>|9#*-YlgO;5C8O`bH%KUk-5wgtx7SEl#JK0|mHP zp>R**wL3=Rnc>oQB5#Si-0IV@7C}x7LgO5tuU7zRTBoRsUxgV3HjGSQvNxIUPnFAZ zp*hw>nIrB61BL)RNN5H!S_Bqpmv?hKqg$u*)#(?G(H;;yQ}DNsieEpgAD1VzElJT6 zUXP;dizDq0kS70r3ZOOVMDew98uq)9n0Fti^^EYE1visqq%BVib|BLDzkmIp)01Wj z@raOnU>0!lQh{)$A^Lj$*r(zmlY;R*^XEr(M}m?Z0?93PxR`;AWNpW-!jX+cN{H^80a@%29;Ki5HdzQ)d^VUO6s98RqE^fsvQ zm3PlFzy*qo=3{6!Y-?7xuM;93jBr;5xY{K1`jtx&RRG^5>$f z1EiDZ>Ue!STQy7=Q%5DAEZE@Xl}b{2hq~wq@*~DjTw={1usPZEsxuDst^lR}1`r03 ze{HZI55;{_f9+MJ7<*4ix`*beQd_>m*EEOIc?a-d+TTAh3a4YC|_vbmuL~XkM8@Wl4BAcQXhOMRIz`( zQlxO7jCxEwuL{`(pbdM?90DBz^V^0+m#p{66zO%lg|Tmj7!&#r1RfbXmX{RD^)Q!> zQngo#X^P}6IP7Q3EMsg>n#t5Bw;%!f2AV!PNvLyPL1>c8d}VuT*+?<}G8{n!+i0 z?Z+~mss?CT7<#m4a$Oc1*m(ACsY_a*XVIuh2CxbM8V;?$0prYMX<}HKfCyV|khWOk5hD4yv&E(VgS5-(pMQr&Mo9bSw z>I8XvaV8p&<24h_QFtLmJYFFYeWhFkE3=Z`9t*ZeM*pVuId}UWFK_-VtV~EUnc60F zSQ8UQf>;oFTv+f2x(y{Kk1sN)W@@xu%9CPPfDNx1DypU-%KxA=|61OW3poZ#j^Wg+ z93|hO&?=)zuBTZc}p~MDc}H;clF5OxC|OUm^VhD?|PLfcqY@6%#JCxnQ^N`(%d&hVOuS zCBzTo+(*1ldDYkW_uRw%rG_iwBH0w2fbOT4B!pf2`9G$|Yy1``d)(OZJKo_igmfUW z1gvi%SA9d96c8f20D{ae^XbJBi|5_wq(|mTU6dvg3F=F^wO-DUrA?kU)O5_ZeAU|$ z7YB>a>Qn=8gg9f?082C~2#Le#AA><@n*fhF!C@|Hc;MhKZ|xg?HX@R!IvEL}qvkF{ zJfM?S8kApsVM;XrVd0y+C)yKEQACRbd$k;bxPC>IuujwsY7Mr~DgroTIYfhX+WhRkUksm#W!}18-)yL)*tu{5o_Yw&~jDT#5{B%CzUq&t=2! 
z#`L*ni%>l@|1E?o6p9XZ^sCNQ$WqHBTIjm_g&G%NNcnTWetx~>?hklotYk%jFladE z?6W5-yCIasiWEwht1s?zzoC=DjS-uje*8!(LS3iiyL3j{B#Bf8RX%I%j=&>jrq?#i ztBfq*H*>z^G$z;-`}b=_43rT4m#cXsB{K|k>|ijtC8SbhG&J7h!XM(g_u$YY+dCQu z?L8K5(!pM7-bl9}EebN)+ioeB zBHwF#{Y&W(v!it)UK$r={A~jzrBEhjUb64%4I76z;xkDf$Sz>I$#mgHFVz!=K*fplhe2%_Gpzq&owgFcP)s{Z0tf*>XY2)>e$$J^&c34@b zGxAOkQ9^Ut-+CFql72%NBRxph=hHv}zLC9e`Non&+sPPHIvhufcRyI8;|1e1R^-c9 zbB#Z6=K%IC59ylRt8pO#6cvK2VGnk2GGozmT$Fq(Jcg*Lw!7PF3p~1Lk8~2Fxil+}ZJhe-hM@%ot1V=t9=Edchh3*ARrO2Q3{A{Kh72E*WcfqMTd z020w7xGSW9RQ6Q--Yk%le)f%VmGNzmOBG;Irx`WG+5_BeFW`)-P9y>!nwEFiEu9$v zE6`=5V)~!X#xS$P%SF!NIjSic^huEK5GT;V$S*;l^PzO>M6~LQLyxt0mnn{Le?syW zwtzUad6*ozSnr2EdGpA$v;Pe=2~V$}O6uZpc~ARwqifsAz4->$74g9&=Ntgi&y@$Y z5^MujU)A_uDqm9H^4Cs=^<8d$_Q?}QX+tA`QRE%LX*qa+v#AX7P#@`s6#y9Q&qA0U zC#`surRCTasnTq?VEtN^B;yr7wcnyjD9_vFFZT+<00g)Ky|rCcH+3hMzqiI#c@GVH z>;-}t<~sn`Xb|~Oe8>6ZMa63Gy(QsfW*nO19&hh3!4UOD z6!3e{MCmhZN9VC!SReHh8MlAYr-V$IAl@m|Y^4Fw4z&@Ya3i^Zi?LELGKeSw8op&! znL_H1R=q}qb~I9`a}g2AAjk?2y$A!bMQ<-R&atG6VJ?`8GH1RE)MMCEqJd=$o!Pi! zG^UaUG`X;Z`+iw2)~|gB!_;|RVgAMguu`l*0`lu|{Dnf28->rmAaMu;UB}4G48@Y$ z2u=s}LquaKMZ8(t=ecmVLBeqxCzrqD9DnG%gBNpSfb!M8U#6+ ze%~QaclSwhADZTgy^W;m<#8%8c7Z@o$Ei_v1p2vHM%6FL)|I6P%_5cSHk$T@36y13()gztG02zk(fhx0Pnb4KWU!@yS_2Q4`uC%zZjJgW&zv zgY)Mm2<%2=p5ayS%_cN0d07 zK_gmB5M8jQU|s@N>oa#v*)q1ZpS64<<#M%|M2l}fcl*F!P$z#4bcW2u9_P<`(WM2* zR%8hfhH0k$dmyDH)r;t|efRq)xX!gS$NTASM80l?9RJplQ-NPo54dUS<%{FYWCVf!=)Ht;j99SAQmO(^$tKHdfZ`#jz`7xN9e?Tro6c4!5sYu`Z!}|Z@yD~Mv z6#Vy!|Blt4HSyn7^8f7qu+RL4@kU=VgfOJf)0CMfutKsO?h9-d5hR2&w-zmd#Eey5Pcs92}@_p%>UF$ZWh8~Dp|)y|5c z7+d#0%VAKb(;wN!-<&n|uLhaMa6>>|oq&vl)4cP1Z!*J0QLm~3+?pwg0`VIj{f0hBx5_Jf>NlP0@mcl8Vv+dzoK(yh}$;D+j+DQzXRg@u1Twz2amtS4|LTe&@cpC@v0@AVK6_JcwW{^-7qF}#f2!wUWsmeG<8@m0 zu|S)F6QP1Dwxrwon})DWIl`x}kvK)0duSEBZ5*%qW?u9vd!3@VoxG zRXzd)a9ZTt)@Y=_bNGXtw>9m#ss>CoqvX#8y6E7s-{O>t6#e}qJt$=w=08FNqbC21 zLk7zCI?lR8<^P%Tf3>U_1F%Lhg~NsRzwh(T^Pk6yfe=)2@TIrJe>T<6 zY92@Z-&_1OW0E2NzaOYE-=E*-GqnFKk$%&B00MJ$-|uk$S&{H%s8K4p(HCzwoTU9&n7EX#&f%JC8Lq0N+8F zJPS_$+n)SuV9Q7VwjlB>V=en@ga6s(@E<4ujc90?^8Aw)`m=2<;ex%Pg7^7{JV23K!zvPOTmKq_V333 z>KBMycrS zi!O-ez=af=f-fH+0%F->$dw^z`0SgZd-GK#>oiSWrBwOma@Bt>>a9m0rPr1aS@LhU z`j>sl*8%of>Cut*e{K*a4zNVGyhK^!|Ff+Ue6Xrn{2KrG)_-pT;JW?LN$8lWH!K|f zwq1YQe5Mqb6;-Oz*8>&}{}6cO3(`YZKrEPQXqy3&J8kDE?~(txsWh~J!%j|KiGQp? 
z8$!xsO93Qcv1|=<3r^MTpYD;@J1;gozc>W0b;dZh4tQ}My#vsU=xj0re3!pfG2iTe znT=4S|C&Kn&(v?r#Ao@3#NCy8eQCDx6E`3$mG_)b75y=#{o)=rtp+*1b%$jO)|uel zoK<4HH;)P+E^Gl|6A1E+b@PKD;WqbStvhm*kE?@WbClzYv390ph&1NZ?VNz~0u(49 zn}Dv;ZZy|qx6-cv?&J00hSU3g+Snw9E!LhJVz#I6TK3;Q07^Rk@S)q*fpX@6iy}{* z2>h0^UvPV}!sV4nnBLln2(pL39c#RRIfU^oW{ih^n=e5Y#Rxi^^E<|KSSS&FJLim+>=Z<- zZRZopA_w)8a24L{4UpKlcpqeYj$}9~5in^)K-R3(VXGhv3;;^L%eNi_IxbtqzR9mU z06hA3d9pDoxDf~RBG0PA&s&A`X$g?Q(g?yQRLt%K>7jl(w$?zP?8MGGBNOxI9bX3m zT#x=Fn&=I;NU*3vC{RFVL(e6}3{ZX`PiF}Sb3&fmfLWLD+Shr%7QF&;YvYZsD-?C_ z?qZrX$&Y}U0S!ryLy z!hs&`VUTV572pIw2GkDZ1|6s0PWO?!iuxX}CdX?^LFYkY7bIN(&r1D^FDez^W_PBN zyrcXtac@_|8`Ugg7FHlzMiq^C3e?TzhJ)eTfEt(qnPSSO%J+jTK=-DQJOJcn!#qq+ zC2{FW6@%}tR|aUF?!$9N8cz|`0_(YVmzsG}UV}WaCdS#Mx`ZZ8Ml>L{6DF5IDvLzP zV5b?7ptZ#rybkyd>|ki$2=$h{yPVcHd+56wk{wpQ*#UHY;gw;c_k}ib^$H+e%%#~~ zq}*Gg*@rhz__$`)OeB^V2#>!J@lg|pV`kxLkZe(rOe$g{>mMeli0>ms>SPA+qPcGust}O9A zH?!m(mH>2|XNz-A)T>K;AhTigBx9Sk7D(B+czET_Eq|TwWmBZDwE20eQBUD-Y2u!J zyK#BXR)yf;Eqr%#=E>EUVl=ot=fgfQw*tnZ9qXHRvHc~#Dre>~&t5t#10=RlCXMrX zat@7u0izaPN5$NoaSVh7Aj^|2=YS?pj{NEb*o}%Vd7kV@o@@yRV!YFNpzH4L0iv|e z$QMF1iy+ya)Y#+ZC+IiC^_g?|y^XTVtltT53Jk?FDo%qS{+yJ*9wgBt&iF4FzcX|^ z+@t8^}p)!pmI0?F@}q&JFVsUQg6DsIzmL?snYO4vrZ1n$@!E z%H=gzrr-76l$jf!%B`qsdGMndF^`xv7-VBNrw}48YfA=HNG?NkyW>(P(lRSY=+kM`DX@FNt0AcUsRUk42S>}T09xup7% z2jh)y4N)m&S;)|%ned+?;o;l4p>aVPJb;Q=>t#sLY!`9!75X>gGmb3w?d>H!en_t< z)0)+&n9vHuRJ7A|<)awN>WRE>0~L#?hnH+S3A-lHbBm^)$}tr8ei(*JA{L7hgku`% z&3l->PCQ0!%~&HhxsJU8y7h=Jj2{W#T?2(e*!4{^31Y@)vw$z5J8)rb=Y|vI$I5IO z>GuyzeM69$S5)_pYLamg4hRjDuEUekxn)r{EbwO?jGOW7&G$eeijrR^yW?4{Ai2k5 zW=KmZ>{2hJ1?BA#eXllk5Q{WS^oHK;IaSx1=#>S|%RO@`x`r4u* z=;}%aVmcEl(RgFrOaWp=7E)18VXBcGpuh#Q0DA#PgVPkQW;oOSm<3Wx+_Cmlc$JE~ zNtWvAoHQ&XTj<;?XfgG+K8dOIY2M7@6Eo_^6d_65}7IC3*8c z5wca z{wb!%ErTEwP31T_asOiJXC!ubk5_JFolr1$zh!?2E`q?Fj`eCo-i_bsz?qgmE}HVy zmm)>A-Zi!zJIB}KpG6uU->6vE71UVrHTnIS%P*JiP#U9)?{5={W~l(59W-GbMH zSF&4_ew%oGn5*6D_SEMblE(@9ZKB3IZYaGJ5a*?VNI9MFJH60tSKjGx>0i6C12WII zk5e{Vaf!omWzrLFJsPP@T!0F}P{+|6O0;iCo)3-K`!_f486d>oqlc*NyMs&?G=JhaI;+IC2e7 z$cK&|6MZc-dpGpB1^XM`GL|xNFw{6%)bwtvfiWqS11+vR46`0I)4(6PskU_mNl$DelNe;< zom{7cFbK_HN48}gop1akyM7A^8~~|4UKj_p+3T@;JS^mR$n1H(UrGoYx271(GovU+ z)yfciQdIa?c6Vb*S)Sq(#GNMktHf;=tGM~e$H7!5XVHzY+8MLG7aYK%65~S4{j8zM zWRr578p?$yF+KJRCZAuQ9W$X|HVS2oWGW$;rq?*5`3drV(f9^HmRuGhFZX`YrIqcw z;C*M1HGKGH&k)v8K>OHzwe zgOq|2OC47YB6S_YY?SCoyV9ijhaow2C-Ca(ds^GW>ugM!D zO-m@SNX9ZEzYsMfX*yr(1vl(i&Wd*?305k;v`N64kBQXsaNfQs-u_xcL^A1gx%eTI zJEF%beu9fElXu%a`v}$6vKW+i3L3qrvnJa4E+G%S=R&CXa|2J1!@kL#--HVgR=O^dm(5fAEN~~9G&ch@Jap|lZtMZ0ipVTGM zunCaYs_0lT#H#GRYWOZGC4O5f1WHbUI+=i2fHz$rYeggdbcAtE#X##CjlMgA} zXx9ty(xT28w+J$uyzxtgj;^&NLz*$%ywjaBF_a{C=8QKYe&((9D16T_MK3ZT;kH(s zYDh&8QeAG6w01x%^wcERia16@?=oVyBW)7VQQjzhIp``&Ye4wJkVIa~5j`8ykS-*lare5hDSd0Du!M=C^ay}T%_%zzLFSXVm)>nl zR*9rM4AM~-hW$g(9W7#aAxaja*W^E9wDmR%0;i}DcN`b8nJE5#7l2x#junp4z`sC= zKeZJt3mZW}HQ!*TvkGsA_l~oyWQ+H~FA!pe^OEw*Lb_V9j0lpOSuTX z_6P_e%YFy%b_p#ELsxc%JJOB$G=pjC4fEB@INv`}$_rn+?ll{^FRESODjn+oEFtSg z?B^UEYsur+bcv$e*-wC)L@9OV)&(Ck-|R&V+swMfeO(eo!&dWxTs$w)q8yekWI2{D zy(L-akiH=dD!MzCe93!=p&zC%Y}P*EbztALjV~oo6)d!lM%`n5WT0W!J-e{iO_6&= z@PKjD7mL17H;j!th~~(& zs1o`Gp#&DjuxfV=r|d|rMWd~#79xDG@y+kjk3LL|rq;MjMh1pJNC-E5^_!VUUh*@i z4iNZ&lCYk_bU9H~fR)A6$8}joUCVIs)D2JfPHzyBvrEuW%DNVFY0H|UkuiKFiLfDXd)$YkM5HeQ1jB}W0Lf>I-5tNd~C@p;}#92Oy(A4>ScXJ1+#Sg z+-MVw{3ogV7i}+##tO=s;>^Az)q^Vwq-cN6=jgz{>z4+37end{*%0}7EbCv){na4D zd*h#J1VOpMe{`Jx0Gjq&mMFO~DC4=t{*E4>(-k7qbOH2xx7I+b$^WA7Kx>0=)r8-} z`#{|X$w1i^NKbgrUFu%`PsV9V9h~xe$ry<8mFULWpI^Y^eBMRxihmGBI;|l4;3&~` 
zXy%7W;n7b67sT372zk&uSCJf7w|&WKfP#WrwyivF8>IungM|;RfGFs`fuq^l@FL!m z#%7E#w)f-5mWs~uWbm3S(w9t|!62dU0_rHo8-QXluo9Zh zIsg}(AQ@5ZhlF_ymx7aW(+`H7WVr$kX4g5DV4g6s02LLR>>T(I>x|L?;ICmDWP~eA zY%Cg)2x7`dpga|;%;@ubjJyv()m9Do0t?1pKM%eHUW1}a?VUyc@4G-wi~aC1yowZF zh-cz?Hc8Gx23L;Xbk5SL>uXAeU=V?hu|OZNHh_%3(khVtwSwB;k2j}ZA_%Fo>Tt(H zmO;n}6yI4cEecuj*qk~5Nsh0#Ik^U@pQKDe-(6NETd z2aPN6lC^U~Noc-$o_NX&>nDJ8zYjqXTih%Numw7UY4%mavV}x9t)HU}_nrt%9NN(! zQvy~(Ek_t$1r0AA1WX?(zx#}#T~6T7dC|ecrDo3xy-I#SFZdq((kzm924QHWQKKoS z^M)60m+8QZe}T?@bKrG1#oI=p3NMh0C3aq@rKhQk2?`mz_YxH(0k!DeXNlWS zv!YuukGqBFg)vK7$*jKtQ5@10mG_*5anhz{_dCM=BCu;jEi-lBBg19)Rb+e8M-k*M ztNhPFKoA5f#c3`#U#mA^gIdYghW>!IQ^^tb0y6VFK%~XOR|Y6{{|&;~8=QxrhT#TH z3<%1l4uA~UmRaDetwj10D5q9AUjsqJ(QNe!pog9V01|EuG+6~1Vm=E)JNGU@fr1SI zn7`_A5T(eb!?K)R5*q;r1hSd6#1E>IuhwHZJ_0h#mrR{Aw9l7x`)u(Ejb~+q0s#VU zlh;m5h7mNBPKCAoeBz|>1|hN?EqE`D2KvYih*q^(T6hFOb>=#{qdL4S5U`s!Kz5HL zB1V{GzTOiJ=Rg$)#6g7clH9o-ubsrLCQ7`9?wyth)eacqEsU@_?5$C^_cqKIMD(jV z*%s;&K*x#u1Qa)53gr=GZ{^+rGSxf9SY^MZ6?+%3r}`{8;nu7jp6gBzB<}d#w0ujk7f7C#~N6Oxg%G9@W-)pT$5ucFDs6Us*_7wrz15q*!Oo$6I|_nV1$Smn&nv2x(m>85aJakh3tvES=%_)1j=xx~n!alS*~A{q!6B923$b}bXuXe)7 zBzYpRPfS$C5VBxaxclt&nsqEfe27pANP6>tfS$MzNr%N0uN~U~uE$j4UgO2LY)v!q zHZgcX-0)-cuk&TiJlRvhhAM8Q z9SjibOv{XoQRFjWpjY(4vCwSu-K)AMbXQ|0OIhR$WQ+MfQK?Ooc6RhTZ_zHlUujX> zg$R80eSvYM!-YMXV34<{dDz&k&ET@r4Uu~QPd%kheO4l!qc7WRdVeYd zy<~WsLi#;a|HY3*vsnNPL3Syo{Gj2j0?ZNe9ZS7Z+nX8d_VbHh$yYr;n^d>tT4ye4 zA*fUK@uL=*S$ zQ!>vv8TPYCXi6mN4R{Ah2n#{~t#y{mXTLeclnSOPaFFpK@71cBMe?$J^_#s%|Grai_5AzM&&eQK22S;4c-fK<4Wy@1 zmsfx0;$UuEzn-WP{}^SbI;BuT;}l?9=7XEp&HF(&8fR>n$a&ge3%IXf@OlI73G2Jm zLM>7bn=#}ns)MPOBTEL7pK<4wFO&)YXFj+gB1$3~ZCXV^fuqB){g@oSL z_QsqSlKp^^Y;Axn1g~*^P#T8meO}|Q6!^5qD;PuQ^A-Q^h}&By;I^P)K@}{uRU2cMWA>!WF!g!npW7@06k9lhofvPl?Bl zbIm3@lv)J6Y#b)keM51)9C)QZ7`RmK6~M+xK~^pWdRDh$iKGqr0N&HVJt3uIUR4+U zAnZ7Jd+4NiT&m3$aZktq8E4mA^l931;@|UiYMo_KErSCUASqdcs>&d#sg7qa5MH_8_liD>=^~VM4I+q= zdW_WU=D)xx)B)vn0(jWm*0P-d>ZIu5>k#{b%@*y%`SC$o2lVrAK)$-6WV?uG+4f~CBH%DQch{a>k$$RtaZs*^!kQ;{ct8J zUOeZM^3vQwtC|j=4V5LHn`c5Kc>J!W3)F*2<;E?LM1-yS_7VtI!XUZ~Nmtfi6uGho zxf26@`$+A^-ox;8%e-ats-c!$epw&yam%>pt`*DP0(lmHt>TCYDNCBo$Y2Rr+lJKF z%Hd3TL$gT6f#dfHXC2~fa4WTTWchcJg!Qp_nlCLvyR1?sTIhMp_l=}sN&5a*~}lJLg5Asp;z>#YUluHi@FY0xDo z_JDfjXgoLP!K5H&=DGmoANoPZQ8+bnWP=lVE2;v^vLo*(9SPiyRKrn(Syu})s+xI* z7|En(?=k;y-RL#CN!^1`q>vzqB5nBXCGxf#pXwJBM=@l zVq1RlRDA*&DaqCqOpDOQCMqwRhXOCHnZlekt*ZSrB;LZIp`oPj~2&hEXNUq|tz`4EODJ1~sBEbw8gZ45z{x z4L1qaYAMefbr|0~_{v9IfQV41f46@z@$~NaQ}GUZ#KpT^^i zP)(VHVsJZDjnJlZHIp~9=mYii`Xw9P*I`;=qW!sAy7mJs3c;l4v7X-`Xej=PNOa;$ z{b>kbX7Rz|B?czS@%nq-quE<(eqw*>Y|C1I)w6>QN^5RYN|>b89=Q2Z!G2e{QKp#~ zmWl!oAs!g;vc1vddVtZqowRZuRCxN$!3CX=X^{gj=b=hEXNbrUh$ZX1i7JsP%a!K0 zj-%J-#y?@x;Z6!CE(E||X=9>H===bv%LnXb&u5jF1qzj+_(RL$NRAN=_pstsI!NLI zdK6+ud|aE>B7j~qG{}3`c6U3=b=zI|w>_>IHG2)f>zd@XAVNeYBbiGVgk)Hg_?O#@ z7(AB&NngrvI|N&Qg~CqWq3B>Ij1xYN#&01D3?qQ7I6;i!_Mv?NtO=GNyVhz4B)9n1 zX-!8Q`9^k2W?U4XZ8eQ)JhB5pWRx*xjm2ltS)dS!*Z96sz9m{W(j?KNN;Qm{;VIT; z-8TX&Ad=Ml7H@OzSHJfrYvQ^4ZfX=i3@e^6zFLZ6AJj9x3CCpK_>CtDfD~4??1Kmk z(R}c36i~DsUmI_)pWf2`-cQ{v%okPHwd{1P&Q3iPCCj;iR2A|&Gi{iLMO6wgEq(2p zMX!F$>b`yG-TDviVTw5ap46aUC^_^ ziJv-gkSXLZcw_><&~IrfcjkP99Jha^dYfwio!2DYqS26t{#b4vX{8hD56yGn)iLdG z|D2pL(>@x`=Cm`I-s>q*h_Hx2`TZ7Dm4);=yr#C18tpru&OEzOP5Y4XCCe@UIkue= zZWh`a!2#yk)&!ChEwFnEYhPLXM2L|7gCz;HS68|>i;@Le_-WUf7n-lytv{BKd&DUx ztsiu-_uwq!f_-*6)%o|b>2kMu5VilV?Pwf6*W=Nf`+*9P@^NgyXjlpepHNzd`jsE( z5`N7Akb3sE6UAxH(<^XCIx}5P>Xty`$41U<~kd#aiF zi2JE<`l4!)q0apGsBp(z4XSLZW65t6A%(&K2Z;?ewC@)I5BPZd!ztdqLkeGdX|r(( z)UF6#AME|rXD7#?qp{DQ5m?b8P_;n1~BVYG-*j(TJo2Fx7hF^+q4@`r!u`wqV;o)Y)7OrGu`9c= 
zNkpYJdKZEKhW`Fpu;}V_eQ4)Vw%ZHTB@~a(e0T|daMprX%s9Tg3j(jZfr|HgWkW(j z!FC=L^YMOG_O>b_y|Y~tbTeM1MPQaff@KLe|%Cr)6PHQ(kUfC6BvmUmd! zs6%K~czi?4_=#m79iHL)OoLJ`KS(K>-`x$b3V;yKyHgL9NpREgf_kT1f>T918E2d9Kr!JspGLnlKr_N1(xp-pe z1-^XrxA5xE?P)_)B(I(4um0eh|M@W(`fZif#|?Pf!HWHx1WFo*1#S4T%Kz#AQr{xs z!bf@_N!4Hb954YNpkRu89lSU7&yV4+n&B7CJG1_evBX_CXp#cwhW`)Ea}0hl#%=Py z?=;aKh*fPGUq0Ub_mzq0Q}Bzw;Mw0F#jqTr=Qw}h|8d_S)xj_7!he#s|9ZP32q(%X z;eKHG|9&`o_=Wu=fHeQUMqq@p9oX1+pS;odZxR0d1H3f+;{Ur1KQ6}qOK-!KZTjAITJ64Ab_3pMJ>f@xFRM%-1EE3W5%@o-?9TpEsi+it;^ ziST2kpVuNh)%_V{Aidr1QhKEhIOAH`twX;4zt@(MH@=fLR0-jy5J;aZ5k{b|>2pvhV}Higna1l;b6=$d^R3{RRUFSeW9{-$|; z^)*28XNQ=M2g_7<$^$!Y)(J7T28u~Ar9UcT9lCSQIim6Q#~aH+ZyvA z=lmBg-=)L~+dol_U3>xPSH4UFXbfAy)KeZ>V1Kd%pkMgw)^8x;w}rUAl3B6V zB{qtF@#`P~zf<7Ou7k{E?CcHz8o@L4Xi)s0C_(VSAf79T|4ZY8Ywm;~ll8)z@`Q;l zw?oC=UZCnZ8>_9Ss&>g3(bFu~-5a1Cw46-bEA zCj3h~>3%<5t~?c%7D%CJzo*UXYe-JxJwgupK)sU<|ROV)N;@7n<*(&rt)_YSQ>);6wIYKeGSE>qgartUB*B@5@Wzhw&a(z}e0xV=H zvnuE-jUrYLQN$?pi6+lv3CDvWF8uyq?MT7c9$&$4EZf8lKOXzT@ZTR#hGgLE^tA|% znudLuN(Ea`w^ius*%eF?0dGFvsjmlquUjPA&~9IV4FkMhLlIv$(b z)4HeQEvI&0eBJl|vLxVVhq32IGy!|s>C$cviMeCoY1>!O0mh(9Pn z7a#xWyRqqx02gQ)tazAf`+?~S9hXy`>2kXVQ!@=C;eCbjy7~Vj_K6mhJ<=2kAH)^* zsZ2|Yr&EH{!&-{xOSv5ngv5-%gWdhiG0q3GrOU?^SbWT<5zp`?0*iYaq;Rxz3*QK zrMQ_M>H24D&bML)gjf%iVFFwu28~eS zL9=>3Xjtv^Wzb{1*8k~_{0y(`3tYWvGE~$zuYh0a0Y`uWq&73)vkN%9P zVFY##{QZ_tR6-;*umVJyc5hN$nu;-^M=BcQxP^-vooD$GU&r6~gs_My()NZV7jnj{ zbq6K$VGZicUb*w<58zMt-yLPgm7QEDlc)Nt$nhmS!)6N(xU-4mPs&*YmP#PUI zXQcb1k!IWbdzDw!rk+s)%Ig>K>P6IdH4^r@1P)jkw+Ql4f4_*hSsKOHam1P|kN?H( z%A(~fitc>{d(%K803X|{bQYoY4-1}-`1_vb+MvIJNFH`n9T`o><`I%n`wVFNApVxH zq)n5MRfgr|%j)xQ-j}GKYs}3Z+BYPseI;*vju*rIDnNyLNAjPz^W)lFq=9g$hV)vS zqgZcFZ4NY8Y`BJoftQ!@RrR}|A}^4NcvbE~y0s#HKtA{A-+3GMrWHAjZ>Gx@6tz=vPD=Enc~4j}g;QI>?5RzVa5p}0iJdhK0pTfTTBsaP{_qU9|G**tg3Q-WTbn-oS(-;B zkOs@w>E1l`ig<)M13uc9MKi;SDM*FvX8JKjM05cz0QMXWgWvlXJ3QB*D=M{H;&tKg zHDwl$R#cpRb?2mNYN@$!y$CqqZf~#lOf6nR0@=osRu)M!Gzka-$QgA_EI6J|4fuM` zq2KH{28Axu?z2EA-gSBw8t!y*cv*IaS*f>0=6SVXE&zbxv$UshT=JVfO0fR^R?#ygm9AAx=VVj2$A*~cjsN$IrM zHTgpT`a?KV)Fv75pB2eJ5}6D3obIrt8=+%)?(4_%fS?rW;Vf<~mHoLmUT8Q`rXa(6 zR^y&w-99PU1@|fIuGmc<;oIsH!e)g6@0U=wL_l9W0*(rqs=nKH41vBw3Ui$s1Hp&Y zbYt^##YH~o1$6>1S%1sNfr`WZB7)w?iQCWYNXUidk)Vo5W1!2*!4bzP=LGn}QT8`q zW}n9xVGyS-n{mX=^k=|dqnaMb2;Y*Rx$366b!RMb5JI|-bpFxprifx`>QvQ!+LW7Q zfL0cPJ<(@DZ1oE$0})NXFr*U#W1`0xM?=7(L3Am`J#HiR2I2eR4;2!>|LyTwYhZhaKBV!G-AKj zCoqZ_?ROm!a_AiapY1j8?)T8JFPuWxE9F1bQLp+hu{=u)gcKRXr|qdaEGqMx(~8I2 zMALZH&Uh+5p`rR?ENGUiPdt7aaTk{he!O|Ew{)bL)zsw}rXO}UFmb1V-V!s36a#>Z zF`P67`;IL}j#tY4kiZJo=Wbvq2>4no%TFtRmft&lbf+ITW+kZOS!*U@lF&9OKZy|L^)3i@HTiS}s>6_U; z>F7fwaKJLWYJxcf;-HTo+^1^6AdCOp^figcPK0JKt)2jvxf28BeCHW@&W?hfjHeF} zGqn&{P>n}BQumVrL4dS{iQjYYT{gnoUuOYMx4#xWOv^`VPAqL)+%aEu|(;sUWe{YZvn^RyliF7CF@AI zVUoR}R?={TQDubUoM_RZiRy;ECuWSKAQxs4piutjVG}Zl&r)BR1r-$*fBUZXw~OWN zm7y)z_Jv#7g{*?{6QBw9Ebf)(^}Ab`A2Ji&15&VNE-S!9%n%^6}!0K z_WN6yT>#nq3E0$SrA`N5vLM3#&=s#gfe+c(l7tbkAvgp_sH8N3SZYNhIr7fx*ltCA zvt!iFV}FV+_?(0cn*C0?Aq2PB+)hm+b~D~r-22Jew<7<#1mb3rBtYhi!B^X|(Aa(G z*3p~oX7e8!KfgE#4%|~9@;N-uYcJ=H6lE`&{ePMD{ERPnY7($F5dKYhGX+X*2nrg+ z0Dt}Sfa+Z3p5z^{Vuzl*y?+4z79C5IGB4F-`uo5YFCa`PDH6&+Jqln(FI_Z0sLJS`Koida4re|H23+{?4Kk$#Mb(E2%_yjkb7}+ z17dpl^p&I*>}=oyYyH*p9NRvT@4cdBGbA4F0mp%@;=a|0=lqW-87(2}JJ|`iA`(j{ zE+3GdJhOan#}}t-VH}VtS-@*9XI9^}LaOTGrOJBCzg|qd$BYEVq|wua-PBmT+shhv z@B)Gwbt2rS?t(nzRU+?H?xp?xfW$n)?yr!gAIRhZ0IvK0k97b(3Jq+lb(nTh4ODvT zhuu8FMZqC9HHRP>MGRyAUcT(Y-XQoo0sOt^z2P~vTVO%FfITj$|28-kA9{pL{P@BjrGdy@8n8G9IF2TqT$GffKy%x%zo-XH;o$CbrE9z6 z7x8%L9hg~V3ngC1$i8wOZymPuTjQ?9MiPH~5%Ac~9IXeA4}-N?aI!njE{#hZD1W^) 
z(LpJvg>2CLoS&cKSKQp{3jtN9mpfk_BwSa5S~#fx7b9A24Bg@$VH3^W>P*~CY54)K>< z2S|Qzs?@_*!B18TJ_f`PgE-E4X!i3kFG!%)*ixkrA+()u%Tr>9Zn?Cvd%H2LD| zcc6vvKfV%gVP;>H1ww9l+pSk95#xPJ_QKi<5Z_c1^Pc6}3*t4<2V{#`m<>;(!3V*k zFF%Tz7T1s1l8)!^VvGB8SI8$Z5bVA{vtil#-JSR+FArz%G#g2Vb?nhkY)Y=U7DEig z$V!u&$uGowDPweKm~p6|UjodI$$1bApuKlJVKwVoxsiUNp#4o-8~Nb=EdMw=%xe+H z26hmxKeINnyGpl%LG;h%Skocmx?#f2r@bc?dQID~@~9opr7y+epC7Y-$Bz0+V{INW38c-%jv5Kt*@y=doz>Y^AcwBq{6TF)OMRLJBywWF<5Wd?y zDKIf?^7DKmRY1KW8g+P6O^-9-7F?dSJ%~y1^I|Ky1|qRdI{}HJ8(E^CcQIpJJQA_( z$8~>irk|^s+<~!Z?jcI3K$(bC^rzoj%a}&2rs|=Sm_?WyuWNh~FXZKI4;lM9S;2ibnKzjewa}Yg8RxRt&_G#vftMH$d0A93FMilxv&cWC z;z@y8`@pjs-7y4NETV(wQS#(e^>L96&ohrCSYr(n@Y_H6mHpnQzwX*R4U+BvJ-1OH zdJV^JjD+%2NNcjk}%d2~^Id2$SafK~j*aDM$a)CtM$|wyWEr=r>4RBQjP;?Z* z0%dzQi!}b7xQ<2>scF2Qz1*aHrV)B{96w=qA(>Us70py-nOnbOe)trr&hpi3z|7 zH{gGB*H~^E!?oCd^?ckRjFZqp=DSDRBnNmk&iVB&J_SI&eH1*1`@udTjBe{>`(=-b z_RH(vzk(|De86+?3@z#fl_CNDhk7;}{^Tl-D)xmhkK#$nJu}{XEz=V%em@LJ9W7 zs>jOg&!tOXp&99ox}RmNu9iq;4{cl5tW){`x=(L*W+-%|NQ6jbrqzfr8te;agCPZK`DQ%REn=d>t%ir9g8Z}rW{WASP`sS7AP0-R&K?C^eiaV8s_N+6E2cn% z)hX_~hFys#fSE+VlT}!UuAS-HQ{QA9yPkTyfUy`yTj0SD@Y@K|m5?Vih0!I0Rlru2 zuzHk52tZ%SXg6Gcsjcs;+7qy1vgWZxkAB{QMqzR-!#QuzrjCLLcX|z&^if=V*FS(u z!8b3dS#NbL9FR=K#g-V0NHU0uW0kaTS`_p!rfUQ$35C$fG$byXUR593gPI{Sn`ptX zD=1a<3wywq&=pm%o1B12jJ)kRlynvHjPF6f-pHOuW@Q<~U%dDIVLXKfT7GYec<)_b zSC+NRg{Sh+-{`X{-Vbi>-Ecy8c?_ncw}0$?Puspa zLB#J{Zcy+%%sxxs%mjn*@JFyS+c>OsegzaKK_br#2*bA&D0RN(2eI#aR$A6tI6#NE zWn=DveTQl@C1W%tXOi{$Q`gOnb@gx+`E3P|{aFKP|5S5GIdA=z!Yz8QidxMnF5aI! zKH-RWYKD2`+-m<>W$OVqCUjKV&X==Z>eCb@j?$0POs@f&)cqCsedD}X85cNk z@DcacArOg@y2K7juRTa?NxSx~Li>L5<`pP%_Yg+%Z`2@%gPtIfz;W+fme92-sQ+ET z_N2GYVv?oyp6vvSWw@j9TBp<2nNsR08Nl!qaRz5P3Qx(sb) zmzmFab|e&CRC$e0)^kvn;{!yFIA1eCclx&rg6Bb&aLvzAvmyVQJn&kfZWC2x^0>f% z0+MxtKA}Oz=h5?Sd$j=h95^MdYE#Z(}@f+(u!Ms3= znk$Oqt%q=?ei|pq^U$}z=(0||p{6RRTjamI5-;Ww{*WlYiw&j4BHKcRxpaNmM3<7W z_Z!mZ4E!`riry9|GoCp43d*=`}cejEw2aQ#rDP0Y*Rwzx6hMOqwX+m zk%a}CVy-i4F0D8W1Me-Kr|%i;982Ti1nNAqD{!(pS|0BL~Qm-_tk3F#~d6yYAf%^waI~3aa-8PyvR%f$L}$bmeY*hf)>>{K8&lz zuiP?}b)@W()W$b+u)~P4J#DMrbOomn>2>M(8?a@Fr0j`?0d~{S+Wij1DRZ6& z^_(%EKMud@j|JvdTp5GbgQ-?Dn2$W_zB{C#%!2W3_9h$08>BX`G~aeX#)bt`(H(oa zlY;&xZ3orCYkLOo?A$+U%*etTWUkA86T?&o3`GhmOrF(NpEAaEL&7=Cg{+;QZ80p` zL@Nfgb4l%#C(8qstg#Tbj3E1WLVP1fqP%!a4`Ci%on((xS^fPFr)rmRs4j{t9Qe!JNCJ@ zn!YKKZF?9dm{wtTJ6a`DaA5XlVujyD0;H*%gJ9pmuM#1CV)R=p7OSGi81rRp_4E6x z7?F7T#??xr$^#jZ3X`}hw2bWLkoG2e5%pj+z~jrxRaK^~YP>9S-8vhUn)nuyzX<1*Qd?M_%L zc46L8(-38L53exJ!Y$e(lVQk!Jx&&s38+$Jwn$*S4=EuUCf%L;nJJDip+ku}^K1<0 z>Pt>)*KS>f7MBa0hAepJ+S&mlR6@tp8KgWiBxm0e=e-69VGAp!p2tuWP|0X(zRA<>&r5?YGQ;&hyj<`m`bRDv z5JS z%Td(1$MWfP-}Y=iWn8sQA9_c7>99}2d>iEfUO#k;2AnBbPx|8Z^qfX(TR_Gpf_K%Eq>-i zP8kYR2|1Nm+pi~yTlma3@xJ>;)9}PSx!L%o;gp+?ghr^)>B_fqMHORb=og)k%<;QT z>6V^_rd$fwJ%hgYp#eFK#uYqWxpX9l*R?7%v&`~FH&h9i(0~etEKmc@kopz{FK^{| z3#HH*YJ|0p5GbDJJY@`mitQ$9RHh{fuse_@9<+%LblnrHgrSt#UxX{hb>Dorf?lSz z{?2u}A~Fc0Mm4SKL1&t$Mx+n3mVHQ`RBRyn1R24u*UVNqeemCU2A@Ll{>T_NZV4w7 z4sd|$P16Qlb8VS{)iG@hz_(w<>RJy-6`eavL1dtYdEsJctlJ{!H}w7F#8_V0m{ z;s$sQsM>pc!^yj}^3Z7MW|Ve48XY|vLSai!Vi&;^^%FJwgt2{|$E8k*q1&okk_t-%CaVTRA)lnkTh~X;KxI#*CjQnrhnl3*R z;;=6Kw>BhH(u`xjEGX_`#*wBmh>wlVfSycV^{*7@Cw2nWlnz{N3He^`!tWTQ1c2IO z3>{&%E3|E9uqrRAV(;GV2g*GMD+NYO-n`D{ z4P6XlNS-Z23m&4FqPnxy-la!0!|@XH=neEvlahy6>v!9jIAz`6A7arc>|GGyEF|d!phW^ z-ZW&0@9aNZ=Y1r9OEdL#ou(?L|6&? 
zh*av?`1p^hI}T2EWg4tPEp%kGOOkKr1;!XQiKdrVt08Hez(rA%_`R6QH&@Q({J!k@ z-M97StI$Dh0o2)t&<^gh%U*+qC~E=79?a*FArEMWVldmZ#v$)!`9z0PPE3fW*b4{Y zfHHhG;|%$y>dW-U8G-c@b{8W-271^fP4CFgCR?WS-=Qbn6N_iPOai>Ek5}*nP6PL9 zi)vWZ310yCg*3e#ihks-{{dA53Zz&g?&-%R>5qVJxcUTZNZN@Y#-U{c10v?gX`2EV zHQk=SuMLVjE)*&vk_m$RaSVnxDPaahqqN+s=6E8ZIQTB@=M5Nqd=g5ObpCwy^3(+k zKp;*as-9t&q>Pm8ygm9>F*HdhKvVUyq?J4x-KW^Dtf8$>aKV&HdM+GnG1O)DF#Bhp z3DAs;aX+#+Q@b{J_Vdtg^pe4dkg?TeA52y+CcElh&$s~x@i4`wHK-B;#kRVZ9+E~V zC(EUKGIJ)8ifWPc?bF9$p5YBBy~DTGssenW`;OuuMFWrHXX0CwaB(G-=k=g(OPrd6 zx~@&CLdmB0V9n~?Mn#41Xq(_P7}Wg_OhjNalU$uJ+BrimWvdGUF{?T3un1hju9mZt zuoxW9QJDCh(76FsOx)Frp-?i>a!9qFB2nem#$7z!1JK1lJ3I{X;Z<=@yRq|{GY0-)itK>uOrqKgO|yLEZMl}36gE$;zdkIK!`_?ClW|+mjjd~rG)hL_ zFaG+yg1=q|a*H2$?}`lS%#7`uibMA}*4Mx>5tm8y?d`&B1THSJnb1OWBqW8zNe$WV zHewUdr;c}W*_6K{%8s+7U>9~q@*aU(fsp{xJ6|2Bip9g*DBrHVW*x?l7Y{Oh|1`4< z%5CSR1we~g(-4ie*cs=!w37V)q7TFy$yCqSJg1D57v~4uVBR^HS&rZHDtYt~P6qO! ztHt{;8v@J>wpE8=oD4u4kEan`b+bVO_}0bqYjCS&BZIOiqV+@xV}zn$*t>6l{y6EQ zF6{NPMxsG7^&^%ITS)AnY~aKjfnf^vu}vpy$OyczlVFUsVkI|;Ydi%#6ChIRK0)ol zgwxlFd9PYWR`Ij5{y{@YdMqRR=>5Bob%!y2$HSt5h7j}jh1{uqXQkpVRoF^3Bb{yc#qGgK%rO?FRZWj zDrE-Ud`XjXZRxFBWa7~5QAScSP zXQ9HPpdIROz8dCZ-s(tW{x~yM$~%TQ)0{5njEEdMX*A^reHLO6^}>ojKK(J^KKB(~ z7F(4F(X@v?**8zGt-<{3KD@D6ux_1hd~nc~>UO>-L-4L@?gQIsa#aWmO-RWC zok@rhEvI5`8WvZ2li8af7~6IsRM7GgNf)1{pumi~c_UH!NXJ&z@Zp0;VC)%wk6I?y z9S#|UTNX;gy5LPpFpf3KQYgPjEz@b9xC+XS)D!`oz_ zDY0I(34Ob5_4e1m68@GD*vyN$#kmG_ko$zK;YzcSiU1QtfG@*SZ{%`&;RTpY#G_RADGIL+j!NSd+{gI3=#9Y^(kY zO>>x=O!axeikaLo*nkL%1liDeudv`R-E}NItfdN(?$3cngsCamrfvv(o5>`){*x94 ztlLxlo?Zb^)3(x4yYJqjmc8GIFIp;ED4rsLNhLfsK=M+C;r0b%t~X;)7VNKZ&?uX3 zY7-aRj`1@%iiAW+rDDjy9d5OQMO5z2Zg#;A19R6Cc8s%{s$a53O;BeN*O)PZ7eZur zSx?fejKn1bT$t!clQ~e%OC!{%!m|#D?;-7PVA!`Lxdv;&{2+85BF#AO+g&~WVo)$Y zzq&ZlQjUM}Y0kx!g;6<<_yU_3_t^Gs*OpWOI8aAUkg7Vw>1aIj2xLj)4eLos+gPYS zTtA@#wyx?tIpAlZGM}YSH+v+{C&9jfwd71A35i9pjEfz!}R=4 z9m~r6J$L&|^9iAmQ)tLAcg1{xuGEo`b)eOzNLtOpvHKz+?}=;I{IEw)E@~rHE=gpG z%2{p$3pXa@Nkl$V+-4YnTM(yG0?ci=a2>49yh65)Zi$~7Z&LosE)t~?(qx|GQ#yxw zQ7e2Fg=R72F=xRsc)_xL4;7p@`At$Dv0Qkw>SOy#9}dkvsZfij>5(FoQN%3L@4t3k z2&X_|5j6?|kBK^a&G)6%-o<`^+%^Zf2hok4qYw+54(}-eI^Oe{O4&w+k~6Bm^Y;p&f62Fg8L^W^To`X6g76}#@5th3LvqJVVFghT01o( zuiJ6JsUzerK#pq{$GsDhTskEG9nz=FYRyoX0t7tvtK+eDwMT2fl!i%{X7zsV^L~4N zM-UPxjpxgnsD=AJcee_cM?Pjl$qA*hwWY-3cW<+M)G31Mr!}iOR={^bV;fDnSnl?2 zi)a-rVEOVLhwOPuZK{I3)xNN_2LTiBi>wG~J_LHj=O9Z`3FiV!Vn#Z<8n+`dVi-O_ zPUr}J1F=A58jz|6SkmP3x76^*dQh&>jfh3v@n#>$p(1Nwz-fb9*VU7;3+$(;LVP>}%4nqiL+C-EuL5 zz`_G*8rD~v8`yg%Zy0Cj`1aH9@myj)XcMu?{Jp~X*V2Go1S>{lLFwlKmx4*FAd3^} zeG3;ka=eQ>EWo@VkJ?yl=()e(RjJ-$)A1$lkUz&P@7TMzCdi~}Zb({}w)Ukx65X5X z9sO9={S%u_Kr+HNl+)XU4P$&34F=CvyCS6W_dLy38_3e}CI-GSx-5BWy0jR;qq6veM?S{7`~jM|IABQ*F_WElb95eDBt^e|@U;>VqzqI|BZc zDmhcm3VRU#YWJhanT7VnSmLpVAf^0ZH*DWJ3i-nxXCm%G`}ix)3%&P3uc>XLFWQUe zmc78?`8G>^-?ks;s~$fX{rA|vUEW1$7*qBQEpX48U$ z5S|<%@coPD<2*b<=T+&bwD;$~{`IloF|<BaF}AM_F{k-jIXaIL%x%- zTV>0)Zuu28KL<@XoY1_ETOB(p$&HZu%36>s-v)UfBr-k5O2=C1t#ghb;AmxOjlYOX+hx%J8v^AKjxWROGM zPe-5TlAu5|%PpXMEgh1x`odR0InGkuY!zdP*_%6WSxE%Y9pc9BZhDN#D6Es zcr=#~xt97(_7$ZQSFV-pSD{7iCd8t}$Tv(9zW_q4%&F`A_+1Xg@;?k zH7^<>okkLe$ae)j3@MF;`MKFQHwKR9JeIIv3>nXRRKE!uyN0ca@4 z70MTXqQbw@GeQqm>QKER^z&G7b!ruE=bet!1GCk?#OegU7@mvtM9gsbzuN0Z@^2V{ z1QAN{>Klvvx4lQ3IvvX12O;?QVUx^Ve>hG*(K<&g?gm%p+2B1ik?%18b%Z@QK2Ty6 z=CBQ(I?dJK{Qr4siX)dx_TRLqaqIPKpYdy<5wFp1$@sQG+lCF&3giAoO!*aR^W|aD zHlO^`ae2Ez!wwOLJ*A&;*uUXu~!r3C||{j`hE|E~OZN z^PT|}#b$4beG44riKmWjGy==);?taSz)t&(T*5aJdmVws0F3&kuNT@`49R3!olNB|Fz8jdK5g6Ya8)S6U=9cCdikIN=~fjIjEAtUa5%p>_L%Fz{sSus@W_o!h0iws!08rQ 
zWS}6x5G*J$-mpz2R*rVO#vdt4ev{~e*^`4l{gYPp=l}7}7#FuhIR{(m9bDR}%&UdL zte^%Hccyr*vzNT`$9Ld5wZE^%FP)wWIji+8Xn%cPkS1xi5i3F>18@nH zu4M?FAb!xG>AK;P44}BQSuPnFpa_X|3MNq}B$Z#Ac3*9{a(YrPSE+oQF3??Ov*6*ek z0E#0CXDwv@=X&C0fwTq@=s=N~wjv91uUP2SZM%R1D7AF_z&}?|5B~@=UMBmf)xJI# zA~=u~zXLJdzS#76Knh%P{%p?Ed;jMFp=9Xz?}CE~xRasd?`2V#NJZOP&ms2h)*`t( zc3%2_T0;~hP$wYJI`loBl6OkZ@}n|%>Ynu7+qp-7WcaT6(|`6G0h0wcYb5{D3LN@T zu2)w$G?>q7Eu_4Y69mS{FEf1Di-~56}4Cfp|gPK z>|j;FQd+IrL`{RlXw$ylZw8wCM>`6;CMh~y#|7Pv3;UXv|GRsMH?X;SL@_iCHc3T- zik3|U5Oq-3WjAXU%>*E<#5bW<5B}#NbiX#d!B$BNc3Zp|7+k`pI(iL#X!bM`f6j;Z zYyER?ai|vsH?n-%5xFQ^nfVB zhMgs2!k4|nEdQ<+9O)ez4V>WqrR4;<8!OpwN@xE~Otlp(_GJ?Gj_3c$hJ=ihJT zNIwidb>-#=A|1Re!_2GS=o}=hx?P%{0B~dgK@k6p{D2cOGYxwjYei z-g{q@%L)w?*IJfIBC+Lu`iUhN^+lfk<4I_?j2pjW@UDUW~Bu3WEZ0 zCyYFTaCmw9f%t-k_MgX$o*-T~MBJO@A}!ZLATA-Y$Y$j-4pzcQD4;=NsIDR!=2k`2 zmo&e$V^_U^w>{``ZI zw6@z3nB#zTYy!9eU3-^{pVf=*4+CoqRLL;lm%krXxKcM>E&KQ+1;b&?YeBLd6l?l} zv=)4C2kdsE70_yg55Av%avPU0bsQm+WZeL_c)Klo$q@51 z`k6WceFpfmook=#BLqk8*d=FbxR|5^uRgu znH1lFzNt%A8GU5ZzRkjQj{7FEChPV(K8|KbgjfK!zAm%Aee@C;(x*zu?Lh;vBj)q8 z4--owKZ#mw_aPS4CNnaaJd2x{-?sQxw%?iAJArtK(WRqssx;8pDj6F|Fk(r(qjS>a z-Y0B>s!aJOqj~9a$Ni#0iL;nu>#|j-Lf|6v;DWS3i~NI%tYt+rViR*s1`gGEXiNm3 zOinIVSM);GeycjFq2Bn4;!W6r85KGQS|SJlkXI=TS02#~Dn3xwCV~f%bl-tjRk*}5 zZK)J>i%F1#I6#9EQG5G;v`_HcGmIlOvtp}cu&TyE7yGXxpLivWa8>;*5V}K4gl`@_ zdxE9vhOO;wG6{Hay?`?%_fxo^8bC^__!*2ks zg_q0`orWeVxCTpbY0rIPvF#h=-Vdl6aK_>J*MQj#jD#z#j74QDz(&Ad@U*J_>zUwL z38fe%uh)Zcq60MhyvRHxy&XF7YTyV_6`=g$VAYH3J~V=kE|{(>I}ZZ$0X*@HBHN+a zIuEmx1mj9Bt+1HQJz80^`d}+YboSQZycI8ZWT#srOSm1w*&N~qUxSvx>@|2vf9!}D zRI^Z7)PRC)nsFRxst}$Sc$D;vD=%(dXuSW->T_t?GQ^}!plx=tnTUR!8?wwLkf7%( z1uznvDC!%btpk{RgOT?$mQ(*6zj!PL2!FN5pG#se^x@v;TCq4>qUKC~vUlv(Q06Ue z^lE43T646toY3g6<6W8l;2TR1l=lm8$~!>Ul(^-}b=|g0*WbOU{O4&27gyE(6^3E!d(X!ZQdk;dAbg;PQ3@ZVRP3hzh*x z(@F>y*l|OnlH>dTy}2zzjv?eadk9S_x$mGUZ>8qFgg{vB#LN`cce}5o= z!9dIA9!RT!o_u0t4N?Z6rau5LXN2JaXz?oeGsI#UvVA|bY~Nqx4OG+NbMMO_zh{)u z_eGlD6%O~COfUJ)4azAMKuwwiEH6M&fi5r)5w1oQF!FtA(2llg3_4pX=-If(^KD~x zHIfESL=oRXmP0R*tLsmb%V<_^uR5zZDMB%Ib=0|yiBp!StEcl?9pNO2-qkI?JIO6y z}8 zr6g+{4k0lD1Lc|$$MqZ8XSm<%?jlqHp@|QLTBWkl_vTpt5<<=NvV++akFNt!hHZ=w zlC?sni0D5(QKGMfi24+g{E!m^prD`-Svgr;QXcK_4N^7YDb&#gvq)W9O{C`&@p^y$ zDVOw*8nsT3X%+SAsKK$|_VkIF(ik}0y_pM(+v!Pg>c!D3QnWmfE6+ZOMh^+A>figFeZ2*Gpp-$SlS(sa&C!HM9rf z0A#CqT{qnO;NLp!1T-JUGFI`!6%RQJGQ%6_IG0$J=x;z;u=3)`A#_9PTfa5R6UX^A zgH#?$+#1xY{|Z>)5R}%v0Jn68kz1XNu0gm{PUs#u}0J5(%>? zr(;>4QpL^D3QB7?VYp+0es845n+DZc3pRah+$@`pA3terR+VR`)y!X%KeR59kK3VS zjyl0lP**oDhqu~ByT9c_aKwUMtCvr`>T*-=py~!N$tQUE%-?54 zw4gr$xK*3s=Ks>qFt|(bjE8l(eUz_WV0-^a15ltEXY`v_yCh~u_sKLnWFM8>pz~xk zs@qFJojQ26l0D+%mijw|KJPq;HK(Nc_r{oitb!118S?C_o)mj=>~>^dP~}Os(W5R{ z#*&El1Ay_VB2yJHw~(pG;kLG_lEY9NyC)Y*UVV$jwrVrSM`T#)6Z?tiETL}N$>aUU zYFJveKO!pNp(hqnakH14D-G7ZV0jgvIi(#j;Ej~GOr-EX8F{HJ0Y829Yp=`E`jW=R z-3l7V52fp&!RSU#O%bF83@_9CDB~!h#W2xqL7sb)*?0|Sgx=~GpjdkFj`R#zPdv$@ z-JE&(;&+b>9y?BsjK@4Ypd(ph=Ub&X z8&n+DpQoFEbBtHq$DctbaQcxjOL-^rGmRWouZ{ZlfjdPH%u4Qr6XxICKgIUsTtcO< z)P~&3lf&z$2M#@vf3xetbrDzdzBMpkg|t~aAQ+~6+{#aA!hJPm6Q&^T2q~bjD=5l# zc8PAIU}OY3z?1D6<{z4$3BI`5pzMtLbXpl&}q}QYU}lA4kre7A2+Hjb@jSv>&jr?3#F+kWtM-p$AEHwnE4{^E$FyP zk1XfxyRfF40F$(`+F-s+lz>@iqXfOjLf)&a7U)7>k*OH07;mZ^4_s? 
zKa|fGy z{K>&x0>Nx4&~`d_h_yUvwRf023MS()e1dXZWraDWs&{a{igWvZIe%<=Hud=!!Wa$b zcHucX&u4473ws|jB}QqIlUuIp=*8`(hyYpg=!+-28D@Zxj@avoBx>Y^5Dd`=->9o& z$<7TtOVjc-7V6nfPr)$^zs=Q|Q2`n{s%U*S zE_p2d`*$iQfnMdqHGe|NM?Oj<;5=ITOx%>y)gA!ey z`JGF%faT+yn(^-SF@no-XtF7Gf*+{;v5Kn(Ucz*g8^%AOwgTMEvc!^~gLO@cuM}xMsXgQYuAN9h2bRNQbQ<#qwmKPdXOG!wl>KV`1KytABkcsUY z$79sT=vMCWtCA0Y0t7*$RD8ygXx|WTLh-w1&v)j|XSaVIis1izD6XTtbTm#1XYU<_ zXXeW#lCFf-$~{`I@AfsU_awAhc>=hP_dLyXcET6WHh7(VE=e}4Tg~v`h%$pUI%~dW z=7-^YmP?|sZv41U=o|Rs_J~I|3s}tMHvmJ`{?fAk-?enT2(z_{WiSZAF9^(Xu+_eE zEaW$BD_!fM_Gu-HdrJLGh?PrbH-1gmCF2(Lf z671}Q``1k0NuEKP?unuJGmk*^`niRft~_9UW4a~wOROq`Q?AH@92(St!rw*#zmd4w zq5bvI(v8bQ@8RUw__p@Q``;buA-^Pe2#1WRgLLw++tLUiqDss|vwIvC4uU#AXgXd2 zd#CTHmNI?>#?P0LYj7i-`*81`raeHrM&CJJ`5EYWYrE0uK=aw^b4nEX;3PJ5>v|2S ze4!WwmW(nS;lRf_grd4P8+wqNG7w@&xQr+mze8)8GI0Eb2)dkRX|z&MVFh%t#+y*) z*n!bYUA3PRlBeBXb00Vmpi{fw2o1wA#EwU!`qfF~T4gYZOB56_5;E2+tcN&+CNTlC zbsKcdo}2NChb+9CR9Mf;1ADhjegU>(bw=2Q&+YSg}tMEFTz^ zC(WLRn634q(y(4JTM8oEc>VG-L$YZlw@V!kZjBI1fHQS0Vc# z>V$ST{b%s;-c3qYhdn!CA4>h2@WTUW0t=r49M%5-VBBQ0RfOe%h+LI%cGvw1z@@n1 z0L+{EfMHJJukn7?tXc%S1YD(Kn8gqh>f`A}vpAj;3ia}1;A9580Q3!% zy7_PC0v*#;F0VLWEF%rOrquRaRLNu9lV)8ATyGA*pdzAS;dnhV>4`9=wWn!2J`J_d zr2wLGt$GOU@;Mmsdv?=Q^Fgyb_~C*ZJ`;r0DXBc6fyIuDpN84^dUt68 zE-YD}tiAJS{ddc+D=>}0WJb298)1?XHeYwgLfhJN0VhaQdV~@WI4s)xMbIi6c=7<{ zT@b#p&y=SB1t6X7=1xye!wjOekc1~1MmSvR&!8Z_LB_D5)eBHL?ZXXlPD(1Z2sh;K z6sTE&W>2~;XACjbe{g5m#z=%C+{9KHjV3_doF3TLpypZ^-_)B zW5Z+~ibe2UqI^zync4Km@<5DLop_4ANg;Ib0G2pIv_^I#NnP2e9DBY<+4oHu$wBcv zA?bUGK@fHB@dSuepa-tF4Q@@Z;oNc)Htm6=oLKtCJncPB4|I$>UD<<4P+niVd0E9| ziWQ#&#y$l{?adyXBqj0@Dg)Bh_a0A&Y@fiRZaq1w0bRmpWBQZ!I_6IIyDN`8L8oBv zJr|=Auhagh4Dd|*iL9)0HrL-$U=G zOw?!=)2DPEgmo9SQ_-G!3X>s~CNY9Mr?jO`rKzgmjzyiH>6+HMuAKN5wCZG}#VfBX zi~3o3dB`$ct$m};}WB4p2xmYFszVE;M;cZ|gXyMef|QKq!Zfj?(x zlX1R*Jw08O(dBy?94jgW7(a|X1#9FVSXuKG2DhIrz4rRy85<(q#$On)1=Eq0 zi$XKAml^Y@g&%}ybHsz-;#aOUwH{X?#(cY-o#0nhNX{@Tov)vdrTx}kuA5AXj#JI0 z0RJvqicenU8@jDV&9@$;&DM>jN{t&nsr%-{XJ3l;nucMNJ-(6qNvJW=CMv=YbvgpZ zS@P16LR*)r$jkdHUSzfcPVMwkA;7e%epFlJWBzjhLH~^%H?pH$37c|Xow$Xq(ZXM& zJZSML2n1*zrm}-H|zm^aBKj9lW2%qPRO^n`y6k;>~_u|a|dLs zmIOP`xM@*htT`q^xiiS9X+{Abjo9a`U^c8V==~E8%fK@#2~!C2*tfyqiU;;cX;?NM zLaMaonzscTGnB5IR7>hjRD1GQXpyYM3I_3ggDw!`O>3+$6oU1AA>af^{8JRH!OfekWFFY}72 zgG^-nl*UeyFU{TAgcSfj$lF5+K#tqvc=09^A6EySXtpa>Fubtdn5toUv^0XQn zbNV%;#`V5wXK8|%@W?(?7Ez5%QSr!E+K1%)Zp~~qCZ6lJHPvV%;%TO~1&BGSiG$L3 zqfnbF=3;`bCaI@;-_!yfp$7Nzg}@5x{qZv#^B^6xg$xg$HCs2d_nz*Vc;;Wdwqj3Z zS#dedG8)neRquo}$WV^75sKhx(_$Hrr^ZE+Q+RLU7~1a<60HO zyQBugkW8r0F6o({;C;_F1sW+_F>4CSHM23(;wtU$6Khvae7_nXzlVvOzG_kUhZi7- z3Okb}oQ(2vjUKgbFj&799zyh`4+0;E#qCy}B@$7z+K%ch$v zU3q5=--J_J1*ZR2amic?-WB`Yi2A4>t5WnT)nV}Vq+;~Lc1$Jy6ZdW5Iq=o2iD0Z+h#K9}fol*!1~X{QE=(U6}M zI+(KfFo!dXkP{DM^op?OYhDz+HB0s!CraQfYeVOAlloT3M>1p4LW9J18S+eb)Km+D z4q^JwR_Qs<)ZoS8JF#*4KY%(N)$SaFKOV$#Uxby6Q`_+D^{PwNNh%_Bc~kS6VnNp( zbq}}Qn>W5AL$L^8Dw54A4y?VpKov-4l(0FeNubs7+ z4+ZtT?tYwDcEcvG@<6ag?MXm&R@~*7EM=bSOT#OkVi?EF=QbHMi>Q}PVl_e0*Mug>|?~^BGUY!%6g|+ZL;wrUbh=YZn2uXAudaP9~jf0Mz|I0 zv;Q{+Q-nF%bG(CxOY-i?Qk&#+?()EuDZ2#xcY9zi^KoHaR$`nSEu*m5CAlkT8p^fb z_$*R7ldxA?2`dp8OjySD+;!BDdO3DhX-ZYc3!INT~MCi3x528T!Q{1Wl2FK@3h=(Akp`FyjV zGWVt^s@w1E=GCHaL0!@-OS|i>%f_%X$1*en>GnFMCiD`|H^_it6R#4NWPYRk{y<%a zw(u<%;$^Fd(v4 z+M8DOHP~A6Y8`ENMTcN#+Efu1Bm*fw{ z^=Hf%seH;Qy6+|`c5=jJ4gzC>>j5c@Yil7Rh8TrL4{ZsAwlP@SV`5b&`uCv9*d*+F z+0Fe;MrFd6rNFYH_FZ+J{Z$AI*;d{Zh~nifg-hq>1L$i4k8${G>-ZEqO0>a z9z{kp4bWKy{8|r@H4x7EVQ?Y!U48fkkV8Nx(*yNe^KBd&`yfk@2${a*(Fo+L6%|5e%I<`E}O-u`qr4GJE)OH7x6 z_O^%Q;@ZiQBCH~*VPfMk9U$RGaM5YuS3o!kV&Rlz7^$=oY7JB!-c&{ULl2{nf^9cd^K`=&m_v~!l2+|} 
zslh!(Oa-?kw=vcBk9_kV_E-UX|Fo%vqM5|^hM-Hd3gzH&xvijKwCerVd`8a(epXAf0LC8(XLu8QJ*A>x8XI$EP(8~dc2O*G94rtSE46;zH(pI4){l$_pBkSKoy?zcI~t!6>ldS&qi98XA;YurK2uz|pa3moO5Ebuq1hA= zwUUl>PPTpxFL|6mV_~ZGZIZ-rFgSnTTn=48;f@Wk8^{<$asv8~slGI{KZW0{ zOQW%O#G9ZJ7ePjwxg*H&WbL*;LmHcMK&d z($=jqW9kN{6*s^_OB^`8x8|S|Y@W0F01{`DPw-f8H4A;=z0%07afix|*XE^_qO(}f z`(rY5U?-C*er9NHGHCX8-GvQO;q*oP+@zWu4t9%1@_=1u~9kVu*{I2y4fwhKPdUmW5| zi`tpTxZmoDbQ9h>@7h|Uf3qGGHmBnDKWYHK0Xj=g9*Oa`X`}8;|K%V z3Af$RVg}jHq`U!94ibR~^E__E8nEukcAv41s#cIY?Tci$m{9YFKMX0Kzgt&|&)UHd z_grfRbT5PRKsYN3j&K<4e*F%Zs(@ge0V0O6>c})f8`#Cf zbmB3CcjDZ5s?8KB!o4(88`n0Z>4@yVtMJXVS{#w)y`_qzQ# zMi!D3_$G3U|2$MCaC_Po3!NSHrqXG5e4ms7Va;mPJFJERpD;almLIaP%R)2MGIMXI zv+CU}%`I%yzq|lOX;M&u6hSVnQYcMUc~5upt(zOYI@G?Mlh}vRd@{MnACXc95x`oLU z6lgwAmCRMRal`&o&P?B1oyS0aVbk|uXOIlrdg(dp_$*7&m*={p$2;y1a{c=sv#s_& zr8seR3k-yOzGM(?uFP4|^PVo93c6u=c1!L7U)YT&VZ>Mc2_iR8N>q%IbMSOaTl2K# z@F!m?In#D`H*yyQe$<%&leqpaDvxWV#GzRFu3ws+7YTcKGuXGtoZqMTOq^Su;HH`M zskwPY(}>kJWNB7x1bNV9{8Oh9IRk+(D-O?Dy96LQ0nef zl7ad+bP+L(>8He#Zeu4*&B>S*F0oOi3m9kMbI?Du$u_Q%IXGjut$kxJh)%*}pF-mS z8#1spWJhQ2<|eZDz!pDRVDPf%Y;#sh8n?(2n;%)|<<*3{27>Kq=Z9yM+OH1PPz(~j zGYFBN?u`-54V5)0k$+0!l6pMSmn=7gULhwssFOUYvhXfbD1K>=SId6-wk%4R;A*O; zy$`9kdXTKHQd>w8D!*y5O1Er~XUHwEW@sTwu)ki1$8-$Tv%&m2NzNig6btFoX`+I0 zNF&D+A_w4ak~!M7$s=2LdeYN_B2;4}v>AnRv2T93-@lPP# zry_3kCk|NtQZpr&Pma_mQX(EgdXwNEO<}a$=;0ag0p^PzBxWpQtGMO(tpAUGt4JWi zc)zEEitGykM{LhGEYWUR{RgwEb1T_swr`3)j0qWmfg|*>^P}|TA}K6(4ynyr%57_s zY+FAa+HcsA+L4-sr6%(vyS+xgK<7|xNP;WKwczG&L2WgO^`R%Ix=HjX@)|M+wi4&5 zNbf05B-fMy3{|}q%e|3pPo1`b1|Ymnl6>Vl(BpWp_5(|v(K=@d75$qBu(f@22808O zyplwc+fJdkJ`%7*`07v9;m#3MFX?=e9?I@ z>=$C13`wy7^_gM)&VsuiZE3I^sMKm{DWS2|3QvE#&YLt!il2X}_LN@8 z7H)Jn{l2#A#aB=3Wgj%`;p*hvi61w}%gr_4Cw2&}i|n#%+TfQXk`FdM-{LRaaw6o8 z8K&bGEW8q1a`Oq!U-5_0Mu|hHgT9ApF)iZ2soQ_*3>=(q`RwCAn@|UF<#WYIJHKLY z9wT6B+TYvC-A2a}Z&1a`Yt_W#e~?OFEB?fBW`_Iq*JXLub+o6jJxT2Ed{N5s9s>z{ zCgD%CqHXI%!si0qa-&w~z`PY@L(w7JV;O8uyEeU%zB4*9(hDF5x~e<2HIypNaj7>Voe=NWeG6**=fR zg;#@DTP1`CXs7gu&JZk$O?5|v2FZ|D+Zdv0RGW985We%2N>#UDY52Ytp`B}nl|<_s zYTLTK4dtj7rz1~=i}IFY`qq=Gp1CO239vZnSVfN$73yUS@xL=ug;#_2!&z|<@9@PC zJX=pntyE}EP*9Hue%>r-WoT_PUoe`GfWJ(6d(!@_Y$Zi|Ya^pTH}CEGyRp($xfJDVuy#UJ^Jb!mIwOR*mN7&N+L(RA>Alq0KM}7q?NrY6tzA>u-#ew6mNv+% zwFq~e{3DmI4*q9>Oiq3!U-GjuT|C7RCPNZQpEioL9ohhrnz1m6x()4TJUN(4REG9+ z5wB3|Qaz7TsVGpT7xqY|o%U@dB@T2(abYz(tN!+392dF@`bJ#p1zx8#%cbc zC@gH>H_MRIv#40ediewoqV6&LbOCe{T$z}l_YR^*+AcFzd9GCsdY>xlQRj9qCJ-T1 zRX0(8cr}QVkG`P%wnCs?|6ZFw2T zNOWDfo+`s}*Ik>nlZs5bcU2ti_09;SV~$E}Xhk=#9{*0FIy?U$DiYG(H1MBedf_C9FMGyfIN-piHF#rC<2Yy)}~;f^aXoPt1%@8*krgBc#lxs zXUE@5FQz&H zcz_>H*2mDs`)0nw#r=pa`L09tJ?vUdiQ(?%?5gcH??%ygrNYWC0m*B>_D>fy=eAK* z7C}+=Y;9S`@Z|n4u63#_GqaAX+~PmyB@4IGzSi_V#_b8bx?K7|YQgL!?HK}b0$G;e zMbhv!zt3D1z878chS(ji%^3IcpRit{Ri+xV4yw>}@Bt2Mt6?DHmY(v{y>EJpg#pq} z8U!pJ4+$d`!v+^Boh*(9CmA?VRVU1K-}}*ae)dgTk5damU_AC|MvZIVgx`^K<86GJ zXEO#v1Kre7x~cB?YZ**dl?w(m!|q2vtMevl3uoP+JJ9V-)h}&)EqA*6)K~lCBHZW$ z{>F1ACn5ugFzz3Ie@^~=cpw!`=(_7^#d~o|Lo|mtvBw96YIF2dtJ9u$P9?DF(seV` z)TA@Cu%M3H9l3mERsAUx;F2)Tx<$)#2aUmae*O-6^kLFsJ-5~;shNi8L%IslWXdPv zf9Prve6;q|jD)C2D#!hMt(S@hWsgz{aTcs+fk(zI0_ApQfkxCI0X?oPx2bv$hCnZz z%dWIlGP#VbL&n7!(;wqE0X;*-WnyaQ5$}YPm@kJk*K2_Fy3g zAv*P&pGL^Imazmwc-JFC*PR}E_b}Wix%Nl`cp80y)l#41WFloElNif$tCb!)gO8^d z6)9aC`*1!;$|yfvK|~}7-~zreiehsqQ~HBv<1urIT>9oB0{m$YtWYBZVVgkPWF@oMs!*y=P3j*=~Bp24qbHD8|{ z7b!AR{3!WGoOI^4j5fNCj#H|Lf}0uuyr`)uNcq!YnFf`KzA@Ja8}3v@9}G5|%*K=~ z(tI3h;4wWw!Wm+JQQ#c`3JdLO-0tP0?NpC!z<4SgfWSCvaEUfsXSqIf00kL1ppl0ItWcF_4PW!7D zu}mkpa0*GKLJq%lJaHZ)obmlUg(3g`dd{?p^Ql(vU!tHX6JBq+F 
(git binary patch data omitted)
z85G_<^>u#<=l-3(N9r6ab+t?`F1&cCwS5?g^?-h`uOd44tv@u<1@#TE7<(s>^sBASE79v zDeDf1%aq7g+Q`)WDWd3bqrbu)EsNue&&7!@$TW}eh)7gsuP~$<9QvhWbqQ5n?eL;K zYRbl45tj|y-Y$M+a#FW>dt}n4>U%GezOfnh?5rD}^+N8qjl{A1M)^MauC1zQ?YA^0 zy4og2X`{#pSEu{izMb9fyEpKU+0um^g{^At3;(p;I`z!IPdR-)n+xmaB^QdvZ&WTU|r&PZ4n~ay)4KRI?{O6C`RvIs>mQVO&>Z;`4>0ZoY zCZAX?`~U05EAZn}b{EH=cl?+1XL!03yt&{0_2d655AA6DX}HdS!g}9=eyuU^e%aM8 zJvm&>w_ECF@3b6$?ej~TF5TYq&Y^}sb!;7{9{LdL<%LQg>JtC4Y}D1VG}-g6N`IoK zd)jJJg*eKnhx)Y?#=qIPef^7CM!hJue2(-#f4r2U5h`PNgk#DI8Ss@c-+_ax~fhKJxgo{&$t9cC_7pH^$US`R~T~@5cDQ eV3!#Wm;;tBmOrnYo5h6xST40OySl_N^nU;(ZqPgc diff --git a/format/Metadata.md b/format/Metadata.md index fa5f623ac97..3388a7e6cf0 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -46,7 +46,17 @@ table Field { name: string; nullable: bool; type: Type; + // present only if the field is dictionary encoded + // will point to a dictionary provided by a DictionaryBatch message + dictionary: long; + // children apply only to Nested data types like Struct, List and Union children: [Field]; + /// layout of buffers produced for this type (as derived from the Type) + /// does not include children + /// each recordbatch will return instances of those Buffers. + layout: [ VectorLayout ]; + // User-defined metadata + custom_metadata: [ KeyValue ]; } ``` From 2d8e82056afdcf125e6e512f96007389ce79c1c7 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 7 Oct 2016 12:13:58 -0700 Subject: [PATCH 160/210] ARROW-319: Add canonical Arrow Schema json representation Author: Julien Le Dem Closes #158 from julienledem/json and squashes the following commits: 796cc6d [Julien Le Dem] add json documentation f0b2a39 [Julien Le Dem] add sanity checks 7dd6d45 [Julien Le Dem] fix typo 248d3ec [Julien Le Dem] more tests f2bc3fb [Julien Le Dem] ARROW-319: Add canonical Arrow Schema json representation --- format/Metadata.md | 81 +++++++++ .../src/main/codegen/templates/ArrowType.java | 165 ++++++++++++++++-- .../arrow/vector/schema/ArrowVectorType.java | 43 ++++- .../arrow/vector/schema/TypeLayout.java | 11 +- .../arrow/vector/schema/VectorLayout.java | 5 +- .../apache/arrow/vector/types/pojo/Field.java | 23 ++- .../arrow/vector/types/pojo/Schema.java | 90 ++++++++-- .../arrow/vector/types/pojo/TestSchema.java | 119 +++++++++++++ 8 files changed, 501 insertions(+), 36 deletions(-) create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java diff --git a/format/Metadata.md b/format/Metadata.md index 3388a7e6cf0..653a4c73e83 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -63,6 +63,87 @@ table Field { The `type` is the logical type of the field. Nested types, such as List, Struct, and Union, have a sequence of child fields. 
+a JSON representation of the schema is also provided:
+Field:
+```
+{
+  "name" : "name_of_the_field",
+  "nullable" : false,
+  "type" : /* Type */,
+  "children" : [ /* Field */ ],
+  "typeLayout" : {
+    "vectors" : [ /* VectorLayout */ ]
+  }
+}
+```
+VectorLayout:
+```
+{
+  "type" : "DATA|OFFSET|VALIDITY|TYPE",
+  "typeBitWidth" : /* int */
+}
+```
+Type:
+```
+{
+  "name" : "null|struct|list|union|int|floatingpoint|utf8|binary|bool|decimal|date|time|timestamp|interval"
+  // fields as defined in the flatbuff depending on the type name
+}
+```
+Union:
+```
+{
+  "name" : "union",
+  "mode" : "Sparse|Dense",
+  "typeIds" : [ /* integer */ ]
+}
+```
+Int:
+```
+{
+  "name" : "int",
+  "bitWidth" : /* integer */,
+  "isSigned" : /* boolean */
+}
+```
+FloatingPoint:
+```
+{
+  "name" : "floatingpoint",
+  "precision" : "HALF|SINGLE|DOUBLE"
+}
+```
+Decimal:
+```
+{
+  "name" : "decimal",
+  "precision" : /* integer */,
+  "scale" : /* integer */
+}
+```
+Timestamp:
+```
+{
+  "name" : "timestamp",
+  "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND"
+}
+```
+Interval:
+```
+{
+  "name" : "interval",
+  "unit" : "YEAR_MONTH|DAY_TIME"
+}
+```
+Schema:
+```
+{
+  "fields" : [
+    /* Field */
+  ]
+}
+```
+
 ## Record data headers
 
 A record batch is a collection of top-level named, equal length Arrow arrays

diff --git a/java/vector/src/main/codegen/templates/ArrowType.java b/java/vector/src/main/codegen/templates/ArrowType.java
index 30f2c68efe0..4069e6061b6 100644
--- a/java/vector/src/main/codegen/templates/ArrowType.java
+++ b/java/vector/src/main/codegen/templates/ArrowType.java
@@ -16,12 +16,6 @@
  * limitations under the License.
  */
 
-import org.apache.arrow.flatbuf.Field;
-import org.apache.arrow.flatbuf.Type;
-import org.apache.arrow.vector.types.pojo.ArrowType.Int;
-
-import java.util.Objects;
-
 <@pp.dropOutputFile />
 <@pp.changeOutputFile name="/org/apache/arrow/vector/types/pojo/ArrowType.java" />
 <#include "/@includes/license.ftl" />
@@ -31,13 +25,150 @@
 import com.google.flatbuffers.FlatBufferBuilder;
 import org.apache.arrow.flatbuf.Type;
 
+import java.io.IOException;
 import java.util.Objects;
 
+import org.apache.arrow.flatbuf.Precision;
+import org.apache.arrow.flatbuf.UnionMode;
+import org.apache.arrow.flatbuf.TimeUnit;
+import org.apache.arrow.flatbuf.IntervalUnit;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonSubTypes;
+import com.fasterxml.jackson.annotation.JsonTypeInfo;
+import com.fasterxml.jackson.core.JsonGenerator;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.DeserializationContext;
+import com.fasterxml.jackson.databind.JsonDeserializer;
+import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.SerializerProvider;
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+import com.fasterxml.jackson.databind.annotation.JsonSerialize;
+
 /**
  * Arrow types
  **/
+@JsonTypeInfo(
+  use = JsonTypeInfo.Id.NAME,
+  include = JsonTypeInfo.As.PROPERTY,
+  property = "name")
+@JsonSubTypes({
+<#list arrowTypes.types as type>
+  @JsonSubTypes.Type(value = ArrowType.${type.name}.class, name = "${type.name?remove_ending("_")?lower_case}"),
+</#list>
+})
 public abstract class ArrowType {
 
+  private static class FloatingPointPrecisionSerializer extends JsonSerializer<Short> {
+    @Override
+    public void serialize(Short precision,
JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) + throws IOException, JsonProcessingException { + jsonGenerator.writeObject(Precision.name(precision)); + } + } + + private static class FloatingPointPrecisionDeserializer extends JsonDeserializer { + @Override + public Short deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException { + String name = p.getText(); + switch(name) { + case "HALF": + return Precision.HALF; + case "SINGLE": + return Precision.SINGLE; + case "DOUBLE": + return Precision.DOUBLE; + default: + throw new IllegalArgumentException("unknown precision: " + name); + } + } + } + + private static class UnionModeSerializer extends JsonSerializer { + @Override + public void serialize(Short mode, + JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) + throws IOException, JsonProcessingException { + jsonGenerator.writeObject(UnionMode.name(mode)); + } + } + + private static class UnionModeDeserializer extends JsonDeserializer { + @Override + public Short deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException { + String name = p.getText(); + switch(name) { + case "Sparse": + return UnionMode.Sparse; + case "Dense": + return UnionMode.Dense; + default: + throw new IllegalArgumentException("unknown union mode: " + name); + } + } + } + + private static class TimestampUnitSerializer extends JsonSerializer { + @Override + public void serialize(Short unit, + JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) + throws IOException, JsonProcessingException { + jsonGenerator.writeObject(TimeUnit.name(unit)); + } + } + + private static class TimestampUnitDeserializer extends JsonDeserializer { + @Override + public Short deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException { + String name = p.getText(); + switch(name) { + case "SECOND": + return TimeUnit.SECOND; + case "MILLISECOND": + return TimeUnit.MILLISECOND; + case "MICROSECOND": + return TimeUnit.MICROSECOND; + case "NANOSECOND": + return TimeUnit.NANOSECOND; + default: + throw new IllegalArgumentException("unknown time unit: " + name); + } + } + } + + private static class IntervalUnitSerializer extends JsonSerializer { + @Override + public void serialize(Short unit, + JsonGenerator jsonGenerator, + SerializerProvider serializerProvider) + throws IOException, JsonProcessingException { + jsonGenerator.writeObject(IntervalUnit.name(unit)); + } + } + + private static class IntervalUnitDeserializer extends JsonDeserializer { + @Override + public Short deserialize(JsonParser p, DeserializationContext ctxt) throws IOException, JsonProcessingException { + String name = p.getText(); + switch(name) { + case "YEAR_MONTH": + return IntervalUnit.YEAR_MONTH; + case "DAY_TIME": + return IntervalUnit.DAY_TIME; + default: + throw new IllegalArgumentException("unknown interval unit: " + name); + } + } + } + + @JsonIgnore public abstract byte getTypeType(); public abstract int getType(FlatBufferBuilder builder); public abstract T accept(ArrowTypeVisitor visitor); @@ -70,7 +201,12 @@ public static class ${name} extends ArrowType { <#if type.fields?size != 0> - public ${type.name}(<#list type.fields as field>${field.type} ${field.name}<#if field_has_next>, ) { + @JsonCreator + public ${type.name}( + <#list type.fields as field> + <#if field.type == "short"> @JsonDeserialize(using = ${type.name}${field.name?cap_first}Deserializer.class) @JsonProperty("${field.name}") 
${field.type} ${field.name}<#if field_has_next>, + + ) { <#list type.fields as field> this.${field.name} = ${field.name}; @@ -86,20 +222,29 @@ public byte getTypeType() { public int getType(FlatBufferBuilder builder) { <#list type.fields as field> <#if field.type == "String"> - int ${field.name} = builder.createString(this.${field.name}); + int ${field.name} = this.${field.name} == null ? -1 : builder.createString(this.${field.name}); <#if field.type == "int[]"> - int ${field.name} = org.apache.arrow.flatbuf.${type.name}.create${field.name?cap_first}Vector(builder, this.${field.name}); + int ${field.name} = this.${field.name} == null ? -1 : org.apache.arrow.flatbuf.${type.name}.create${field.name?cap_first}Vector(builder, this.${field.name}); org.apache.arrow.flatbuf.${type.name}.start${type.name}(builder); <#list type.fields as field> - org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, ${field.name}); + <#if field.type == "String" || field.type == "int[]"> + if (this.${field.name} != null) { + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, ${field.name}); + } + <#else> + org.apache.arrow.flatbuf.${type.name}.add${field.name?cap_first}(builder, this.${field.name}); + return org.apache.arrow.flatbuf.${type.name}.end${type.name}(builder); } <#list fields as field> + <#if field.type == "short"> + @JsonSerialize(using = ${type.name}${field.name?cap_first}Serializer.class) + public ${field.type} get${field.name?cap_first}() { return ${field.name}; } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java index 9b7fa45bb9a..8fe8e484496 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/ArrowVectorType.java @@ -17,8 +17,15 @@ */ package org.apache.arrow.vector.schema; +import java.util.Map; + import org.apache.arrow.flatbuf.VectorType; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonValue; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableMap.Builder; + public class ArrowVectorType { public static final ArrowVectorType DATA = new ArrowVectorType(VectorType.DATA); @@ -26,22 +33,52 @@ public class ArrowVectorType { public static final ArrowVectorType VALIDITY = new ArrowVectorType(VectorType.VALIDITY); public static final ArrowVectorType TYPE = new ArrowVectorType(VectorType.TYPE); + private static final Map typeByName; + static { + ArrowVectorType[] types = { DATA, OFFSET, VALIDITY, TYPE }; + Builder builder = ImmutableMap.builder(); + for (ArrowVectorType type: types) { + builder.put(type.getName(), type); + } + typeByName = builder.build(); + } + + public static ArrowVectorType fromName(String name) { + ArrowVectorType type = typeByName.get(name); + if (type == null) { + throw new IllegalArgumentException("Unknown type " + name); + } + return type; + } + private final short type; public ArrowVectorType(short type) { this.type = type; + // validate that the type is valid + getName(); + } + + @JsonCreator + private ArrowVectorType(String name) { + this.type = fromName(name).type; } public short getType() { return type; } - @Override - public String toString() { + @JsonValue + public String getName() { try { return VectorType.name(type); } catch (ArrayIndexOutOfBoundsException e) { - return "Unlnown type " + type; + throw new 
IllegalArgumentException("Unknown type " + type); } } + + @Override + public String toString() { + return getName(); + } } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 072385a2155..06ae203bf44 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -19,6 +19,7 @@ import static java.util.Arrays.asList; import static org.apache.arrow.flatbuf.Precision.DOUBLE; +import static org.apache.arrow.flatbuf.Precision.HALF; import static org.apache.arrow.flatbuf.Precision.SINGLE; import static org.apache.arrow.vector.schema.VectorLayout.booleanVector; import static org.apache.arrow.vector.schema.VectorLayout.byteVector; @@ -49,6 +50,9 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; /** @@ -110,6 +114,9 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { @Override public TypeLayout visit(FloatingPoint type) { int bitWidth; switch (type.getPrecision()) { + case HALF: + bitWidth = 16; + break; case SINGLE: bitWidth = 32; break; @@ -184,7 +191,8 @@ public TypeLayout visit(Interval type) { // TODO: check size private final List vectors; - public TypeLayout(List vectors) { + @JsonCreator + public TypeLayout(@JsonProperty("vectors") List vectors) { super(); this.vectors = Preconditions.checkNotNull(vectors); } @@ -198,6 +206,7 @@ public List getVectors() { return vectors; } + @JsonIgnore public List getVectorTypes() { List types = new ArrayList<>(vectors.size()); for (VectorLayout vector : vectors) { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java index 532e9d2328b..931c00a0281 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/VectorLayout.java @@ -22,6 +22,8 @@ import static org.apache.arrow.vector.schema.ArrowVectorType.TYPE; import static org.apache.arrow.vector.schema.ArrowVectorType.VALIDITY; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.base.Preconditions; import com.google.flatbuffers.FlatBufferBuilder; @@ -75,7 +77,8 @@ public static VectorLayout byteVector() { private final ArrowVectorType type; - private VectorLayout(ArrowVectorType type, int typeBitWidth) { + @JsonCreator + private VectorLayout(@JsonProperty("type") ArrowVectorType type, @JsonProperty("typeBitWidth") int typeBitWidth) { super(); this.type = Preconditions.checkNotNull(type); this.typeBitWidth = (short)typeBitWidth; diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java index cfa1ed40aeb..49ba524ab0a 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Field.java @@ -18,6 +18,7 @@ package org.apache.arrow.vector.types.pojo; +import static com.google.common.base.Preconditions.checkNotNull; import static 
org.apache.arrow.vector.types.pojo.ArrowType.getTypeForField; import java.util.List; @@ -26,6 +27,8 @@ import org.apache.arrow.vector.schema.TypeLayout; import org.apache.arrow.vector.schema.VectorLayout; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; @@ -36,20 +39,26 @@ public class Field { private final List children; private final TypeLayout typeLayout; - private Field(String name, boolean nullable, ArrowType type, List children, TypeLayout typeLayout) { + @JsonCreator + private Field( + @JsonProperty("name") String name, + @JsonProperty("nullable") boolean nullable, + @JsonProperty("type") ArrowType type, + @JsonProperty("children") List children, + @JsonProperty("typeLayout") TypeLayout typeLayout) { this.name = name; this.nullable = nullable; - this.type = type; + this.type = checkNotNull(type); if (children == null) { this.children = ImmutableList.of(); } else { this.children = children; } - this.typeLayout = typeLayout; + this.typeLayout = checkNotNull(typeLayout); } public Field(String name, boolean nullable, ArrowType type, List children) { - this(name, nullable, type, children, TypeLayout.getTypeLayout(type)); + this(name, nullable, type, children, TypeLayout.getTypeLayout(checkNotNull(type))); } public static Field convertField(org.apache.arrow.flatbuf.Field field) { @@ -77,7 +86,7 @@ public void validate() { } public int getField(FlatBufferBuilder builder) { - int nameOffset = builder.createString(name); + int nameOffset = name == null ? -1 : builder.createString(name); int typeOffset = type.getType(builder); int[] childrenData = new int[children.size()]; for (int i = 0; i < children.size(); i++) { @@ -91,7 +100,9 @@ public int getField(FlatBufferBuilder builder) { } int layoutOffset = org.apache.arrow.flatbuf.Field.createLayoutVector(builder, buffersData); org.apache.arrow.flatbuf.Field.startField(builder); - org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); + if (name != null) { + org.apache.arrow.flatbuf.Field.addName(builder, nameOffset); + } org.apache.arrow.flatbuf.Field.addNullable(builder, nullable); org.apache.arrow.flatbuf.Field.addTypeType(builder, type.getTypeType()); org.apache.arrow.flatbuf.Field.addType(builder, typeOffset); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java index 231be9bd55c..44b877eb730 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/types/pojo/Schema.java @@ -18,19 +18,91 @@ package org.apache.arrow.vector.types.pojo; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.arrow.vector.types.pojo.Field.convertField; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Objects; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectReader; +import com.fasterxml.jackson.databind.ObjectWriter; import com.google.common.collect.ImmutableList; import com.google.flatbuffers.FlatBufferBuilder; +/** + * An Arrow Schema + */ public class Schema { - private List fields; 
- public Schema(List fields) { - this.fields = ImmutableList.copyOf(fields); + /** + * @param the list of the fields + * @param name the name of the field to return + * @return the corresponding field + * @throws IllegalArgumentException if the field was not found + */ + public static Field findField(List fields, String name) { + for (Field field : fields) { + if (field.getName().equals(name)) { + return field; + } + } + throw new IllegalArgumentException(String.format("field %s not found in %s", name, fields)); + } + + private static final ObjectMapper mapper = new ObjectMapper(); + private static final ObjectWriter writer = mapper.writerWithDefaultPrettyPrinter(); + private static final ObjectReader reader = mapper.readerFor(Schema.class); + + public static Schema fromJSON(String json) throws IOException { + return reader.readValue(checkNotNull(json)); + } + + public static Schema convertSchema(org.apache.arrow.flatbuf.Schema schema) { + ImmutableList.Builder childrenBuilder = ImmutableList.builder(); + for (int i = 0; i < schema.fieldsLength(); i++) { + childrenBuilder.add(convertField(schema.fields(i))); + } + List fields = childrenBuilder.build(); + return new Schema(fields); + } + + private final List fields; + + @JsonCreator + public Schema(@JsonProperty("fields") Iterable fields) { + List fieldList = new ArrayList<>(); + for (Field field : fields) { + fieldList.add(field); + } + this.fields = Collections.unmodifiableList(fieldList); + } + + public List getFields() { + return fields; + } + + /** + * @param name the name of the field to return + * @return the corresponding field + */ + public Field findField(String name) { + return findField(getFields(), name); + } + + public String toJson() { + try { + return writer.writeValueAsString(this); + } catch (JsonProcessingException e) { + // this should not happen + throw new RuntimeException(e); + } } public int getSchema(FlatBufferBuilder builder) { @@ -44,9 +116,6 @@ public int getSchema(FlatBufferBuilder builder) { return org.apache.arrow.flatbuf.Schema.endSchema(builder); } - public List getFields() { - return fields; - } @Override public int hashCode() { @@ -61,15 +130,6 @@ public boolean equals(Object obj) { return Objects.equals(this.fields, ((Schema) obj).fields); } - public static Schema convertSchema(org.apache.arrow.flatbuf.Schema schema) { - ImmutableList.Builder childrenBuilder = ImmutableList.builder(); - for (int i = 0; i < schema.fieldsLength(); i++) { - childrenBuilder.add(convertField(schema.fields(i))); - } - List fields = childrenBuilder.build(); - return new Schema(fields); - } - @Override public String toString() { return "Schema" + fields; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java new file mode 100644 index 00000000000..0ef8be7ef1b --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/types/pojo/TestSchema.java @@ -0,0 +1,119 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.types.pojo; + +import static java.util.Arrays.asList; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; + +import org.apache.arrow.flatbuf.IntervalUnit; +import org.apache.arrow.flatbuf.Precision; +import org.apache.arrow.flatbuf.TimeUnit; +import org.apache.arrow.flatbuf.UnionMode; +import org.junit.Test; + +public class TestSchema { + + private static Field field(String name, boolean nullable, ArrowType type, Field... children) { + return new Field(name, nullable, type, asList(children)); + } + + private static Field field(String name, ArrowType type, Field... children) { + return field(name, true, type, children); + } + + @Test + public void testAll() throws IOException { + Schema schema = new Schema(asList( + field("a", false, new ArrowType.Null()), + field("b", new ArrowType.Struct_(), field("ba", new ArrowType.Null())), + field("c", new ArrowType.List(), field("ca", new ArrowType.Null())), + field("d", new ArrowType.Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new ArrowType.Null())), + field("e", new ArrowType.Int(8, true)), + field("f", new ArrowType.FloatingPoint(Precision.SINGLE)), + field("g", new ArrowType.Utf8()), + field("h", new ArrowType.Binary()), + field("i", new ArrowType.Bool()), + field("j", new ArrowType.Decimal(5, 5)), + field("k", new ArrowType.Date()), + field("l", new ArrowType.Time()), + field("m", new ArrowType.Timestamp(TimeUnit.MILLISECOND)), + field("n", new ArrowType.Interval(IntervalUnit.DAY_TIME)) + )); + roundTrip(schema); + } + + @Test + public void testUnion() throws IOException { + Schema schema = new Schema(asList( + field("d", new ArrowType.Union(UnionMode.Sparse, new int[] {1, 2, 3}), field("da", new ArrowType.Null())) + )); + roundTrip(schema); + contains(schema, "Sparse"); + } + + @Test + public void testTS() throws IOException { + Schema schema = new Schema(asList( + field("a", new ArrowType.Timestamp(TimeUnit.SECOND)), + field("b", new ArrowType.Timestamp(TimeUnit.MILLISECOND)), + field("c", new ArrowType.Timestamp(TimeUnit.MICROSECOND)), + field("d", new ArrowType.Timestamp(TimeUnit.NANOSECOND)) + )); + roundTrip(schema); + contains(schema, "SECOND", "MILLISECOND", "MICROSECOND", "NANOSECOND"); + } + + @Test + public void testInterval() throws IOException { + Schema schema = new Schema(asList( + field("a", new ArrowType.Interval(IntervalUnit.YEAR_MONTH)), + field("b", new ArrowType.Interval(IntervalUnit.DAY_TIME)) + )); + roundTrip(schema); + contains(schema, "YEAR_MONTH", "DAY_TIME"); + } + + @Test + public void testFP() throws IOException { + Schema schema = new Schema(asList( + field("a", new ArrowType.FloatingPoint(Precision.HALF)), + field("b", new ArrowType.FloatingPoint(Precision.SINGLE)), + field("c", new ArrowType.FloatingPoint(Precision.DOUBLE)) + )); + roundTrip(schema); + contains(schema, "HALF", "SINGLE", "DOUBLE"); + } + + private void roundTrip(Schema schema) throws IOException { + String json = schema.toJson(); + Schema actual = Schema.fromJSON(json); + assertEquals(schema.toJson(), actual.toJson()); + assertEquals(schema, actual); + } + + private void contains(Schema schema, String... 
s) throws IOException { + String json = schema.toJson(); + for (String string : s) { + assertTrue(json + " contains " + string, json.contains(string)); + } + } + +} From 1196691e221c5b00bbf9bf47eead6f684b61fe62 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Fri, 7 Oct 2016 13:12:35 -0700 Subject: [PATCH 161/210] ARROW-326: Initialize nested writers in MapWriter based on the underlying MapVector's field Closes #163 --- .../main/codegen/templates/MapWriters.java | 22 +++++++++++++++++++ .../complex/impl/TestPromotableWriter.java | 21 +++++++++++++++++- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 7f319a9ca34..9fe20df7a1d 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -56,6 +56,28 @@ public class ${mode}MapWriter extends AbstractFieldWriter { } this.container = container; + for (Field child : container.getField().getChildren()) { + switch (Types.getMinorTypeForArrowType(child.getType())) { + case MAP: + map(child.getName()); + break; + case LIST: + list(child.getName()); + break; + case UNION: + UnionWriter writer = new UnionWriter(container.addOrGet(child.getName(), MinorType.UNION, UnionVector.class)); + fields.put(child.getName().toLowerCase(), writer); + break; +<#list vv.types as type><#list type.minor as minor> +<#assign lowerName = minor.class?uncap_first /> +<#if lowerName == "int" ><#assign lowerName = "integer" /> +<#assign upperName = minor.class?upper_case /> + case ${upperName}: + ${lowerName}(child.getName()); + break; + + } + } } @Override diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index 689c96fda92..d439cebeda6 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -21,13 +21,16 @@ import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; +import org.apache.arrow.flatbuf.Type; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.DirtyRootAllocator; import org.apache.arrow.vector.complex.AbstractMapVector; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -50,7 +53,7 @@ public void terminate() throws Exception { @Test public void testPromoteToUnion() throws Exception { - try (final AbstractMapVector container = new MapVector(EMPTY_SCHEMA_PATH, allocator, null); + try (final MapVector container = new MapVector(EMPTY_SCHEMA_PATH, allocator, null); final NullableMapVector v = container.addOrGet("test", MinorType.MAP, NullableMapVector.class); final PromotableWriter writer = new PromotableWriter(v, container)) { @@ -92,6 +95,22 @@ public void testPromoteToUnion() throws Exception { assertFalse("4 shouldn't be null", accessor.isNull(4)); assertEquals(100, accessor.getObject(4)); + + container.clear(); + container.allocateNew(); + + ComplexWriterImpl newWriter = new 
ComplexWriterImpl(EMPTY_SCHEMA_PATH, container); + + MapWriter newMapWriter = newWriter.rootAsMap(); + + newMapWriter.start(); + + newMapWriter.setPosition(2); + newMapWriter.integer("A").writeInt(10); + + Field childField = container.getField().getChildren().get(0).getChildren().get(0); + assertEquals("Child field should be union type: " + childField.getName(), Type.Union, childField.getType().getTypeType()); + } } } From eb1491a96d1fb92bf9c8bfc1acb7a8768af53a7e Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 7 Oct 2016 17:09:00 -0700 Subject: [PATCH 162/210] ARROW-325: make TestArrowFile not dependent on timezone Author: Julien Le Dem Closes #162 from julienledem/tz and squashes the following commits: 74b5ee8 [Julien Le Dem] ARROW-325: make TestArrowFile not dependent on timezone --- .../org/apache/arrow/vector/file/TestArrowFile.java | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index ad301689cd1..7a5e7b58db9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -40,10 +40,12 @@ import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; import org.apache.arrow.vector.complex.writer.BigIntWriter; import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.holders.NullableTimeStampHolder; import org.apache.arrow.vector.schema.ArrowBuffer; import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Schema; +import org.joda.time.DateTimeZone; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -58,14 +60,18 @@ public class TestArrowFile { private static final int COUNT = 10; private BufferAllocator allocator; + private DateTimeZone defaultTimezone = DateTimeZone.getDefault(); + @Before public void init() { + DateTimeZone.setDefault(DateTimeZone.forOffsetHours(2)); allocator = new RootAllocator(Integer.MAX_VALUE); } @After public void tearDown() { allocator.close(); + DateTimeZone.setDefault(defaultTimezone); } @Test @@ -258,7 +264,9 @@ private void validateComplexContent(int count, NullableMapVector parent) { Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); Assert.assertEquals(i % 3, rootReader.reader("list").size()); - Assert.assertEquals(i, rootReader.reader("map").reader("timestamp").readDateTime().getMillis() % COUNT); + NullableTimeStampHolder h = new NullableTimeStampHolder(); + rootReader.reader("map").reader("timestamp").read(h); + Assert.assertEquals(i, h.value % COUNT); } } From e7080ef9f1bd91505996edd4e4b7643cc54f6b5f Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 7 Oct 2016 17:14:58 -0700 Subject: [PATCH 163/210] [maven-release-plugin] prepare release apache-arrow-0.1.0 --- java/format/pom.xml | 5 ++--- java/memory/pom.xml | 5 ++--- java/pom.xml | 7 +++---- java/vector/pom.xml | 5 ++--- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/java/format/pom.xml b/java/format/pom.xml index 78300047862..c81cfed04d9 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -9,14 +9,13 @@ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 arrow-java-root org.apache.arrow - 0.1-SNAPSHOT + 0.1.0 arrow-format diff --git a/java/memory/pom.xml b/java/memory/pom.xml index b91b5981559..8af23130791 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -9,13 +9,12 @@ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1.0 arrow-memory Arrow Memory diff --git a/java/pom.xml b/java/pom.xml index fc2c18d0e51..8ca8eac76a7 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -9,8 +9,7 @@ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 @@ -21,7 +20,7 @@ org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1.0 pom Apache Arrow Java Root POM @@ -42,7 +41,7 @@ scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git https://github.com/apache/arrow - HEAD + apache-arrow-0.1.0 diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 08f9bc8da4e..ae48d22a6f4 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -9,13 +9,12 @@ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. --> - + 4.0.0 org.apache.arrow arrow-java-root - 0.1-SNAPSHOT + 0.1.0 arrow-vector Arrow Vectors From 17cd7a6466741d22053d132ea306ad6f05351419 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 7 Oct 2016 17:15:08 -0700 Subject: [PATCH 164/210] [maven-release-plugin] prepare for next development iteration --- java/format/pom.xml | 2 +- java/memory/pom.xml | 2 +- java/pom.xml | 4 ++-- java/vector/pom.xml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/format/pom.xml b/java/format/pom.xml index c81cfed04d9..eb045d655e9 100644 --- a/java/format/pom.xml +++ b/java/format/pom.xml @@ -15,7 +15,7 @@ arrow-java-root org.apache.arrow - 0.1.0 + 0.1.1-SNAPSHOT arrow-format diff --git a/java/memory/pom.xml b/java/memory/pom.xml index 8af23130791..6ed14480860 100644 --- a/java/memory/pom.xml +++ b/java/memory/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.1.0 + 0.1.1-SNAPSHOT arrow-memory Arrow Memory diff --git a/java/pom.xml b/java/pom.xml index 8ca8eac76a7..0147de70357 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -20,7 +20,7 @@ org.apache.arrow arrow-java-root - 0.1.0 + 0.1.1-SNAPSHOT pom Apache Arrow Java Root POM @@ -41,7 +41,7 @@ scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git scm:git:https://git-wip-us.apache.org/repos/asf/arrow.git https://github.com/apache/arrow - apache-arrow-0.1.0 + HEAD diff --git a/java/vector/pom.xml b/java/vector/pom.xml index ae48d22a6f4..1d06bdece01 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -14,7 +14,7 @@ org.apache.arrow arrow-java-root - 0.1.0 + 0.1.1-SNAPSHOT arrow-vector Arrow Vectors From a9747ceac2b6399c6acf027de8074d8661d5eb1d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 10 Oct 2016 11:21:49 -0400 Subject: [PATCH 165/210] ARROW-312: Read and write Arrow IPC file format from Python This also adds some IO 
scaffolding for interacting with `arrow::Buffer` objects from Python and assorted additions to help with testing. Author: Wes McKinney Closes #164 from wesm/ARROW-312 and squashes the following commits: 7df3e5f [Wes McKinney] Set BUILD_WITH_INSTALL_RPATH on arrow_ipc be8cee0 [Wes McKinney] Link Cython modules to libarrow* libraries 5716601 [Wes McKinney] Fix accidental deletion 77fb03b [Wes McKinney] Add / test Buffer wrapper. Test that we can write an arrow file to a wrapped buffer. Resize buffer in BufferOutputStream on close 316537d [Wes McKinney] Get ready to wrap Arrow buffers in a Python object 4822d32 [Wes McKinney] Implement RecordBatch::Equals, compare in Python ipc file writes a931e49 [Wes McKinney] Permit buffers (write padding) in a non-multiple of 64 in an IPC context, to allow zero-copy writing of NumPy arrays 2c49cd4 [Wes McKinney] Some debugging ca1562b [Wes McKinney] Draft implementations of Arrow file read/write from Python --- cpp/src/arrow/io/io-memory-test.cc | 25 ++ cpp/src/arrow/io/memory.cc | 13 +- cpp/src/arrow/ipc/CMakeLists.txt | 7 + cpp/src/arrow/ipc/adapter.cc | 16 +- cpp/src/arrow/ipc/util.h | 6 +- cpp/src/arrow/table-test.cc | 27 ++ cpp/src/arrow/table.cc | 16 ++ cpp/src/arrow/table.h | 2 + cpp/src/arrow/types/primitive-test.cc | 3 +- cpp/src/arrow/util/bit-util.h | 13 + cpp/src/arrow/util/buffer.cc | 16 +- cpp/src/arrow/util/buffer.h | 1 - cpp/src/arrow/util/logging.h | 4 +- python/CMakeLists.txt | 8 +- python/cmake_modules/FindArrow.cmake | 11 + python/pyarrow/__init__.py | 3 +- python/pyarrow/array.pyx | 44 +-- python/pyarrow/includes/common.pxd | 4 - python/pyarrow/includes/libarrow.pxd | 29 +- python/pyarrow/includes/libarrow_io.pxd | 14 +- python/pyarrow/includes/libarrow_ipc.pxd | 52 ++++ python/pyarrow/includes/pyarrow.pxd | 13 +- python/pyarrow/io.pxd | 6 + python/pyarrow/io.pyx | 340 ++++++++++++++--------- python/pyarrow/ipc.pyx | 155 +++++++++++ python/pyarrow/table.pxd | 17 +- python/pyarrow/table.pyx | 194 ++++++++++--- python/pyarrow/tests/test_array.py | 4 + python/pyarrow/tests/test_io.py | 41 +++ python/pyarrow/tests/test_ipc.py | 116 ++++++++ python/pyarrow/tests/test_table.py | 82 +++--- python/setup.py | 1 + python/src/pyarrow/adapters/builtin.cc | 2 +- python/src/pyarrow/adapters/pandas.cc | 8 + python/src/pyarrow/common.cc | 2 +- python/src/pyarrow/common.h | 20 +- python/src/pyarrow/io.cc | 6 +- 37 files changed, 1012 insertions(+), 309 deletions(-) create mode 100644 python/pyarrow/includes/libarrow_ipc.pxd create mode 100644 python/pyarrow/ipc.pyx create mode 100644 python/pyarrow/tests/test_ipc.py diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/io-memory-test.cc index 6de35dab59b..a49faf3bd85 100644 --- a/cpp/src/arrow/io/io-memory-test.cc +++ b/cpp/src/arrow/io/io-memory-test.cc @@ -121,5 +121,30 @@ TEST_F(TestMemoryMappedFile, InvalidFile) { IOError, MemoryMappedFile::Open(non_existent_path, FileMode::READ, &result)); } +class TestBufferOutputStream : public ::testing::Test { + public: + void SetUp() { + buffer_.reset(new PoolBuffer(default_memory_pool())); + stream_.reset(new BufferOutputStream(buffer_)); + } + + protected: + std::shared_ptr buffer_; + std::unique_ptr stream_; +}; + +TEST_F(TestBufferOutputStream, CloseResizes) { + std::string data = "data123456"; + + const int64_t nbytes = static_cast(data.size()); + const int K = 100; + for (int i = 0; i < K; ++i) { + EXPECT_OK(stream_->Write(reinterpret_cast(data.c_str()), nbytes)); + } + + ASSERT_OK(stream_->Close()); + ASSERT_EQ(K * nbytes, 
buffer_->size()); +} + } // namespace io } // namespace arrow diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc index 7d6e02e25b4..c7d0ae5d564 100644 --- a/cpp/src/arrow/io/memory.cc +++ b/cpp/src/arrow/io/memory.cc @@ -212,7 +212,11 @@ BufferOutputStream::BufferOutputStream(const std::shared_ptr& b mutable_data_(buffer->mutable_data()) {} Status BufferOutputStream::Close() { - return Status::OK(); + if (position_ < capacity_) { + return buffer_->Resize(position_); + } else { + return Status::OK(); + } } Status BufferOutputStream::Tell(int64_t* position) { @@ -228,8 +232,11 @@ Status BufferOutputStream::Write(const uint8_t* data, int64_t nbytes) { } Status BufferOutputStream::Reserve(int64_t nbytes) { - while (position_ + nbytes > capacity_) { - int64_t new_capacity = std::max(kBufferMinimumSize, capacity_ * 2); + int64_t new_capacity = capacity_; + while (position_ + nbytes > new_capacity) { + new_capacity = std::max(kBufferMinimumSize, new_capacity * 2); + } + if (new_capacity > capacity_) { RETURN_NOT_OK(buffer_->Resize(new_capacity)); capacity_ = new_capacity; } diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index bde8c5bf738..8dcd9ac1071 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -57,6 +57,13 @@ SET_TARGET_PROPERTIES(arrow_ipc PROPERTIES LINKER_LANGUAGE CXX LINK_FLAGS "${ARROW_IPC_LINK_FLAGS}") +if (APPLE) + set_target_properties(arrow_ipc + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + ADD_ARROW_TEST(ipc-adapter-test) ARROW_TEST_LINK_LIBRARIES(ipc-adapter-test ${ARROW_IPC_TEST_LINK_LIBS}) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 99974a4a4c7..cd8ab53a31d 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -162,15 +162,14 @@ class RecordBatchWriter { for (size_t i = 0; i < buffers_.size(); ++i) { const Buffer* buffer = buffers_[i].get(); int64_t size = 0; + int64_t padding = 0; // The buffer might be null if we are handling zero row lengths. if (buffer) { - // We use capacity here, because size might not reflect the padding - // requirements of buffers but capacity always should. - size = buffer->capacity(); - // check that padding is appropriate - RETURN_NOT_OK(CheckMultipleOf64(size)); + size = buffer->size(); + padding = util::RoundUpToMultipleOf64(size) - size; } + // TODO(wesm): We currently have no notion of shared memory page id's, // but we've included it in the metadata IDL for when we have it in the // future. Use page=0 for now @@ -179,12 +178,17 @@ class RecordBatchWriter { // are using from any OS-level shared memory. 
The thought is that systems // may (in the future) associate integer page id's with physical memory // pages (according to whatever is the desired shared memory mechanism) - buffer_meta_.push_back(flatbuf::Buffer(0, position, size)); + buffer_meta_.push_back(flatbuf::Buffer(0, position, size + padding)); if (size > 0) { RETURN_NOT_OK(dst->Write(buffer->data(), size)); position += size; } + + if (padding > 0) { + RETURN_NOT_OK(dst->Write(kPaddingBytes, padding)); + position += padding; + } } *body_end_offset = position; diff --git a/cpp/src/arrow/ipc/util.h b/cpp/src/arrow/ipc/util.h index 94079a38277..9000d1bb0c6 100644 --- a/cpp/src/arrow/ipc/util.h +++ b/cpp/src/arrow/ipc/util.h @@ -29,7 +29,11 @@ namespace ipc { // Align on 8-byte boundaries static constexpr int kArrowAlignment = 8; -static constexpr uint8_t kPaddingBytes[kArrowAlignment] = {0}; + +// Buffers are padded to 64-byte boundaries (for SIMD) +static constexpr int kArrowBufferAlignment = 64; + +static constexpr uint8_t kPaddingBytes[kArrowBufferAlignment] = {0}; static inline int64_t PaddedLength(int64_t nbytes, int64_t alignment = kArrowAlignment) { return ((nbytes + alignment - 1) / alignment) * alignment; diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc index 385e7d83150..743fb669700 100644 --- a/cpp/src/arrow/table-test.cc +++ b/cpp/src/arrow/table-test.cc @@ -123,4 +123,31 @@ TEST_F(TestTable, InvalidColumns) { ASSERT_RAISES(Invalid, table_->ValidateColumns()); } +class TestRecordBatch : public TestBase {}; + +TEST_F(TestRecordBatch, Equals) { + const int length = 10; + + auto f0 = std::make_shared("f0", INT32); + auto f1 = std::make_shared("f1", UINT8); + auto f2 = std::make_shared("f2", INT16); + + vector> fields = {f0, f1, f2}; + auto schema = std::make_shared(fields); + + auto a0 = MakePrimitive(length); + auto a1 = MakePrimitive(length); + auto a2 = MakePrimitive(length); + + RecordBatch b1(schema, length, {a0, a1, a2}); + RecordBatch b2(schema, 5, {a0, a1, a2}); + RecordBatch b3(schema, length, {a0, a1}); + RecordBatch b4(schema, length, {a0, a1, a1}); + + ASSERT_TRUE(b1.Equals(b1)); + ASSERT_FALSE(b1.Equals(b2)); + ASSERT_FALSE(b1.Equals(b3)); + ASSERT_FALSE(b1.Equals(b4)); +} + } // namespace arrow diff --git a/cpp/src/arrow/table.cc b/cpp/src/arrow/table.cc index 3a250df81d0..af84f27eab5 100644 --- a/cpp/src/arrow/table.cc +++ b/cpp/src/arrow/table.cc @@ -21,6 +21,7 @@ #include #include +#include "arrow/array.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/util/status.h" @@ -35,6 +36,21 @@ const std::string& RecordBatch::column_name(int i) const { return schema_->field(i)->name; } +bool RecordBatch::Equals(const RecordBatch& other) const { + if (num_columns() != other.num_columns() || num_rows_ != other.num_rows()) { + return false; + } + + for (int i = 0; i < num_columns(); ++i) { + if (!column(i)->Equals(other.column(i))) { return false; } + } + + return true; +} + +// ---------------------------------------------------------------------- +// Table methods + Table::Table(const std::string& name, const std::shared_ptr& schema, const std::vector>& columns) : name_(name), schema_(schema), columns_(columns) { diff --git a/cpp/src/arrow/table.h b/cpp/src/arrow/table.h index 36b3c8ecaf4..1a856c8a436 100644 --- a/cpp/src/arrow/table.h +++ b/cpp/src/arrow/table.h @@ -43,6 +43,8 @@ class ARROW_EXPORT RecordBatch { RecordBatch(const std::shared_ptr& schema, int32_t num_rows, const std::vector>& columns); + bool Equals(const RecordBatch& other) const; + // @returns: the table's 
schema const std::shared_ptr& schema() const { return schema_; } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index ffebb9269bd..87eb0fe3a8b 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -238,8 +238,7 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types - Primitives; + PInt32, PInt64, PFloat, PDouble> Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 873a1959865..3087ce7784d 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -19,6 +19,7 @@ #define ARROW_UTIL_BIT_UTIL_H #include +#include #include #include @@ -77,6 +78,18 @@ static inline bool is_multiple_of_64(int64_t n) { return (n & 63) == 0; } +inline int64_t RoundUpToMultipleOf64(int64_t num) { + // TODO(wesm): is this definitely needed? + // DCHECK_GE(num, 0); + constexpr int64_t round_to = 64; + constexpr int64_t force_carry_addend = round_to - 1; + constexpr int64_t truncate_bitmask = ~(round_to - 1); + constexpr int64_t max_roundable_num = std::numeric_limits::max() - round_to; + if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask; } + // handle overflow case. This should result in a malloc error upstream + return num; +} + void bytes_to_bits(const std::vector& bytes, uint8_t* bits); ARROW_EXPORT Status bytes_to_bits(const std::vector&, std::shared_ptr*); diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 703ef8384ac..6faa048e4e5 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -20,25 +20,13 @@ #include #include +#include "arrow/util/bit-util.h" #include "arrow/util/logging.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { -namespace { -int64_t RoundUpToMultipleOf64(int64_t num) { - DCHECK_GE(num, 0); - constexpr int64_t round_to = 64; - constexpr int64_t force_carry_addend = round_to - 1; - constexpr int64_t truncate_bitmask = ~(round_to - 1); - constexpr int64_t max_roundable_num = std::numeric_limits::max() - round_to; - if (num <= max_roundable_num) { return (num + force_carry_addend) & truncate_bitmask; } - // handle overflow case. 
This should result in a malloc error upstream - return num; -} -} // namespace - Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t size) { data_ = parent->data() + offset; size_ = size; @@ -64,7 +52,7 @@ PoolBuffer::~PoolBuffer() { Status PoolBuffer::Reserve(int64_t new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; - new_capacity = RoundUpToMultipleOf64(new_capacity); + new_capacity = util::RoundUpToMultipleOf64(new_capacity); if (mutable_data_) { RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); memcpy(new_data, mutable_data_, size_); diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 1aeebc69b4e..01e4259c31f 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -23,7 +23,6 @@ #include #include -#include "arrow/util/bit-util.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" #include "arrow/util/visibility.h" diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index b22f07dd634..06ee8411e28 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,9 +118,9 @@ class CerrLog { class FatalLog : public CerrLog { public: explicit FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL){} // NOLINT + : CerrLog(ARROW_FATAL) {} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 77a771ab21c..55f6d0543a1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -352,6 +352,8 @@ ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) ADD_THIRDPARTY_LIB(arrow_io SHARED_LIB ${ARROW_IO_SHARED_LIB}) +ADD_THIRDPARTY_LIB(arrow_ipc + SHARED_LIB ${ARROW_IPC_SHARED_LIB}) ############################################################ # Linker setup @@ -415,6 +417,8 @@ if (UNIX) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) endif() +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) + add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) @@ -423,6 +427,7 @@ set(CYTHON_EXTENSIONS config error io + ipc scalar schema table @@ -442,6 +447,7 @@ set(PYARROW_SRCS set(LINK_LIBS arrow arrow_io + arrow_ipc ) if(PARQUET_FOUND AND PARQUET_ARROW_FOUND) @@ -455,8 +461,6 @@ if(PARQUET_FOUND AND PARQUET_ARROW_FOUND) parquet) endif() -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - add_library(pyarrow SHARED ${PYARROW_SRCS}) target_link_libraries(pyarrow ${LINK_LIBS}) diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 9919746520b..3c359aac553 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -47,10 +47,16 @@ find_library(ARROW_IO_LIB_PATH NAMES arrow_io ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) +find_library(ARROW_IPC_LIB_PATH NAMES arrow_ipc + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) set(ARROW_IO_LIB_NAME libarrow_io) + set(ARROW_IPC_LIB_NAME libarrow_ipc) set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) @@ -58,9 +64,14 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a) set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + + set(ARROW_IPC_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IPC_LIB_NAME}.a) + set(ARROW_IPC_SHARED_LIB 
${ARROW_LIBS}/${ARROW_IPC_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (NOT Arrow_FIND_QUIETLY) message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}") message(STATUS "Found the Arrow IO library: ${ARROW_IO_LIB_PATH}") + message(STATUS "Found the Arrow IPC library: ${ARROW_IPC_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 7561f6d46df..8b131aaa8f4 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -41,5 +41,4 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch -from pyarrow.table import Column, Table, from_pandas_dataframe +from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index cdbe73ad21f..84ab4a48c9b 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -37,7 +37,7 @@ import pyarrow.schema as schema def total_allocated_bytes(): - cdef MemoryPool* pool = pyarrow.GetMemoryPool() + cdef MemoryPool* pool = pyarrow.get_memory_pool() return pool.bytes_allocated() @@ -243,12 +243,14 @@ def from_pandas_series(object series, object mask=None, timestamps_to_ms=False): series_values = series_values.astype('datetime64[ms]') if mask is None: - check_status(pyarrow.PandasToArrow(pyarrow.GetMemoryPool(), - series_values, &out)) + with nogil: + check_status(pyarrow.PandasToArrow(pyarrow.get_memory_pool(), + series_values, &out)) else: mask = series_as_ndarray(mask) - check_status(pyarrow.PandasMaskedToArrow( - pyarrow.GetMemoryPool(), series_values, mask, &out)) + with nogil: + check_status(pyarrow.PandasMaskedToArrow( + pyarrow.get_memory_pool(), series_values, mask, &out)) return box_arrow_array(out) @@ -262,35 +264,3 @@ cdef object series_as_ndarray(object obj): result = obj return result - -#---------------------------------------------------------------------- -# Table-like data structures - -cdef class RowBatch: - """ - - """ - cdef readonly: - Schema schema - int num_rows - list arrays - - def __cinit__(self, Schema schema, int num_rows, list arrays): - self.schema = schema - self.num_rows = num_rows - self.arrays = arrays - - if len(self.schema) != len(arrays): - raise ValueError('Mismatch number of data arrays and ' - 'schema fields') - - def __len__(self): - return self.num_rows - - property num_columns: - - def __get__(self): - return len(self.arrays) - - def __getitem__(self, i): - return self.arrays[i] diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 133797bc37b..05c0123ee7b 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -47,7 +47,3 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsKeyError() c_bool IsNotImplemented() c_bool IsInvalid() - - cdef cppclass Buffer: - uint8_t* data() - int64_t size() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 854d07d691d..3ae17891703 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -54,6 +54,18 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass MemoryPool" arrow::MemoryPool": int64_t bytes_allocated() + cdef cppclass CBuffer" arrow::Buffer": + uint8_t* data() + int64_t size() + + cdef cppclass ResizableBuffer(CBuffer): + CStatus Resize(int64_t nbytes) + CStatus Reserve(int64_t nbytes) + + cdef cppclass PoolBuffer(ResizableBuffer): + PoolBuffer() + PoolBuffer(MemoryPool*) 
+ cdef MemoryPool* default_memory_pool() cdef cppclass CListType" arrow::ListType"(CDataType): @@ -149,6 +161,21 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CDataType]& type() const shared_ptr[CChunkedArray]& data() + cdef cppclass CRecordBatch" arrow::RecordBatch": + CRecordBatch(const shared_ptr[CSchema]& schema, int32_t num_rows, + const vector[shared_ptr[CArray]]& columns) + + c_bool Equals(const CRecordBatch& other) + + const shared_ptr[CSchema]& schema() + const shared_ptr[CArray]& column(int i) + const c_string& column_name(int i) + + const vector[shared_ptr[CArray]]& columns() + + int num_columns() + int32_t num_rows() + cdef cppclass CTable" arrow::Table": CTable(const c_string& name, const shared_ptr[CSchema]& schema, const vector[shared_ptr[CColumn]]& columns) @@ -186,7 +213,7 @@ cdef extern from "arrow/ipc/metadata.h" namespace "arrow::ipc" nogil: MessageType_DICTIONARY_BATCH" arrow::ipc::Message::DICTIONARY_BATCH" cdef cppclass Message: - CStatus Open(const shared_ptr[Buffer]& buf, + CStatus Open(const shared_ptr[CBuffer]& buf, shared_ptr[Message]* out) int64_t body_length() MessageType type() diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index 56d8d4cf614..8074915508f 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -18,7 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport MemoryPool +from pyarrow.includes.libarrow cimport * cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: enum FileMode" arrow::io::FileMode::type": @@ -36,7 +36,7 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: FileMode mode() cdef cppclass Readable: - CStatus ReadB" Read"(int64_t nbytes, shared_ptr[Buffer]* out) + CStatus ReadB" Read"(int64_t nbytes, shared_ptr[CBuffer]* out) CStatus Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) cdef cppclass Seekable: @@ -57,7 +57,7 @@ cdef extern from "arrow/io/interfaces.h" namespace "arrow::io" nogil: CStatus ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, uint8_t* buffer) CStatus ReadAt(int64_t position, int64_t nbytes, - int64_t* bytes_read, shared_ptr[Buffer]* out) + int64_t* bytes_read, shared_ptr[CBuffer]* out) cdef cppclass WriteableFileInterface(OutputStream, Seekable): CStatus WriteAt(int64_t position, const uint8_t* data, @@ -143,9 +143,9 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: cdef extern from "arrow/io/memory.h" namespace "arrow::io" nogil: - cdef cppclass BufferReader(ReadableFileInterface): - BufferReader(const uint8_t* data, int64_t nbytes) + cdef cppclass CBufferReader" arrow::io::BufferReader"\ + (ReadableFileInterface): + CBufferReader(const uint8_t* data, int64_t nbytes) cdef cppclass BufferOutputStream(OutputStream): - # TODO(wesm) - pass + BufferOutputStream(const shared_ptr[ResizableBuffer]& buffer) diff --git a/python/pyarrow/includes/libarrow_ipc.pxd b/python/pyarrow/includes/libarrow_ipc.pxd new file mode 100644 index 00000000000..eda5b9bae9e --- /dev/null +++ b/python/pyarrow/includes/libarrow_ipc.pxd @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# distutils: language = c++ + +from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport (MemoryPool, CArray, CSchema, + CRecordBatch) +from pyarrow.includes.libarrow_io cimport (OutputStream, ReadableFileInterface) + +cdef extern from "arrow/ipc/file.h" namespace "arrow::ipc" nogil: + + cdef cppclass CFileWriter " arrow::ipc::FileWriter": + @staticmethod + CStatus Open(OutputStream* sink, const shared_ptr[CSchema]& schema, + shared_ptr[CFileWriter]* out) + + CStatus WriteRecordBatch(const vector[shared_ptr[CArray]]& columns, + int32_t num_rows) + + CStatus Close() + + cdef cppclass CFileReader " arrow::ipc::FileReader": + + @staticmethod + CStatus Open(const shared_ptr[ReadableFileInterface]& file, + shared_ptr[CFileReader]* out) + + @staticmethod + CStatus Open2" Open"(const shared_ptr[ReadableFileInterface]& file, + int64_t footer_offset, shared_ptr[CFileReader]* out) + + const shared_ptr[CSchema]& schema() + + int num_dictionaries() + int num_record_batches() + + CStatus GetRecordBatch(int i, shared_ptr[CRecordBatch]* batch) diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 4c971665ff6..2fa5a7d6325 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -18,8 +18,8 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport (CArray, CColumn, CDataType, CStatus, - Type, MemoryPool) +from pyarrow.includes.libarrow cimport (CArray, CBuffer, CColumn, + CDataType, CStatus, Type, MemoryPool) cimport pyarrow.includes.libarrow_io as arrow_io @@ -53,7 +53,12 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: PyStatus ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) - MemoryPool* GetMemoryPool() + MemoryPool* get_memory_pool() + + +cdef extern from "pyarrow/common.h" namespace "pyarrow" nogil: + cdef cppclass PyBytesBuffer(CBuffer): + PyBytesBuffer(object o) cdef extern from "pyarrow/io.h" namespace "pyarrow" nogil: @@ -63,5 +68,5 @@ cdef extern from "pyarrow/io.h" namespace "pyarrow" nogil: cdef cppclass PyOutputStream(arrow_io.OutputStream): PyOutputStream(object fo) - cdef cppclass PyBytesReader(arrow_io.BufferReader): + cdef cppclass PyBytesReader(arrow_io.CBufferReader): PyBytesReader(object fo) diff --git a/python/pyarrow/io.pxd b/python/pyarrow/io.pxd index 1dbb3fd76bb..d6966cdaadd 100644 --- a/python/pyarrow/io.pxd +++ b/python/pyarrow/io.pxd @@ -22,6 +22,11 @@ from pyarrow.includes.libarrow cimport * from pyarrow.includes.libarrow_io cimport (ReadableFileInterface, OutputStream) +cdef class Buffer: + cdef: + shared_ptr[CBuffer] buffer + + cdef init(self, const shared_ptr[CBuffer]& buffer) cdef class NativeFile: cdef: @@ -29,6 +34,7 @@ cdef class NativeFile: shared_ptr[OutputStream] wr_file bint is_readonly bint is_open + bint own_file # By implementing these "virtual" functions (all functions in Cython # extension classes are 
technically virtual in the C++ sense) we can expose diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index e6e2b625e87..00a492fc0ba 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -36,6 +36,217 @@ import re import sys import threading + +cdef class NativeFile: + + def __cinit__(self): + self.is_open = False + self.own_file = False + + def __dealloc__(self): + if self.is_open and self.own_file: + self.close() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, tb): + self.close() + + def close(self): + if self.is_open: + with nogil: + if self.is_readonly: + check_cstatus(self.rd_file.get().Close()) + else: + check_cstatus(self.wr_file.get().Close()) + self.is_open = False + + cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): + self._assert_readable() + file[0] = self.rd_file + + cdef write_handle(self, shared_ptr[OutputStream]* file): + self._assert_writeable() + file[0] = self.wr_file + + def _assert_readable(self): + if not self.is_readonly: + raise IOError("only valid on readonly files") + + if not self.is_open: + raise IOError("file not open") + + def _assert_writeable(self): + if self.is_readonly: + raise IOError("only valid on writeonly files") + + if not self.is_open: + raise IOError("file not open") + + def size(self): + cdef int64_t size + self._assert_readable() + with nogil: + check_cstatus(self.rd_file.get().GetSize(&size)) + return size + + def tell(self): + cdef int64_t position + with nogil: + if self.is_readonly: + check_cstatus(self.rd_file.get().Tell(&position)) + else: + check_cstatus(self.wr_file.get().Tell(&position)) + return position + + def seek(self, int64_t position): + self._assert_readable() + with nogil: + check_cstatus(self.rd_file.get().Seek(position)) + + def write(self, data): + """ + Write bytes-like (unicode, encoded to UTF-8) to file + """ + self._assert_writeable() + + data = tobytes(data) + + cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) + cdef int64_t bufsize = len(data) + with nogil: + check_cstatus(self.wr_file.get().Write(buf, bufsize)) + + def read(self, int nbytes): + cdef: + int64_t bytes_read = 0 + uint8_t* buf + shared_ptr[CBuffer] out + + self._assert_readable() + + with nogil: + check_cstatus(self.rd_file.get() + .ReadB(nbytes, &out)) + + result = cp.PyBytes_FromStringAndSize( + out.get().data(), out.get().size()) + + return result + + +# ---------------------------------------------------------------------- +# Python file-like objects + +cdef class PythonFileInterface(NativeFile): + cdef: + object handle + + def __cinit__(self, handle, mode='w'): + self.handle = handle + + if mode.startswith('w'): + self.wr_file.reset(new pyarrow.PyOutputStream(handle)) + self.is_readonly = 0 + elif mode.startswith('r'): + self.rd_file.reset(new pyarrow.PyReadableFile(handle)) + self.is_readonly = 1 + else: + raise ValueError('Invalid file mode: {0}'.format(mode)) + + self.is_open = True + + +cdef class BytesReader(NativeFile): + cdef: + object obj + + def __cinit__(self, obj): + if not isinstance(obj, bytes): + raise ValueError('Must pass bytes object') + + self.obj = obj + self.is_readonly = 1 + self.is_open = True + + self.rd_file.reset(new pyarrow.PyBytesReader(obj)) + +# ---------------------------------------------------------------------- +# Arrow buffers + + +cdef class Buffer: + + def __cinit__(self): + pass + + cdef init(self, const shared_ptr[CBuffer]& buffer): + self.buffer = buffer + + def __len__(self): + return self.size + + property size: + + def __get__(self): + 
return self.buffer.get().size() + + def __getitem__(self, key): + # TODO(wesm): buffer slicing + raise NotImplementedError + + def to_pybytes(self): + return cp.PyBytes_FromStringAndSize( + self.buffer.get().data(), + self.buffer.get().size()) + + +cdef shared_ptr[PoolBuffer] allocate_buffer(): + cdef shared_ptr[PoolBuffer] result + result.reset(new PoolBuffer(pyarrow.get_memory_pool())) + return result + + +cdef class InMemoryOutputStream(NativeFile): + + cdef: + shared_ptr[PoolBuffer] buffer + + def __cinit__(self): + self.buffer = allocate_buffer() + self.wr_file.reset(new BufferOutputStream( + self.buffer)) + self.is_readonly = 0 + self.is_open = True + + def get_result(self): + cdef Buffer result = Buffer() + + check_cstatus(self.wr_file.get().Close()) + result.init( self.buffer) + + self.is_open = False + return result + + +def buffer_from_bytes(object obj): + """ + Construct an Arrow buffer from a Python bytes object + """ + if not isinstance(obj, bytes): + raise ValueError('Must pass bytes object') + + cdef shared_ptr[CBuffer] buf + buf.reset(new pyarrow.PyBytesBuffer(obj)) + + cdef Buffer result = Buffer() + result.init(buf) + return result + +# ---------------------------------------------------------------------- +# HDFS IO implementation + _HDFS_PATH_RE = re.compile('hdfs://(.*):(\d+)(.*)') try: @@ -274,6 +485,7 @@ cdef class HdfsClient: out.buffer_size = c_buffer_size out.parent = self out.is_open = True + out.own_file = True return out @@ -322,134 +534,6 @@ cdef class HdfsClient: f.download(stream) -cdef class NativeFile: - - def __cinit__(self): - self.is_open = False - - def __dealloc__(self): - if self.is_open: - self.close() - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, tb): - self.close() - - def close(self): - if self.is_open: - with nogil: - if self.is_readonly: - check_cstatus(self.rd_file.get().Close()) - else: - check_cstatus(self.wr_file.get().Close()) - self.is_open = False - - cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): - self._assert_readable() - file[0] = self.rd_file - - cdef write_handle(self, shared_ptr[OutputStream]* file): - self._assert_writeable() - file[0] = self.wr_file - - def _assert_readable(self): - if not self.is_readonly: - raise IOError("only valid on readonly files") - - def _assert_writeable(self): - if self.is_readonly: - raise IOError("only valid on writeonly files") - - def size(self): - cdef int64_t size - self._assert_readable() - with nogil: - check_cstatus(self.rd_file.get().GetSize(&size)) - return size - - def tell(self): - cdef int64_t position - with nogil: - if self.is_readonly: - check_cstatus(self.rd_file.get().Tell(&position)) - else: - check_cstatus(self.wr_file.get().Tell(&position)) - return position - - def seek(self, int64_t position): - self._assert_readable() - with nogil: - check_cstatus(self.rd_file.get().Seek(position)) - - def write(self, data): - """ - Write bytes-like (unicode, encoded to UTF-8) to file - """ - self._assert_writeable() - - data = tobytes(data) - - cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) - cdef int64_t bufsize = len(data) - with nogil: - check_cstatus(self.wr_file.get().Write(buf, bufsize)) - - def read(self, int nbytes): - cdef: - int64_t bytes_read = 0 - uint8_t* buf - shared_ptr[Buffer] out - - self._assert_readable() - - with nogil: - check_cstatus(self.rd_file.get() - .ReadB(nbytes, &out)) - - result = cp.PyBytes_FromStringAndSize( - out.get().data(), out.get().size()) - - return result - - -# 
---------------------------------------------------------------------- -# Python file-like objects - -cdef class PythonFileInterface(NativeFile): - cdef: - object handle - - def __cinit__(self, handle, mode='w'): - self.handle = handle - - if mode.startswith('w'): - self.wr_file.reset(new pyarrow.PyOutputStream(handle)) - self.is_readonly = 0 - elif mode.startswith('r'): - self.rd_file.reset(new pyarrow.PyReadableFile(handle)) - self.is_readonly = 1 - else: - raise ValueError('Invalid file mode: {0}'.format(mode)) - - self.is_open = True - - -cdef class BytesReader(NativeFile): - cdef: - object obj - - def __cinit__(self, obj): - if not isinstance(obj, bytes): - raise ValueError('Must pass bytes object') - - self.obj = obj - self.is_readonly = 1 - self.is_open = True - - self.rd_file.reset(new pyarrow.PyBytesReader(obj)) - # ---------------------------------------------------------------------- # Specialization for HDFS diff --git a/python/pyarrow/ipc.pyx b/python/pyarrow/ipc.pyx new file mode 100644 index 00000000000..f8da3a70da8 --- /dev/null +++ b/python/pyarrow/ipc.pyx @@ -0,0 +1,155 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Cython wrappers for arrow::ipc + +# cython: profile=False +# distutils: language = c++ +# cython: embedsignature = True + +from pyarrow.includes.libarrow cimport * +from pyarrow.includes.libarrow_io cimport * +from pyarrow.includes.libarrow_ipc cimport * +cimport pyarrow.includes.pyarrow as pyarrow + +from pyarrow.error cimport check_cstatus +from pyarrow.io cimport NativeFile +from pyarrow.schema cimport Schema +from pyarrow.table cimport RecordBatch + +from pyarrow.compat import frombytes, tobytes +import pyarrow.io as io + +cimport cpython as cp + + +cdef get_reader(source, shared_ptr[ReadableFileInterface]* reader): + cdef NativeFile nf + + if isinstance(source, bytes): + source = io.BytesReader(source) + elif not isinstance(source, io.NativeFile) and hasattr(source, 'read'): + # Optimistically hope this is file-like + source = io.PythonFileInterface(source, mode='r') + + if isinstance(source, NativeFile): + nf = source + + # TODO: what about read-write sources (e.g. 
memory maps) + if not nf.is_readonly: + raise IOError('Native file is not readable') + + nf.read_handle(reader) + else: + raise TypeError('Unable to read from object of type: {0}' + .format(type(source))) + + +cdef get_writer(source, shared_ptr[OutputStream]* writer): + cdef NativeFile nf + + if not isinstance(source, io.NativeFile) and hasattr(source, 'write'): + # Optimistically hope this is file-like + source = io.PythonFileInterface(source, mode='w') + + if isinstance(source, io.NativeFile): + nf = source + + if nf.is_readonly: + raise IOError('Native file is not writeable') + + nf.write_handle(writer) + else: + raise TypeError('Unable to read from object of type: {0}' + .format(type(source))) + + +cdef class ArrowFileWriter: + cdef: + shared_ptr[CFileWriter] writer + shared_ptr[OutputStream] sink + bint closed + + def __cinit__(self, sink, Schema schema): + self.closed = True + get_writer(sink, &self.sink) + + with nogil: + check_cstatus(CFileWriter.Open(self.sink.get(), schema.sp_schema, + &self.writer)) + + self.closed = False + + def __dealloc__(self): + if not self.closed: + self.close() + + def write_record_batch(self, RecordBatch batch): + cdef CRecordBatch* bptr = batch.batch + with nogil: + check_cstatus(self.writer.get() + .WriteRecordBatch(bptr.columns(), bptr.num_rows())) + + def close(self): + with nogil: + check_cstatus(self.writer.get().Close()) + self.closed = True + + +cdef class ArrowFileReader: + cdef: + shared_ptr[CFileReader] reader + + def __cinit__(self, source, footer_offset=None): + cdef shared_ptr[ReadableFileInterface] reader + get_reader(source, &reader) + + cdef int64_t offset = 0 + if footer_offset is not None: + offset = footer_offset + + with nogil: + if offset != 0: + check_cstatus(CFileReader.Open2(reader, offset, &self.reader)) + else: + check_cstatus(CFileReader.Open(reader, &self.reader)) + + property num_dictionaries: + + def __get__(self): + return self.reader.get().num_dictionaries() + + property num_record_batches: + + def __get__(self): + return self.reader.get().num_record_batches() + + def get_record_batch(self, int i): + cdef: + shared_ptr[CRecordBatch] batch + RecordBatch result + + if i < 0 or i >= self.num_record_batches: + raise ValueError('Batch number {0} out of range'.format(i)) + + with nogil: + check_cstatus(self.reader.get().GetRecordBatch(i, &batch)) + + result = RecordBatch() + result.init(batch) + + return result diff --git a/python/pyarrow/table.pxd b/python/pyarrow/table.pxd index 0a5c122c95c..79c9ae3b0a1 100644 --- a/python/pyarrow/table.pxd +++ b/python/pyarrow/table.pxd @@ -16,7 +16,10 @@ # under the License. 
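
For orientation, the writer/reader pair above is used as follows; a minimal sketch distilled from the tests added later in this patch (the tiny input data is illustrative):

    import pyarrow as A
    import pyarrow.io as arrow_io
    import pyarrow.ipc as ipc

    # Write a one-column record batch to an in-memory sink
    batch = A.RecordBatch.from_arrays(['c0'], [A.from_pylist([1, 2, 3])])
    sink = arrow_io.InMemoryOutputStream()
    writer = ipc.ArrowFileWriter(sink, batch.schema)
    writer.write_record_batch(batch)
    writer.close()

    # Read it back; a bytes source is routed through io.BytesReader by get_reader
    reader = ipc.ArrowFileReader(sink.get_result().to_pybytes())
    assert reader.num_record_batches == 1
    assert reader.get_record_batch(0).equals(batch)
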
from pyarrow.includes.common cimport shared_ptr -from pyarrow.includes.libarrow cimport CChunkedArray, CColumn, CTable +from pyarrow.includes.libarrow cimport (CChunkedArray, CColumn, CTable, + CRecordBatch) + +from pyarrow.schema cimport Schema cdef class ChunkedArray: @@ -41,6 +44,16 @@ cdef class Table: cdef: shared_ptr[CTable] sp_table CTable* table - + cdef init(self, const shared_ptr[CTable]& table) cdef _check_nullptr(self) + + +cdef class RecordBatch: + cdef: + shared_ptr[CRecordBatch] sp_batch + CRecordBatch* batch + Schema _schema + + cdef init(self, const shared_ptr[CRecordBatch]& table) + cdef _check_nullptr(self) diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index ade82aa6761..a1cadcd1e0f 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -19,6 +19,8 @@ # distutils: language = c++ # cython: embedsignature = True +from cython.operator cimport dereference as deref + from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow @@ -45,8 +47,8 @@ cdef class ChunkedArray: cdef _check_nullptr(self): if self.chunked_array == NULL: - raise ReferenceError("ChunkedArray object references a NULL pointer." - "Not initialized.") + raise ReferenceError("ChunkedArray object references a NULL " + "pointer. Not initialized.") def length(self): self._check_nullptr() @@ -144,6 +146,130 @@ cdef class Column: return chunked_array +cdef _schema_from_arrays(arrays, names, shared_ptr[CSchema]* schema): + cdef: + Array arr + c_string c_name + vector[shared_ptr[CField]] fields + + cdef int K = len(arrays) + + fields.resize(K) + for i in range(K): + arr = arrays[i] + c_name = tobytes(names[i]) + fields[i].reset(new CField(c_name, arr.type.sp_type, True)) + + schema.reset(new CSchema(fields)) + + + +cdef _dataframe_to_arrays(df, name, timestamps_to_ms): + from pyarrow.array import from_pandas_series + + cdef: + list names = [] + list arrays = [] + + for name in df.columns: + col = df[name] + arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms) + + names.append(name) + arrays.append(arr) + + return names, arrays + + +cdef class RecordBatch: + + def __cinit__(self): + self.batch = NULL + self._schema = None + + cdef init(self, const shared_ptr[CRecordBatch]& batch): + self.sp_batch = batch + self.batch = batch.get() + + cdef _check_nullptr(self): + if self.batch == NULL: + raise ReferenceError("Object not initialized") + + def __len__(self): + self._check_nullptr() + return self.batch.num_rows() + + property num_columns: + + def __get__(self): + self._check_nullptr() + return self.batch.num_columns() + + property num_rows: + + def __get__(self): + return len(self) + + property schema: + + def __get__(self): + cdef Schema schema + self._check_nullptr() + if self._schema is None: + schema = Schema() + schema.init_schema(self.batch.schema()) + self._schema = schema + + return self._schema + + def __getitem__(self, i): + cdef Array arr = Array() + arr.init(self.batch.column(i)) + return arr + + def equals(self, RecordBatch other): + self._check_nullptr() + other._check_nullptr() + + return self.batch.Equals(deref(other.batch)) + + @classmethod + def from_pandas(cls, df): + """ + Convert pandas.DataFrame to an Arrow RecordBatch + """ + names, arrays = _dataframe_to_arrays(df, None, False) + return cls.from_arrays(names, arrays) + + @staticmethod + def from_arrays(names, arrays): + cdef: + Array arr + RecordBatch result + c_string c_name + shared_ptr[CSchema] schema + shared_ptr[CRecordBatch] batch + vector[shared_ptr[CArray]] c_arrays + 
int32_t num_rows + + if len(arrays) == 0: + raise ValueError('Record batch cannot contain no arrays (for now)') + + num_rows = len(arrays[0]) + _schema_from_arrays(arrays, names, &schema) + + for i in range(len(arrays)): + arr = arrays[i] + c_arrays.push_back(arr.sp_array) + + batch.reset(new CRecordBatch(schema, num_rows, c_arrays)) + + result = RecordBatch() + result.init(batch) + + return result + + cdef class Table: ''' Do not call this class's constructor directly. @@ -161,38 +287,50 @@ cdef class Table: raise ReferenceError("Table object references a NULL pointer." "Not initialized.") - @staticmethod - def from_pandas(df, name=None): - return from_pandas_dataframe(df, name=name) + @classmethod + def from_pandas(cls, df, name=None, timestamps_to_ms=False): + """ + Convert pandas.DataFrame to an Arrow Table + + Parameters + ---------- + df: pandas.DataFrame + + name: str + + timestamps_to_ms: bool + Convert datetime columns to ms resolution. This is needed for + compability with other functionality like Parquet I/O which + only supports milliseconds. + """ + names, arrays = _dataframe_to_arrays(df, name=name, + timestamps_to_ms=timestamps_to_ms) + return cls.from_arrays(names, arrays, name=name) @staticmethod def from_arrays(names, arrays, name=None): cdef: Array arr - Table result c_string c_name vector[shared_ptr[CField]] fields vector[shared_ptr[CColumn]] columns + Table result shared_ptr[CSchema] schema shared_ptr[CTable] table - cdef int K = len(arrays) + _schema_from_arrays(arrays, names, &schema) - fields.resize(K) + cdef int K = len(arrays) columns.resize(K) for i in range(K): arr = arrays[i] - c_name = tobytes(names[i]) - - fields[i].reset(new CField(c_name, arr.type.sp_type, True)) - columns[i].reset(new CColumn(fields[i], arr.sp_array)) + columns[i].reset(new CColumn(schema.get().field(i), arr.sp_array)) if name is None: c_name = '' else: c_name = tobytes(name) - schema.reset(new CSchema(fields)) table.reset(new CTable(c_name, schema, columns)) result = Table() @@ -268,32 +406,4 @@ cdef class Table: -def from_pandas_dataframe(object df, name=None, timestamps_to_ms=False): - """ - Convert pandas.DataFrame to an Arrow Table - - Parameters - ---------- - df: pandas.DataFrame - - name: str - - timestamps_to_ms: bool - Convert datetime columns to ms resolution. This is needed for - compability with other functionality like Parquet I/O which - only supports milliseconds. 
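
A short usage sketch of the conversion paths reworked here: both RecordBatch.from_pandas and the new Table.from_pandas classmethod funnel through _dataframe_to_arrays (the sample frame is illustrative):

    import pandas as pd
    import pyarrow as A

    df = pd.DataFrame({'a': range(5), 'b': [-10, -5, 0, 5, 10]})

    batch = A.RecordBatch.from_pandas(df)
    table = A.Table.from_pandas(df, name='table_name')
    assert batch.num_rows == table.num_rows == 5
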
- """ - from pyarrow.array import from_pandas_series - - cdef: - list names = [] - list arrays = [] - - for name in df.columns: - col = df[name] - arr = from_pandas_series(col, timestamps_to_ms=timestamps_to_ms) - - names.append(name) - arrays.append(arr) - - return Table.from_arrays(names, arrays, name=name) +from_pandas_dataframe = Table.from_pandas diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 86147f8df5a..0a17f691ccd 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -19,6 +19,10 @@ import pyarrow.formatting as fmt +def test_total_bytes_allocated(): + assert pyarrow.total_allocated_bytes() == 0 + + def test_repr_on_pre_init_array(): arr = pyarrow.array.Array() assert len(repr(arr)) > 0 diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 9a41ebe3e8c..211a12bcd92 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -98,3 +98,44 @@ def test_bytes_reader(): def test_bytes_reader_non_bytes(): with pytest.raises(ValueError): io.BytesReader(u('some sample data')) + + + +# ---------------------------------------------------------------------- +# Buffers + + +def test_buffer_bytes(): + val = b'some data' + + buf = io.buffer_from_bytes(val) + assert isinstance(buf, io.Buffer) + + result = buf.to_pybytes() + + assert result == val + + +def test_memory_output_stream(): + # 10 bytes + val = b'dataabcdef' + + f = io.InMemoryOutputStream() + + K = 1000 + for i in range(K): + f.write(val) + + buf = f.get_result() + + assert len(buf) == len(val) * K + assert buf.to_pybytes() == val * K + + +def test_inmemory_write_after_closed(): + f = io.InMemoryOutputStream() + f.write(b'ok') + f.get_result() + + with pytest.raises(IOError): + f.write(b'not ok') diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py new file mode 100644 index 00000000000..b9e9e6ed0c4 --- /dev/null +++ b/python/pyarrow/tests/test_ipc.py @@ -0,0 +1,116 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import io + +import numpy as np +import pandas as pd + +import pyarrow as A +import pyarrow.io as arrow_io +import pyarrow.ipc as ipc + + +class RoundtripTest(object): + # Also tests writing zero-copy NumPy array with additional padding + + def __init__(self): + self.sink = self._get_sink() + + def _get_sink(self): + return io.BytesIO() + + def _get_source(self): + return self.sink.getvalue() + + def run(self): + nrows = 5 + df = pd.DataFrame({ + 'one': np.random.randn(nrows), + 'two': ['foo', np.nan, 'bar', 'bazbaz', 'qux']}) + + batch = A.RecordBatch.from_pandas(df) + writer = ipc.ArrowFileWriter(self.sink, batch.schema) + + num_batches = 5 + frames = [] + batches = [] + for i in range(num_batches): + unique_df = df.copy() + unique_df['one'] = np.random.randn(nrows) + + batch = A.RecordBatch.from_pandas(unique_df) + writer.write_record_batch(batch) + frames.append(unique_df) + batches.append(batch) + + writer.close() + + file_contents = self._get_source() + reader = ipc.ArrowFileReader(file_contents) + + assert reader.num_record_batches == num_batches + + for i in range(num_batches): + # it works. Must convert back to DataFrame + batch = reader.get_record_batch(i) + assert batches[i].equals(batch) + + +class InMemoryStreamTest(RoundtripTest): + + def _get_sink(self): + return arrow_io.InMemoryOutputStream() + + def _get_source(self): + return self.sink.get_result() + + +def test_ipc_file_simple_roundtrip(): + helper = RoundtripTest() + helper.run() + + +# XXX: For benchmarking + +def big_batch(): + df = pd.DataFrame( + np.random.randn(2**4, 2**20).T, + columns=[str(i) for i in range(2**4)] + ) + + df = pd.concat([df] * 2 ** 3, ignore_index=True) + + return A.RecordBatch.from_pandas(df) + + +def write_to_memory(batch): + sink = io.BytesIO() + write_file(batch, sink) + return sink.getvalue() + + +def write_file(batch, sink): + writer = ipc.ArrowFileWriter(sink, batch.schema) + writer.write_record_batch(batch) + writer.close() + + +def read_file(source): + reader = ipc.ArrowFileReader(source) + return [reader.get_record_batch(i) + for i in range(reader.num_record_batches)] diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index abf143199fe..c5130329e02 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -15,60 +15,52 @@ # specific language governing permissions and limitations # under the License. 
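
The round trip in test_ipc.py above also exercises the new padding rule from adapter.cc: each buffer is written at its true size and then zero-padded up to a multiple of 64 bytes, which is what allows NumPy arrays to be written zero-copy. The C++ helper util::RoundUpToMultipleOf64 rounds with a carry-and-mask trick; the same arithmetic as a Python sketch (not part of the API):

    def round_up_to_multiple_of_64(n):
        # (n + 63) & ~63: force a carry into the 64s place, then truncate
        return (n + 63) & ~63

    assert round_up_to_multiple_of_64(1) == 64
    assert round_up_to_multiple_of_64(64) == 64
    assert round_up_to_multiple_of_64(65) == 128
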
-from pyarrow.compat import unittest import pyarrow as A -class TestRowBatch(unittest.TestCase): +def test_recordbatch_basics(): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] - def test_basics(self): - data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) - ] - num_rows = 5 + batch = A.RecordBatch.from_arrays(['c0', 'c1'], data) - descr = A.schema([A.field('c0', data[0].type), - A.field('c1', data[1].type)]) + assert len(batch) == 5 + assert batch.num_rows == 5 + assert batch.num_columns == len(data) - batch = A.RowBatch(descr, num_rows, data) - assert len(batch) == num_rows - assert batch.num_rows == num_rows - assert batch.num_columns == len(data) +def test_table_basics(): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + assert table.name == 'table_name' + assert len(table) == 5 + assert table.num_rows == 5 + assert table.num_columns == 2 + assert table.shape == (5, 2) + for col in table.itercolumns(): + for chunk in col.data.iterchunks(): + assert chunk is not None -class TestTable(unittest.TestCase): - def test_basics(self): - data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) - ] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - assert table.name == 'table_name' - assert len(table) == 5 - assert table.num_rows == 5 - assert table.num_columns == 2 - assert table.shape == (5, 2) +def test_table_pandas(): + data = [ + A.from_pylist(range(5)), + A.from_pylist([-10, -5, 0, 5, 10]) + ] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - for col in table.itercolumns(): - for chunk in col.data.iterchunks(): - assert chunk is not None + # TODO: Use this part once from_pandas is implemented + # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]} + # df = pd.DataFrame(data) + # A.Table.from_pandas(df) - def test_pandas(self): - data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) - ] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - - # TODO: Use this part once from_pandas is implemented - # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]} - # df = pd.DataFrame(data) - # A.Table.from_pandas(df) - - df = table.to_pandas() - assert set(df.columns) == set(('a', 'b')) - assert df.shape == (5, 2) - assert df.ix[0, 'b'] == -10 + df = table.to_pandas() + assert set(df.columns) == set(('a', 'b')) + assert df.shape == (5, 2) + assert df.loc[0, 'b'] == -10 diff --git a/python/setup.py b/python/setup.py index d1be122888e..d040ea7e892 100644 --- a/python/setup.py +++ b/python/setup.py @@ -102,6 +102,7 @@ def initialize_options(self): 'config', 'error', 'io', + 'ipc', 'parquet', 'scalar', 'schema', diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 78ef1b31f34..680f3a539b5 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -426,7 +426,7 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { // Give the sequence converter an array builder std::shared_ptr builder; - RETURN_ARROW_NOT_OK(arrow::MakeBuilder(GetMemoryPool(), type, &builder)); + RETURN_ARROW_NOT_OK(arrow::MakeBuilder(get_memory_pool(), type, &builder)); converter->Init(builder); PY_RETURN_NOT_OK(converter->AppendData(obj)); diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index d224074d652..ae24b7ee584 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ 
b/python/src/pyarrow/adapters/pandas.cc @@ -602,6 +602,8 @@ class ArrowDeserializer { } Status AllocateOutput(int type) { + PyAcquireGIL lock; + npy_intp dims[1] = {col_->length()}; out_ = reinterpret_cast(PyArray_SimpleNew(1, dims, type)); @@ -616,6 +618,8 @@ class ArrowDeserializer { } Status OutputFromData(int type, void* data) { + PyAcquireGIL lock; + // Zero-Copy. We can pass the data pointer directly to NumPy. Py_INCREF(py_ref_); OwnedRef py_ref(py_ref_); @@ -706,6 +710,8 @@ class ArrowDeserializer { inline typename std::enable_if< arrow_traits::is_boolean, Status>::type ConvertValues(const std::shared_ptr& arr) { + PyAcquireGIL lock; + arrow::BooleanArray* bool_arr = static_cast(arr.get()); if (arr->null_count() > 0) { @@ -743,6 +749,8 @@ class ArrowDeserializer { inline typename std::enable_if< T2 == arrow::Type::STRING, Status>::type ConvertValues(const std::shared_ptr& arr) { + PyAcquireGIL lock; + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); PyObject** out_values = reinterpret_cast(PyArray_DATA(out_)); diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index 82b14fdf401..09f3efb5a03 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -63,7 +63,7 @@ class PyArrowMemoryPool : public arrow::MemoryPool { int64_t bytes_allocated_; }; -arrow::MemoryPool* GetMemoryPool() { +arrow::MemoryPool* get_memory_pool() { static PyArrowMemoryPool memory_pool; return &memory_pool; } diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index bc599f84fab..96eed1654a7 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -109,7 +109,8 @@ class PyGILGuard { return Status::UnknownError(message); \ } -PYARROW_EXPORT arrow::MemoryPool* GetMemoryPool(); +// Return the common PyArrow memory pool +PYARROW_EXPORT arrow::MemoryPool* get_memory_pool(); class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { public: @@ -120,6 +121,7 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { data_ = reinterpret_cast(PyArray_DATA(arr_)); size_ = PyArray_SIZE(arr_); + capacity_ = size_ * PyArray_DESCR(arr_)->elsize; } virtual ~NumPyBuffer() { @@ -139,6 +141,22 @@ class PYARROW_EXPORT PyBytesBuffer : public arrow::Buffer { PyObject* obj_; }; + +class PyAcquireGIL { + public: + PyAcquireGIL() { + state_ = PyGILState_Ensure(); + } + + ~PyAcquireGIL() { + PyGILState_Release(state_); + } + + private: + PyGILState_STATE state_; + DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); +}; + } // namespace pyarrow #endif // PYARROW_COMMON_H diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 35054e9025a..9879b3474bc 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -47,9 +47,9 @@ static arrow::Status CheckPyError() { PyErr_Fetch(&exc_type, &exc_value, &traceback); PyObjectStringify stringified(exc_value); std::string message(stringified.bytes); - Py_DECREF(exc_type); - Py_DECREF(exc_value); - Py_DECREF(traceback); + Py_XDECREF(exc_type); + Py_XDECREF(exc_value); + Py_XDECREF(traceback); PyErr_Clear(); return arrow::Status::IOError(message); } From fb799bc8f818574aacf380b2694aec011d2c18dd Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Mon, 10 Oct 2016 22:49:47 -0400 Subject: [PATCH 166/210] ARROW-112: Changed constexprs to kValue naming. Consistent with Google style. Author: Leif Walsh Closes #168 from leifwalsh/constant-name-fix-no-enum and squashes the following commits: 37a0b34 [Leif Walsh] ARROW-112: Changed constexprs to kValue naming. 
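
Among the renames below is the bit-util.h BITMASK table, now kBitmask, which backs the per-bit accessors; a rough Python paraphrase of what those helpers do (illustrative only):

    kBitmask = [1, 2, 4, 8, 16, 32, 64, 128]

    def get_bit(bits, i):
        # bits is a bytes-like bitmap: pick byte i // 8, test bit i % 8
        return bool(bits[i // 8] & kBitmask[i % 8])

    def set_bit(bits, i):
        bits[i // 8] |= kBitmask[i % 8]  # requires a mutable buffer, e.g. bytearray

    bm = bytearray(2)
    set_bit(bm, 9)
    assert get_bit(bm, 9) and not get_bit(bm, 8)
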
--- cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/types/json.cc | 6 +++--- cpp/src/arrow/types/primitive-test.cc | 8 ++++---- cpp/src/arrow/types/primitive.cc | 2 +- cpp/src/arrow/util/bit-util.h | 10 +++++----- cpp/src/arrow/util/buffer.h | 2 -- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 5d9fb992ff0..646a6f24e9d 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -33,7 +33,7 @@ class Array; class MemoryPool; class PoolBuffer; -static constexpr int32_t MIN_BUILDER_CAPACITY = 1 << 5; +static constexpr int32_t kMinBuilderCapacity = 1 << 5; // Base class for all data array builders. // This class provides a facilities for incrementally building the null bitmap diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc index a4e0d085620..89240fc22bb 100644 --- a/cpp/src/arrow/types/json.cc +++ b/cpp/src/arrow/types/json.cc @@ -30,8 +30,8 @@ static const TypePtr String(new StringType()); static const TypePtr Double(new DoubleType()); static const TypePtr Bool(new BooleanType()); -static const std::vector json_types = {Null, Int32, String, Double, Bool}; -TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(json_types)); -TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(json_types)); +static const std::vector kJsonTypes = {Null, Int32, String, Double, Bool}; +TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(kJsonTypes)); +TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(kJsonTypes)); } // namespace arrow diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 87eb0fe3a8b..5ac2867932d 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -460,7 +460,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAdvance) { TYPED_TEST(TestPrimitiveBuilder, TestResize) { DECL_TYPE(); - int cap = MIN_BUILDER_CAPACITY * 2; + int cap = kMinBuilderCapacity * 2; ASSERT_OK(this->builder_->Reserve(cap)); ASSERT_EQ(cap, this->builder_->capacity()); @@ -472,13 +472,13 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_OK(this->builder_->Reserve(10)); ASSERT_EQ(0, this->builder_->length()); - ASSERT_EQ(MIN_BUILDER_CAPACITY, this->builder_->capacity()); + ASSERT_EQ(kMinBuilderCapacity, this->builder_->capacity()); ASSERT_OK(this->builder_->Reserve(90)); ASSERT_OK(this->builder_->Advance(100)); - ASSERT_OK(this->builder_->Reserve(MIN_BUILDER_CAPACITY)); + ASSERT_OK(this->builder_->Reserve(kMinBuilderCapacity)); - ASSERT_EQ(util::next_power2(MIN_BUILDER_CAPACITY + 100), this->builder_->capacity()); + ASSERT_EQ(util::next_power2(kMinBuilderCapacity + 100), this->builder_->capacity()); } } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 375e94f2bc1..9ba2ebdcc2d 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -86,7 +86,7 @@ Status PrimitiveBuilder::Init(int32_t capacity) { template Status PrimitiveBuilder::Resize(int32_t capacity) { // XXX: Set floor size for now - if (capacity < MIN_BUILDER_CAPACITY) { capacity = MIN_BUILDER_CAPACITY; } + if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } if (capacity_ == 0) { RETURN_NOT_OK(Init(capacity)); diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 3087ce7784d..c33ef272f05 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -44,22 
+44,22 @@ static inline int64_t ceil_2bytes(int64_t size) { return (size + 15) & ~15; } -static constexpr uint8_t BITMASK[] = {1, 2, 4, 8, 16, 32, 64, 128}; +static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; static inline bool get_bit(const uint8_t* bits, int i) { - return static_cast(bits[i / 8] & BITMASK[i % 8]); + return static_cast(bits[i / 8] & kBitmask[i % 8]); } static inline bool bit_not_set(const uint8_t* bits, int i) { - return (bits[i / 8] & BITMASK[i % 8]) == 0; + return (bits[i / 8] & kBitmask[i % 8]) == 0; } static inline void clear_bit(uint8_t* bits, int i) { - bits[i / 8] &= ~BITMASK[i % 8]; + bits[i / 8] &= ~kBitmask[i % 8]; } static inline void set_bit(uint8_t* bits, int i) { - bits[i / 8] |= BITMASK[i % 8]; + bits[i / 8] |= kBitmask[i % 8]; } static inline int64_t next_power2(int64_t n) { diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index 01e4259c31f..bc0df86221c 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -141,8 +141,6 @@ class ARROW_EXPORT PoolBuffer : public ResizableBuffer { MemoryPool* pool_; }; -static constexpr int64_t MIN_BUFFER_CAPACITY = 1024; - class BufferBuilder { public: explicit BufferBuilder(MemoryPool* pool) From 8c8d341e12efcedecd3c2545aaf349bf5f899bc1 Mon Sep 17 00:00:00 2001 From: Steven Phillips Date: Mon, 10 Oct 2016 13:42:41 -0700 Subject: [PATCH 167/210] ARROW-326: Include scale and precision when materializing decimal writer closes #166 --- java/vector/src/main/codegen/templates/MapWriters.java | 5 +++++ .../arrow/vector/complex/impl/TestPromotableWriter.java | 9 ++++++--- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 9fe20df7a1d..696bbf655ca 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -73,7 +73,12 @@ public class ${mode}MapWriter extends AbstractFieldWriter { <#if lowerName == "int" ><#assign lowerName = "integer" /> <#assign upperName = minor.class?upper_case /> case ${upperName}: + <#if lowerName == "decimal" > + Decimal decimal = (Decimal)child.getType(); + decimal(child.getName(), decimal.getScale(), decimal.getPrecision()); + <#else> ${lowerName}(child.getName()); + break; } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java index d439cebeda6..176ad5195b3 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/impl/TestPromotableWriter.java @@ -67,6 +67,8 @@ public void testPromoteToUnion() throws Exception { writer.setPosition(1); writer.bit("A").writeBit(1); + writer.decimal("dec", 10,10); + writer.setPosition(2); writer.integer("A").writeInt(10); @@ -108,9 +110,10 @@ public void testPromoteToUnion() throws Exception { newMapWriter.setPosition(2); newMapWriter.integer("A").writeInt(10); - Field childField = container.getField().getChildren().get(0).getChildren().get(0); - assertEquals("Child field should be union type: " + childField.getName(), Type.Union, childField.getType().getTypeType()); - + Field childField1 = container.getField().getChildren().get(0).getChildren().get(0); + Field childField2 = container.getField().getChildren().get(0).getChildren().get(1); + assertEquals("Child field 
should be union type: " + childField1.getName(), Type.Union, childField1.getType().getTypeType()); + assertEquals("Child field should be decimal type: " + childField2.getName(), Type.Decimal, childField2.getType().getTypeType()); } } } From 994aa5a903917aca0c9dd372341d4dcbc8be3aa5 Mon Sep 17 00:00:00 2001 From: Leif Walsh Date: Tue, 11 Oct 2016 14:00:36 -0400 Subject: [PATCH 168/210] ARROW-189: Build 3rd party with ExternalProject. When third party env vars *_HOME are not present, use cmake's ExternalProject to fetch and build them. When those vars are present, we just use them. Author: Leif Walsh Closes #167 from leifwalsh/cmake-externalproject and squashes the following commits: e4fb63a [Leif Walsh] ARROW-189: Remove 3rd party from conda build. 7892bae [Leif Walsh] ARROW-189: Fix darwin build. 8630428 [Leif Walsh] ARROW-189: Addressed CR comments. 8215abc [Leif Walsh] ARROW-189: Build 3rd party with ExternalProject. --- ci/travis_before_script_cpp.sh | 8 -- ci/travis_script_python.sh | 5 -- cpp/CMakeLists.txt | 107 ++++++++++++++++++++++---- cpp/README.md | 18 +---- cpp/conda.recipe/build.sh | 10 --- cpp/doc/Parquet.md | 1 - cpp/setup_build_env.sh | 21 ----- cpp/src/arrow/ipc/CMakeLists.txt | 10 ++- cpp/thirdparty/build_thirdparty.sh | 104 ------------------------- cpp/thirdparty/download_thirdparty.sh | 44 ----------- cpp/thirdparty/set_thirdparty_env.sh | 24 ------ cpp/thirdparty/versions.sh | 23 ------ 12 files changed, 101 insertions(+), 274 deletions(-) delete mode 100755 cpp/setup_build_env.sh delete mode 100755 cpp/thirdparty/build_thirdparty.sh delete mode 100755 cpp/thirdparty/download_thirdparty.sh delete mode 100755 cpp/thirdparty/set_thirdparty_env.sh delete mode 100755 cpp/thirdparty/versions.sh diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index acd820bbed2..2d4224b3333 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -26,14 +26,6 @@ pushd $CPP_BUILD_DIR CPP_DIR=$TRAVIS_BUILD_DIR/cpp -# Build an isolated thirdparty -cp -r $CPP_DIR/thirdparty . -cp $CPP_DIR/setup_build_env.sh . 
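For readers of this patch, the following condensed CMake sketch (not part of the commit itself; the URL, version pin, and flags are taken from the CMakeLists.txt diff below, everything else is simplified) shows the vendored-versus-system fallback pattern that ARROW-189 applies to gtest, gbenchmark, and flatbuffers.

```cmake
# Illustrative sketch only, distilled from the cpp/CMakeLists.txt changes in
# this commit; not a verbatim excerpt. If GTEST_HOME is unset, googletest is
# fetched and built via ExternalProject; otherwise the existing installation
# is used. GTEST_STATIC_LIB is Arrow's own variable, set by either branch.
include(ExternalProject)

if("$ENV{GTEST_HOME}" STREQUAL "")
  ExternalProject_Add(googletest_ep
    URL "https://github.com/google/googletest/archive/release-1.7.0.tar.gz"
    CMAKE_ARGS -DCMAKE_CXX_FLAGS=-fPIC
    BUILD_IN_SOURCE 1       # googletest 1.7 defines no install rules
    INSTALL_COMMAND "")
  set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep")
  set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include")
  set(GTEST_STATIC_LIB "${GTEST_PREFIX}/libgtest.a")
  set(GTEST_VENDORED 1)
else()
  find_package(GTest REQUIRED)  # trust the toolchain the user pointed us at
  set(GTEST_VENDORED 0)
endif()

include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
add_library(gtest STATIC IMPORTED)
set_target_properties(gtest PROPERTIES IMPORTED_LOCATION ${GTEST_STATIC_LIB})
if(GTEST_VENDORED)
  # Guarantee the download/build finishes before anything links against gtest.
  add_dependencies(gtest googletest_ep)
endif()
```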
- -source setup_build_env.sh - -echo $GTEST_HOME - : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} CMAKE_COMMON_FLAGS="\ diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index a75ff0778bc..97f0563240c 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -21,11 +21,6 @@ export MINICONDA=$HOME/miniconda export PATH="$MINICONDA/bin:$PATH" export PARQUET_HOME=$MINICONDA -# Share environment with C++ -pushd $CPP_BUILD_DIR -source setup_build_env.sh -popd - pushd $PYTHON_DIR python_version_tests() { diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f70c8ab4bcc..d682dc76f8c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -21,10 +21,15 @@ project(arrow) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake_modules") include(CMakeParseArguments) +include(ExternalProject) set(BUILD_SUPPORT_DIR "${CMAKE_SOURCE_DIR}/build-support") set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty") +set(GTEST_VERSION "1.7.0") +set(GBENCHMARK_VERSION "1.0.0") +set(FLATBUFFERS_VERSION "1.3.0") + find_package(ClangTools) if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND) # Generate a Clang compile_commands.json "compilation database" file for use @@ -422,16 +427,6 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) endif() endfunction() -## GTest -if ("$ENV{GTEST_HOME}" STREQUAL "") - set(GTest_HOME ${THIRDPARTY_DIR}/googletest-release-1.7.0) -endif() - -## Google Benchmark -if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") - set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) -endif() - # ---------------------------------------------------------------------- # Add Boost dependencies (code adapted from Apache Kudu (incubating)) @@ -476,18 +471,78 @@ include_directories(SYSTEM ${Boost_INCLUDE_DIR}) if(ARROW_BUILD_TESTS) add_custom_target(unittest ctest -L unittest) - find_package(GTest REQUIRED) + + if("$ENV{GTEST_HOME}" STREQUAL "") + if(APPLE) + set(GTEST_CMAKE_CXX_FLAGS "-fPIC -std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes") + else() + set(GTEST_CMAKE_CXX_FLAGS "-fPIC") + endif() + + ExternalProject_Add(googletest_ep + URL "https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${GTEST_CMAKE_CXX_FLAGS} + # googletest doesn't define install rules, so just build in the + # source dir and don't try to install. See its README for + # details. 
+ BUILD_IN_SOURCE 1 + INSTALL_COMMAND "") + + set(GTEST_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/googletest_ep-prefix/src/googletest_ep") + set(GTEST_INCLUDE_DIR "${GTEST_PREFIX}/include") + set(GTEST_STATIC_LIB "${GTEST_PREFIX}/libgtest.a") + set(GTEST_VENDORED 1) + else() + find_package(GTest REQUIRED) + set(GTEST_VENDORED 0) + endif() + + message(STATUS "GTest include dir: ${GTEST_INCLUDE_DIR}") + message(STATUS "GTest static library: ${GTEST_STATIC_LIB}") include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(gtest STATIC_LIB ${GTEST_STATIC_LIB}) + + if(GTEST_VENDORED) + add_dependencies(gtest googletest_ep) + endif() endif() if(ARROW_BUILD_BENCHMARKS) add_custom_target(runbenchmark ctest -L benchmark) - find_package(GBenchmark REQUIRED) + + if("$ENV{GBENCHMARK_HOME}" STREQUAL "") + if(APPLE) + set(GBENCHMARK_CMAKE_CXX_FLAGS "-std=c++11 -stdlib=libc++") + else() + set(GBENCHMARK_CMAKE_CXX_FLAGS "--std=c++11") + endif() + + set(GBENCHMARK_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/gbenchmark_ep/src/gbenchmark_ep-install") + ExternalProject_Add(gbenchmark_ep + URL "https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" + CMAKE_ARGS + "-DCMAKE_BUILD_TYPE=Release" + "-DCMAKE_INSTALL_PREFIX:PATH=${GBENCHMARK_PREFIX}" + "-DCMAKE_CXX_FLAGS=-fPIC ${GBENCHMARK_CMAKE_CXX_FLAGS}") + + set(GBENCHMARK_INCLUDE_DIR "${GBENCHMARK_PREFIX}/include") + set(GBENCHMARK_STATIC_LIB "${GBENCHMARK_PREFIX}/lib/libbenchmark.a") + set(GBENCHMARK_VENDORED 1) + else() + find_package(GBenchmark REQUIRED) + set(GBENCHMARK_VENDORED 0) + endif() + + message(STATUS "GBenchmark include dir: ${GBENCHMARK_INCLUDE_DIR}") + message(STATUS "GBenchmark static library: ${GBENCHMARK_STATIC_LIB}") include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(benchmark STATIC_LIB ${GBENCHMARK_STATIC_LIB}) + + if(GBENCHMARK_VENDORED) + add_dependencies(benchmark gbenchmark_ep) + endif() endif() ## Google PerfTools @@ -705,14 +760,34 @@ add_subdirectory(src/arrow/types) ## Flatbuffers if(ARROW_IPC) - find_package(Flatbuffers REQUIRED) + if("$ENV{FLATBUFFERS_HOME}" STREQUAL "") + set(FLATBUFFERS_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/flatbuffers_ep-prefix/src/flatbuffers_ep-install") + ExternalProject_Add(flatbuffers_ep + URL "https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz" + CMAKE_ARGS + "-DCMAKE_CXX_FLAGS=-fPIC" + "-DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_PREFIX}" + "-DFLATBUFFERS_BUILD_TESTS=OFF") + + set(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_PREFIX}/include") + set(FLATBUFFERS_STATIC_LIB "${FLATBUFFERS_PREFIX}/libflatbuffers.a") + set(FLATBUFFERS_COMPILER "${FLATBUFFERS_PREFIX}/bin/flatc") + set(FLATBUFFERS_VENDORED 1) + else() + find_package(Flatbuffers REQUIRED) + set(FLATBUFFERS_VENDORED 0) + endif() + message(STATUS "Flatbuffers include dir: ${FLATBUFFERS_INCLUDE_DIR}") message(STATUS "Flatbuffers static library: ${FLATBUFFERS_STATIC_LIB}") message(STATUS "Flatbuffers compiler: ${FLATBUFFERS_COMPILER}") include_directories(SYSTEM ${FLATBUFFERS_INCLUDE_DIR}) - add_library(flatbuffers STATIC IMPORTED) - set_target_properties(flatbuffers PROPERTIES - IMPORTED_LOCATION ${FLATBUFFERS_STATIC_LIB}) + ADD_THIRDPARTY_LIB(flatbuffers + STATIC_LIB ${FLATBUFFERS_STATIC_LIB}) + + if(FLATBUFFERS_VENDORED) + add_dependencies(flatbuffers flatbuffers_ep) + endif() add_subdirectory(src/arrow/ipc) endif() diff --git a/cpp/README.md b/cpp/README.md index a1c3ef28447..190e6f85b42 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -22,23 +22,6 @@ out-of-source builds with the latter one 
being preferred. Arrow requires a C++11-enabled compiler. On Linux, gcc 4.8 and higher should be sufficient. -To build the thirdparty build dependencies, run: - -``` -./thirdparty/download_thirdparty.sh -./thirdparty/build_thirdparty.sh -source ./thirdparty/set_thirdparty_env.sh -``` - -You can also run from the root of the C++ tree - -``` -source setup_build_env.sh -``` - -Arrow is configured to use the `thirdparty` directory by default for its build -dependencies. To set up a custom toolchain see below. - Simple debug build: mkdir debug @@ -76,6 +59,7 @@ variables * Googletest: `GTEST_HOME` (only required to build the unit tests) * Google Benchmark: `GBENCHMARK_HOME` (only required if building benchmarks) * Flatbuffers: `FLATBUFFERS_HOME` (only required for the IPC extensions) +* Hadoop: `HADOOP_HOME` (only required for the HDFS I/O extensions) ## Continuous Integration diff --git a/cpp/conda.recipe/build.sh b/cpp/conda.recipe/build.sh index 6d7454e9272..0536fd99b5c 100644 --- a/cpp/conda.recipe/build.sh +++ b/cpp/conda.recipe/build.sh @@ -38,19 +38,9 @@ cd .. rm -rf conda-build mkdir conda-build - -cp -r thirdparty conda-build/ - cd conda-build pwd -# Build googletest for running unit tests -./thirdparty/download_thirdparty.sh -./thirdparty/build_thirdparty.sh gtest - -source thirdparty/versions.sh -export GTEST_HOME=`pwd`/thirdparty/$GTEST_BASEDIR - # if [ `uname` == Linux ]; then # SHARED_LINKER_FLAGS='-static-libstdc++' # elif [ `uname` == Darwin ]; then diff --git a/cpp/doc/Parquet.md b/cpp/doc/Parquet.md index 34b83e78d0a..4985dd3b0bc 100644 --- a/cpp/doc/Parquet.md +++ b/cpp/doc/Parquet.md @@ -24,7 +24,6 @@ export ARROW_HOME=$HOME/local git clone https://github.com/apache/parquet-cpp.git cd parquet-cpp -source setup_build_env.sh cmake -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME -DPARQUET_ARROW=on make -j4 make install diff --git a/cpp/setup_build_env.sh b/cpp/setup_build_env.sh deleted file mode 100755 index 546216753b3..00000000000 --- a/cpp/setup_build_env.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. 
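To make the new workflow concrete, here is a minimal sketch (not part of the commit) of building against a custom toolchain now that the bundled bootstrap scripts are gone; the /opt/toolchain paths are hypothetical, and only the variables for the components you actually build are needed.

```bash
# Hypothetical install locations; substitute your own toolchain paths.
export GTEST_HOME=/opt/toolchain/googletest         # only for unit tests
export GBENCHMARK_HOME=/opt/toolchain/benchmark     # only for benchmarks
export FLATBUFFERS_HOME=/opt/toolchain/flatbuffers  # only for the IPC extensions

# With no *_HOME variables set, the build vendors these via ExternalProject instead.
mkdir debug && cd debug
cmake -DARROW_BUILD_TESTS=ON ..
make unittest
```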
- -SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -./thirdparty/download_thirdparty.sh || { echo "download_thirdparty.sh failed" ; return; } -./thirdparty/build_thirdparty.sh || { echo "build_thirdparty.sh failed" ; return; } -source ./thirdparty/set_thirdparty_env.sh || { echo "source set_thirdparty_env.sh failed" ; return; } - -echo "Build env initialized" diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 8dcd9ac1071..d2db339de7e 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -42,6 +42,9 @@ set(ARROW_IPC_SRCS add_library(arrow_ipc SHARED ${ARROW_IPC_SRCS} ) +if(FLATBUFFERS_VENDORED) + add_dependencies(arrow_ipc flatbuffers_ep) +endif() target_link_libraries(arrow_ipc LINK_PUBLIC ${ARROW_IPC_LINK_LIBS} LINK_PRIVATE ${ARROW_IPC_PRIVATE_LINK_LIBS}) @@ -91,10 +94,15 @@ foreach(FIL ${FBS_SRC}) list(APPEND ABS_FBS_SRC ${ABS_FIL}) endforeach() +if(FLATBUFFERS_VENDORED) + set(FBS_DEPENDS ${ABS_FBS_SRC} flatbuffers_ep) +else() + set(FBS_DEPENDS ${ABS_FBS_SRC}) +endif() add_custom_command( OUTPUT ${FBS_OUTPUT_FILES} COMMAND ${FLATBUFFERS_COMPILER} -c -o ${OUTPUT_DIR} ${ABS_FBS_SRC} - DEPENDS ${ABS_FBS_SRC} + DEPENDS ${FBS_DEPENDS} COMMENT "Running flatc compiler on ${ABS_FBS_SRC}" VERBATIM ) diff --git a/cpp/thirdparty/build_thirdparty.sh b/cpp/thirdparty/build_thirdparty.sh deleted file mode 100755 index 5011e29c01a..00000000000 --- a/cpp/thirdparty/build_thirdparty.sh +++ /dev/null @@ -1,104 +0,0 @@ -#!/bin/bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -set -x -set -e -TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -source $TP_DIR/versions.sh -PREFIX=$TP_DIR/installed - -################################################################################ - -if [ "$#" = "0" ]; then - F_ALL=1 -else - # Allow passing specific libs to build on the command line - for arg in "$*"; do - case $arg in - "gtest") F_GTEST=1 ;; - "gbenchmark") F_GBENCHMARK=1 ;; - "flatbuffers") F_FLATBUFFERS=1 ;; - *) echo "Unknown module: $arg"; exit 1 ;; - esac - done -fi - -################################################################################ - -# Determine how many parallel jobs to use for make based on the number of cores -if [[ "$OSTYPE" =~ ^linux ]]; then - PARALLEL=$(grep -c processor /proc/cpuinfo) -elif [[ "$OSTYPE" == "darwin"* ]]; then - PARALLEL=$(sysctl -n hw.ncpu) -else - echo Unsupported platform $OSTYPE - exit 1 -fi - -mkdir -p "$PREFIX/include" -mkdir -p "$PREFIX/lib" - -# On some systems, autotools installs libraries to lib64 rather than lib. Fix -# this by setting up lib64 as a symlink to lib. We have to do this step first -# to handle cases where one third-party library depends on another. -ln -sf lib "$PREFIX/lib64" - -# use the compiled tools -export PATH=$PREFIX/bin:$PATH - -type cmake >/dev/null 2>&1 || { echo >&2 "cmake not installed. Aborting."; exit 1; } -type make >/dev/null 2>&1 || { echo >&2 "make not installed. 
Aborting."; exit 1; } - -STANDARD_DARWIN_FLAGS="-std=c++11 -stdlib=libc++" - -# build googletest -GOOGLETEST_ERROR="failed for googletest!" -if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then - cd $TP_DIR/$GTEST_BASEDIR - - if [[ "$OSTYPE" == "darwin"* ]]; then - CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="$STANDARD_DARWIN_FLAGS -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit 1; } - else - CXXFLAGS=-fPIC cmake . || { echo "cmake $GOOGLETEST_ERROR"; exit 1; } - fi - - make -j$PARALLEL VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } -fi - -# build google benchmark -GBENCHMARK_ERROR="failed for google benchmark" -if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then - cd $TP_DIR/$GBENCHMARK_BASEDIR - - CMAKE_CXX_FLAGS="--std=c++11" - if [[ "$OSTYPE" == "darwin"* ]]; then - CMAKE_CXX_FLAGS=$STANDARD_DARWIN_FLAGS - fi - cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_CXX_FLAGS="-fPIC $CMAKE_CXX_FLAGS" . || { echo "cmake $GBENCHMARK_ERROR" ; exit 1; } - - make -j$PARALLEL VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } -fi - -FLATBUFFERS_ERROR="failed for flatbuffers" -if [ -n "$F_ALL" -o -n "$F_FLATBUFFERS" ]; then - cd $TP_DIR/$FLATBUFFERS_BASEDIR - - CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX -DFLATBUFFERS_BUILD_TESTS=OFF . || { echo "cmake $FLATBUFFERS_ERROR" ; exit 1; } - make VERBOSE=1 -j$PARALLEL || { echo "make $FLATBUFFERS_ERROR" ; exit 1; } - make install || { echo "install $FLATBUFFERS_ERROR" ; exit 1; } -fi - -echo "---------------------" -echo "Thirdparty dependencies built and installed into $PREFIX successfully" diff --git a/cpp/thirdparty/download_thirdparty.sh b/cpp/thirdparty/download_thirdparty.sh deleted file mode 100755 index b50e7bc06a1..00000000000 --- a/cpp/thirdparty/download_thirdparty.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -set -x -set -e - -TP_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) - -source $TP_DIR/versions.sh - -download_extract_and_cleanup() { - type curl >/dev/null 2>&1 || { echo >&2 "curl not installed. Aborting."; exit 1; } - filename=$TP_DIR/$(basename "$1") - curl -#LC - "$1" -o $filename - tar xzf $filename -C $TP_DIR - rm $filename -} - -if [ ! -d ${GTEST_BASEDIR} ]; then - echo "Fetching gtest" - download_extract_and_cleanup $GTEST_URL -fi - -echo ${GBENCHMARK_BASEDIR} -if [ ! -d ${GBENCHMARK_BASEDIR} ]; then - echo "Fetching google benchmark" - download_extract_and_cleanup $GBENCHMARK_URL -fi - -if [ ! 
-d ${FLATBUFFERS_BASEDIR} ]; then - echo "Fetching flatbuffers" - download_extract_and_cleanup $FLATBUFFERS_URL -fi diff --git a/cpp/thirdparty/set_thirdparty_env.sh b/cpp/thirdparty/set_thirdparty_env.sh deleted file mode 100755 index 135972ee9bd..00000000000 --- a/cpp/thirdparty/set_thirdparty_env.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/usr/bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -SOURCE_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -source $SOURCE_DIR/versions.sh - -if [ -z "$THIRDPARTY_DIR" ]; then - THIRDPARTY_DIR=$SOURCE_DIR -fi - -export GTEST_HOME=$THIRDPARTY_DIR/$GTEST_BASEDIR -export GBENCHMARK_HOME=$THIRDPARTY_DIR/installed -export FLATBUFFERS_HOME=$THIRDPARTY_DIR/installed diff --git a/cpp/thirdparty/versions.sh b/cpp/thirdparty/versions.sh deleted file mode 100755 index a7b21e19fcc..00000000000 --- a/cpp/thirdparty/versions.sh +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. 
- -GTEST_VERSION=1.7.0 -GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" -GTEST_BASEDIR=googletest-release-$GTEST_VERSION - -GBENCHMARK_VERSION=1.0.0 -GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" -GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION - -FLATBUFFERS_VERSION=1.3.0 -FLATBUFFERS_URL="https://github.com/google/flatbuffers/archive/v${FLATBUFFERS_VERSION}.tar.gz" -FLATBUFFERS_BASEDIR=flatbuffers-$FLATBUFFERS_VERSION From caa843bdaf395b915a739bf5e1d6c5eabe1f4693 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 11 Oct 2016 17:29:25 -0700 Subject: [PATCH 169/210] ARROW-333: Make writers update their internal schema even when no data is written Make PromotableWriter predefine writers when asked Author: Julien Le Dem Closes #170 from julienledem/promotable_writer_preset and squashes the following commits: 972eb9c [Julien Le Dem] ARROW-333: Make writers update their internal schema even when no data is written Make PromotableWriter predefine writers when asked --- .../main/codegen/templates/MapWriters.java | 15 +++++++++ .../main/codegen/templates/UnionWriter.java | 24 ++++++++++++++ .../vector/complex/impl/PromotableWriter.java | 14 ++++---- .../complex/writer/TestComplexWriter.java | 32 +++++++++++++++++-- 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 696bbf655ca..51327b43af0 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -112,6 +112,11 @@ public MapWriter map(String name) { } writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); + } else { + if (writer instanceof PromotableWriter) { + // ensure writers are initialized + ((PromotableWriter)writer).getWriter(MinorType.MAP); + } } return writer; } @@ -149,6 +154,11 @@ public ListWriter list(String name) { } writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); + } else { + if (writer instanceof PromotableWriter) { + // ensure writers are initialized + ((PromotableWriter)writer).getWriter(MinorType.LIST); + } } return writer; } @@ -210,6 +220,11 @@ public void end() { } writer.setPosition(idx()); fields.put(name.toLowerCase(), writer); + } else { + if (writer instanceof PromotableWriter) { + // ensure writers are initialized + ((PromotableWriter)writer).getWriter(MinorType.${upperName}); + } } return writer; } diff --git a/java/vector/src/main/codegen/templates/UnionWriter.java b/java/vector/src/main/codegen/templates/UnionWriter.java index 460ec1c0d95..efb66f168f5 100644 --- a/java/vector/src/main/codegen/templates/UnionWriter.java +++ b/java/vector/src/main/codegen/templates/UnionWriter.java @@ -25,6 +25,8 @@ package org.apache.arrow.vector.complex.impl; <#include "/@includes/vv_imports.ftl" /> +import org.apache.arrow.vector.complex.writer.BaseWriter; +import org.apache.arrow.vector.types.Types.MinorType; /* * This class is generated using freemarker and the ${.template_name} template. 
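For orientation, this illustrative Java fragment (not part of the commit; it mirrors the promotableWriterSchema test added further below and assumes a live BufferAllocator named allocator) shows the behavior ARROW-333 enables: the internal schema materializes even though no value is ever written.

```java
// Illustrative fragment mirroring the promotableWriterSchema test below;
// `allocator` is assumed to be an existing BufferAllocator.
MapVector parent = new MapVector("parent", allocator, null);
ComplexWriter writer = new ComplexWriterImpl("root", parent);
MapWriter rootWriter = writer.rootAsMap();

// Two writers requested under the same name, but nothing written to either.
rootWriter.bigInt("a");
rootWriter.varChar("a");

// The internal schema was still updated: field "a" is already a union of
// Int(64, signed) and Utf8.
Field field = parent.getField().getChildren().get(0).getChildren().get(0);
Assert.assertEquals(Union.TYPE_TYPE, field.getType().getTypeType());
```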
@@ -100,6 +102,28 @@ public ListWriter asList() { return getListWriter(); } + BaseWriter getWriter(MinorType minorType) { + switch (minorType) { + case MAP: + return getMapWriter(); + case LIST: + return getListWriter(); + <#list vv.types as type> + <#list type.minor as minor> + <#assign name = minor.class?cap_first /> + <#assign fields = minor.fields!type.fields /> + <#assign uncappedName = name?uncap_first/> + <#if !minor.class?starts_with("Decimal")> + case ${name?upper_case}: + return get${name}Writer(); + + + + default: + throw new UnsupportedOperationException("Unknown type: " + minorType); + } + } + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign fields = minor.fields!type.fields /> <#assign uncappedName = name?uncap_first/> diff --git a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java index c282688530b..94ff82c04bd 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/complex/impl/PromotableWriter.java @@ -94,19 +94,19 @@ public void setPosition(int index) { protected FieldWriter getWriter(MinorType type) { if (state == State.UNION) { - return writer; - } - if (state == State.UNTYPED) { + ((UnionWriter)writer).getWriter(type); + } else if (state == State.UNTYPED) { if (type == null) { + // ??? return null; } ValueVector v = listVector.addOrGetVector(type).getVector(); v.allocateNew(); setWriter(v); writer.setPosition(position); - } - if (type != this.type) { - return promoteToUnion(); + } else if (type != this.type) { + promoteToUnion(); + ((UnionWriter)writer).getWriter(type); } return writer; } @@ -133,7 +133,7 @@ private FieldWriter promoteToUnion() { unionVector.addVector((FieldVector)tp.getTo()); writer = new UnionWriter(unionVector); writer.setPosition(idx()); - for (int i = 0; i < idx(); i++) { + for (int i = 0; i <= idx(); i++) { unionVector.getMutator().setType(i, vector.getMinorType()); } vector = null; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 398aea915b3..9419f88de5b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -45,6 +45,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Union; import org.apache.arrow.vector.types.pojo.ArrowType.Utf8; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.util.Text; import org.junit.Assert; import org.junit.Test; @@ -362,11 +363,38 @@ public void promotableWriter() { MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); for (int i = 0; i < 100; i++) { rootReader.setPosition(i); - Assert.assertEquals(i, rootReader.reader("a").readLong().intValue()); + FieldReader reader = rootReader.reader("a"); + Long value = reader.readLong(); + Assert.assertNotNull("index: " + i, value); + Assert.assertEquals(i, value.intValue()); } for (int i = 100; i < 200; i++) { rootReader.setPosition(i); - Assert.assertEquals(Integer.toString(i), rootReader.reader("a").readText().toString()); + FieldReader reader = rootReader.reader("a"); + Text value = reader.readText(); + Assert.assertEquals(Integer.toString(i), value.toString()); } } + + 
/** + * Even without writing to the writer, the union schema is created correctly + */ + @Test + public void promotableWriterSchema() { + MapVector parent = new MapVector("parent", allocator, null); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + BigIntWriter bigIntWriter = rootWriter.bigInt("a"); + VarCharWriter varCharWriter = rootWriter.varChar("a"); + + Field field = parent.getField().getChildren().get(0).getChildren().get(0); + Assert.assertEquals("a", field.getName()); + Assert.assertEquals(Union.TYPE_TYPE, field.getType().getTypeType()); + + Assert.assertEquals(Int.TYPE_TYPE, field.getChildren().get(0).getType().getTypeType()); + Int intType = (Int) field.getChildren().get(0).getType(); + Assert.assertEquals(64, intType.getBitWidth()); + Assert.assertTrue(intType.getIsSigned()); + Assert.assertEquals(Utf8.TYPE_TYPE, field.getChildren().get(1).getType().getTypeType()); + } } \ No newline at end of file From 3919a277884cf504fdca5d730cf128e36db6f700 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Oct 2016 23:08:48 -0400 Subject: [PATCH 170/210] ARROW-332: Add RecordBatch.to_pandas method This makes testing and IPC data wrangling a little easier. Author: Wes McKinney Closes #165 from wesm/ARROW-332 and squashes the following commits: 5f19b97 [Wes McKinney] Add simple arrow::Array->NumPy-for-pandas conversion helper and RecordBatch.to_pandas --- python/pyarrow/includes/pyarrow.pxd | 7 +++-- python/pyarrow/io.pyx | 12 ++++++++ python/pyarrow/table.pyx | 25 ++++++++++++++-- python/pyarrow/tests/test_ipc.py | 40 ++++++++++++++++++++++++-- python/pyarrow/tests/test_table.py | 41 ++++++++++++++++++++------- python/src/pyarrow/adapters/pandas.cc | 19 +++++++++++-- python/src/pyarrow/adapters/pandas.h | 7 ++++- python/src/pyarrow/common.h | 4 +-- python/src/pyarrow/io.cc | 2 +- 9 files changed, 133 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 2fa5a7d6325..7c47f21854e 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -50,8 +50,11 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: PyStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, shared_ptr[CArray]* out) - PyStatus ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, - PyObject** out) + PyStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + object py_ref, PyObject** out) + + PyStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + object py_ref, PyObject** out) MemoryPool* get_memory_pool() diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 00a492fc0ba..8970e06effd 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -230,6 +230,18 @@ cdef class InMemoryOutputStream(NativeFile): return result +cdef class BufferReader(NativeFile): + cdef: + Buffer buffer + + def __cinit__(self, Buffer buffer): + self.buffer = buffer + self.rd_file.reset(new CBufferReader(buffer.buffer.get().data(), + buffer.buffer.get().size())) + self.is_readonly = 1 + self.is_open = True + + def buffer_from_bytes(object obj): """ Construct an Arrow buffer from a Python bytes object diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index a1cadcd1e0f..969571262ca 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -100,7 +100,7 @@ cdef class Column: import pandas as pd - check_status(pyarrow.ArrowToPandas(self.sp_column, self, &arr)) + 
check_status(pyarrow.ConvertColumnToPandas(self.sp_column, self, &arr)) return pd.Series(arr, name=self.name) cdef _check_nullptr(self): @@ -233,6 +233,27 @@ cdef class RecordBatch: return self.batch.Equals(deref(other.batch)) + def to_pandas(self): + """ + Convert the arrow::RecordBatch to a pandas DataFrame + """ + cdef: + PyObject* np_arr + shared_ptr[CArray] arr + Column column + + import pandas as pd + + names = [] + data = [] + for i in range(self.batch.num_columns()): + arr = self.batch.column(i) + check_status(pyarrow.ConvertArrayToPandas(arr, self, &np_arr)) + names.append(frombytes(self.batch.column_name(i))) + data.append( np_arr) + + return pd.DataFrame(dict(zip(names, data)), columns=names) + @classmethod def from_pandas(cls, df): """ @@ -354,7 +375,7 @@ cdef class Table: for i in range(self.table.num_columns()): col = self.table.column(i) column = self.column(i) - check_status(pyarrow.ArrowToPandas(col, column, &arr)) + check_status(pyarrow.ConvertColumnToPandas(col, column, &arr)) names.append(frombytes(col.get().name())) data.append( arr) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index b9e9e6ed0c4..14cbb30d5d4 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -18,6 +18,8 @@ import io import numpy as np + +from pandas.util.testing import assert_frame_equal import pandas as pd import pyarrow as A @@ -85,17 +87,40 @@ def test_ipc_file_simple_roundtrip(): helper.run() +def test_ipc_zero_copy_numpy(): + df = pd.DataFrame({'foo': [1.5]}) + + batch = A.RecordBatch.from_pandas(df) + sink = arrow_io.InMemoryOutputStream() + write_file(batch, sink) + buffer = sink.get_result() + reader = arrow_io.BufferReader(buffer) + + batches = read_file(reader) + + data = batches[0].to_pandas() + rdf = pd.DataFrame(data) + assert_frame_equal(df, rdf) + + # XXX: For benchmarking def big_batch(): + K = 2**4 + N = 2**20 df = pd.DataFrame( - np.random.randn(2**4, 2**20).T, - columns=[str(i) for i in range(2**4)] + np.random.randn(K, N).T, + columns=[str(i) for i in range(K)] ) df = pd.concat([df] * 2 ** 3, ignore_index=True) + return df + - return A.RecordBatch.from_pandas(df) +def write_to_memory2(batch): + sink = arrow_io.InMemoryOutputStream() + write_file(batch, sink) + return sink.get_result() def write_to_memory(batch): @@ -114,3 +139,12 @@ def read_file(source): reader = ipc.ArrowFileReader(source) return [reader.get_record_batch(i) for i in range(reader.num_record_batches)] + +# df = big_batch() +# batch = A.RecordBatch.from_pandas(df) +# mem = write_to_memory(batch) +# batches = read_file(mem) +# data = batches[0].to_pandas() +# rdf = pd.DataFrame(data) + +# [x.to_pandas() for x in batches] diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index c5130329e02..4c9d302106a 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -15,28 +15,47 @@ # specific language governing permissions and limitations # under the License. 
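Condensing the test_ipc_zero_copy_numpy test above into a minimal sketch (not part of the commit; write_file and read_file are the helper functions defined in test_ipc.py), the new round trip looks like this:

```python
import pandas as pd
import pyarrow as A
import pyarrow.io as arrow_io

df = pd.DataFrame({'foo': [1.5]})
batch = A.RecordBatch.from_pandas(df)             # pandas -> Arrow

sink = arrow_io.InMemoryOutputStream()
write_file(batch, sink)                           # helper from test_ipc.py
reader = arrow_io.BufferReader(sink.get_result())
batches = read_file(reader)                       # helper from test_ipc.py

# Arrow -> pandas through the new RecordBatch.to_pandas method.
assert df.equals(batches[0].to_pandas())
```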
-import pyarrow as A +import numpy as np + +from pandas.util.testing import assert_frame_equal +import pandas as pd + +import pyarrow as pa def test_recordbatch_basics(): data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) + pa.from_pylist(range(5)), + pa.from_pylist([-10, -5, 0, 5, 10]) ] - batch = A.RecordBatch.from_arrays(['c0', 'c1'], data) + batch = pa.RecordBatch.from_arrays(['c0', 'c1'], data) assert len(batch) == 5 assert batch.num_rows == 5 assert batch.num_columns == len(data) +def test_recordbatch_from_to_pandas(): + data = pd.DataFrame({ + 'c1': np.array([1, 2, 3, 4, 5], dtype='int64'), + 'c2': np.array([1, 2, 3, 4, 5], dtype='uint32'), + 'c2': np.random.randn(5), + 'c3': ['foo', 'bar', None, 'baz', 'qux'], + 'c4': [False, True, False, True, False] + }) + + batch = pa.RecordBatch.from_pandas(data) + result = batch.to_pandas() + assert_frame_equal(data, result) + + def test_table_basics(): data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) + pa.from_pylist(range(5)), + pa.from_pylist([-10, -5, 0, 5, 10]) ] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + table = pa.Table.from_arrays(('a', 'b'), data, 'table_name') assert table.name == 'table_name' assert len(table) == 5 assert table.num_rows == 5 @@ -50,15 +69,15 @@ def test_table_basics(): def test_table_pandas(): data = [ - A.from_pylist(range(5)), - A.from_pylist([-10, -5, 0, 5, 10]) + pa.from_pylist(range(5)), + pa.from_pylist([-10, -5, 0, 5, 10]) ] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + table = pa.Table.from_arrays(('a', 'b'), data, 'table_name') # TODO: Use this part once from_pandas is implemented # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]} # df = pd.DataFrame(data) - # A.Table.from_pandas(df) + # pa.Table.from_pandas(df) df = table.to_pandas() assert set(df.columns) == set(('a', 'b')) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index ae24b7ee584..b2fcd37aec9 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -21,6 +21,8 @@ #include "pyarrow/numpy_interop.h" +#include "pyarrow/adapters/pandas.h" + #include #include #include @@ -38,6 +40,7 @@ namespace pyarrow { using arrow::Array; using arrow::Column; +using arrow::Field; using arrow::DataType; namespace util = arrow::util; @@ -106,7 +109,7 @@ struct npy_traits { template <> struct npy_traits { - typedef double value_type; + typedef int64_t value_type; using TypeClass = arrow::TimestampType; static constexpr bool supports_nulls = true; @@ -163,6 +166,8 @@ class ArrowSerializer { Status ConvertData(); Status ConvertObjectStrings(std::shared_ptr* out) { + PyAcquireGIL lock; + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); arrow::TypePtr string_type(new arrow::StringType()); arrow::StringBuilder string_builder(pool_, string_type); @@ -197,6 +202,8 @@ class ArrowSerializer { } Status ConvertBooleans(std::shared_ptr* out) { + PyAcquireGIL lock; + PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); int nbytes = util::bytes_for_bits(length_); @@ -798,7 +805,15 @@ class ArrowDeserializer { } \ break; -Status ArrowToPandas(const std::shared_ptr& col, PyObject* py_ref, +Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out) { + static std::string dummy_name = "dummy"; + auto field = std::make_shared(dummy_name, arr->type()); + auto col = std::make_shared(field, arr); + return ConvertColumnToPandas(col, py_ref, out); +} + +Status 
ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, PyObject** out) { switch(col->type()->type) { FROM_ARROW_CASE(BOOL); diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index c3377685bcc..141d1219e64 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -31,6 +31,7 @@ namespace arrow { class Array; class Column; +class MemoryPool; } // namespace arrow @@ -39,7 +40,11 @@ namespace pyarrow { class Status; PYARROW_EXPORT -Status ArrowToPandas(const std::shared_ptr& col, PyObject* py_ref, +Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, + PyObject** out); + +PYARROW_EXPORT +Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, PyObject** out); PYARROW_EXPORT diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 96eed1654a7..50c2577b93c 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -120,8 +120,8 @@ class PYARROW_EXPORT NumPyBuffer : public arrow::Buffer { Py_INCREF(arr); data_ = reinterpret_cast(PyArray_DATA(arr_)); - size_ = PyArray_SIZE(arr_); - capacity_ = size_ * PyArray_DESCR(arr_)->elsize; + size_ = PyArray_SIZE(arr_) * PyArray_DESCR(arr_)->elsize; + capacity_ = size_; } virtual ~NumPyBuffer() { diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 9879b3474bc..7bf32ffa8d2 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -85,7 +85,7 @@ arrow::Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { ARROW_RETURN_NOT_OK(CheckPyError()); PyObject* result = PyObject_CallMethod(file_, "write", "(O)", py_data); - Py_DECREF(py_data); + Py_XDECREF(py_data); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); return arrow::Status::OK(); From bf749f55a1e24d79b08813a39ce51e9aaf6fb425 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 11 Oct 2016 20:11:48 -0700 Subject: [PATCH 171/210] ARROW-275: Add tests for UnionVector in Arrow File Author: Julien Le Dem Closes #169 from julienledem/union_test and squashes the following commits: 120f504 [Julien Le Dem] ARROW-275: Add tests for UnionVector in Arrow File --- .../main/codegen/templates/UnionReader.java | 4 + .../main/codegen/templates/UnionVector.java | 30 ++--- .../org/apache/arrow/vector/VectorLoader.java | 2 + .../arrow/vector/schema/TypeLayout.java | 3 +- .../arrow/vector/file/TestArrowFile.java | 110 +++++++++++++++++- 5 files changed, 127 insertions(+), 22 deletions(-) diff --git a/java/vector/src/main/codegen/templates/UnionReader.java b/java/vector/src/main/codegen/templates/UnionReader.java index 7351ae3776f..c56e95c89dc 100644 --- a/java/vector/src/main/codegen/templates/UnionReader.java +++ b/java/vector/src/main/codegen/templates/UnionReader.java @@ -134,6 +134,10 @@ public void copyAsValue(UnionWriter writer) { + public int size() { + return getReaderForIndex(idx()).size(); + } + <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> <#assign uncappedName = name?uncap_first/> <#assign boxedType = (minor.boxedType!type.boxedType) /> diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index b14314d2b0d..5ca3f901484 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -15,17 +15,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - -import com.google.common.collect.ImmutableList; -import com.google.flatbuffers.FlatBufferBuilder; -import io.netty.buffer.ArrowBuf; -import org.apache.arrow.flatbuf.Field; -import org.apache.arrow.flatbuf.Type; -import org.apache.arrow.flatbuf.Union; -import org.apache.arrow.vector.ValueVector; -import org.apache.arrow.vector.types.pojo.ArrowType; - -import java.util.ArrayList; import java.util.List; <@pp.dropOutputFile /> @@ -39,7 +28,9 @@ <#include "/@includes/vv_imports.ftl" /> import com.google.common.collect.ImmutableList; import java.util.ArrayList; +import java.util.Collections; import java.util.Iterator; +import org.apache.arrow.vector.BaseDataValueVector; import org.apache.arrow.vector.complex.impl.ComplexCopier; import org.apache.arrow.vector.util.CallBack; import org.apache.arrow.vector.schema.ArrowFieldNode; @@ -47,6 +38,7 @@ import static org.apache.arrow.flatbuf.UnionMode.Sparse; + /* * This class is generated using freemarker and the ${.template_name} template. */ @@ -81,6 +73,7 @@ public class UnionVector implements FieldVector { private ValueVector singleVector; private final CallBack callBack; + private final List innerVectors; public UnionVector(String name, BufferAllocator allocator, CallBack callBack) { this.name = name; @@ -88,6 +81,7 @@ public UnionVector(String name, BufferAllocator allocator, CallBack callBack) { this.internalMap = new MapVector("internal", allocator, callBack); this.typeVector = new UInt1Vector("types", allocator); this.callBack = callBack; + this.innerVectors = Collections.unmodifiableList(Arrays.asList(typeVector)); } public BufferAllocator getAllocator() { @@ -101,30 +95,28 @@ public MinorType getMinorType() { @Override public void initializeChildrenFromFields(List children) { - getMap().initializeChildrenFromFields(children); + internalMap.initializeChildrenFromFields(children); } @Override public List getChildrenFromFields() { - return getMap().getChildrenFromFields(); + return internalMap.getChildrenFromFields(); } @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { - // TODO - throw new UnsupportedOperationException(); + BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); + this.valueCount = fieldNode.getLength(); } @Override public List getFieldBuffers() { - // TODO - throw new UnsupportedOperationException(); + return BaseDataValueVector.unload(getFieldInnerVectors()); } @Override public List getFieldInnerVectors() { - // TODO - throw new UnsupportedOperationException(); + return this.innerVectors; } public NullableMapVector getMap() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index 58ac68b8282..b7040da9d82 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -74,6 +74,8 @@ public void load(ArrowRecordBatch recordBatch) { } private void loadBuffers(FieldVector vector, Field field, Iterator buffers, Iterator nodes) { + checkArgument(nodes.hasNext(), + "no more field nodes for for field " + field + " and vector " + vector); ArrowFieldNode fieldNode = nodes.next(); List typeLayout = field.getTypeLayout().getVectors(); List ownBuffers = new ArrayList<>(typeLayout.size()); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java index 06ae203bf44..c5f53fe508d 100644 --- 
a/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/schema/TypeLayout.java @@ -82,8 +82,7 @@ public static TypeLayout getTypeLayout(final ArrowType arrowType) { break; case UnionMode.Sparse: vectors = asList( - validityVector(), - typeVector() + typeVector() // type of the value at the index or 0 if null ); break; default: diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index 7a5e7b58db9..0f28d53295c 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -266,7 +266,7 @@ private void validateComplexContent(int count, NullableMapVector parent) { Assert.assertEquals(i % 3, rootReader.reader("list").size()); NullableTimeStampHolder h = new NullableTimeStampHolder(); rootReader.reader("map").reader("timestamp").read(h); - Assert.assertEquals(i, h.value % COUNT); + Assert.assertEquals(i, h.value); } } @@ -339,4 +339,112 @@ public void testWriteReadMultipleRBs() throws IOException { } } + @Test + public void testWriteReadUnion() throws IOException { + File file = new File("target/mytest_write_union.arrow"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null)) { + + writeUnionData(count, parent); + + printVectors(parent.getChildrenFromFields()); + + validateUnionData(count, parent); + + write(parent.getChild("root"), file); + } + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(file); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null) + ) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + + NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); + VectorLoader vectorLoader = new VectorLoader(schema, root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + validateUnionData(count, parent); + } + } + } + + public void validateUnionData(int count, MapVector parent) { + MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + for (int i = 0; i < count; i++) { + rootReader.setPosition(i); + switch (i % 4) { + case 0: + Assert.assertEquals(i, rootReader.reader("union").readInteger().intValue()); + break; + case 1: + Assert.assertEquals(i, rootReader.reader("union").readLong().longValue()); + break; + case 2: + Assert.assertEquals(i % 3, rootReader.reader("union").size()); + break; + case 3: + NullableTimeStampHolder h = new NullableTimeStampHolder(); + rootReader.reader("union").reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + break; + } + } + } + + public void writeUnionData(int count, NullableMapVector parent) { + ArrowBuf varchar = allocator.buffer(3); 
+ varchar.readerIndex(0); + varchar.setByte(0, 'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("union"); + BigIntWriter bigIntWriter = rootWriter.bigInt("union"); + ListWriter listWriter = rootWriter.list("union"); + MapWriter mapWriter = rootWriter.map("union"); + for (int i = 0; i < count; i++) { + switch (i % 4) { + case 0: + intWriter.setPosition(i); + intWriter.writeInt(i); + break; + case 1: + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + break; + case 2: + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + break; + case 3: + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.timeStamp("timestamp").writeTimeStamp(i); + mapWriter.end(); + break; + } + } + writer.setValueCount(count); + varchar.release(); + } } From 4ecf327636c1373f601679fac18b7fcf7f382e1b Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 16 Oct 2016 16:21:59 -0400 Subject: [PATCH 172/210] ARROW-191: Python: Provide infrastructure for manylinux1 wheels Author: Uwe L. Korn Closes #173 from xhochy/ARROW-191 and squashes the following commits: 278f8b0 [Uwe L. Korn] ARROW-191: Python: Provide infrastructure for manylinux1 wheels --- NOTICE.txt | 3 + .../Dockerfile-parquet_arrow-base-x86_64 | 40 ++++++++++ python/manylinux1/Dockerfile-x86_64 | 47 ++++++++++++ python/manylinux1/README.md | 40 ++++++++++ python/manylinux1/build_arrow.sh | 76 +++++++++++++++++++ 5 files changed, 206 insertions(+) create mode 100644 python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 create mode 100644 python/manylinux1/Dockerfile-x86_64 create mode 100644 python/manylinux1/README.md create mode 100755 python/manylinux1/build_arrow.sh diff --git a/NOTICE.txt b/NOTICE.txt index 679bb59e6a9..5c699ca022c 100644 --- a/NOTICE.txt +++ b/NOTICE.txt @@ -38,3 +38,6 @@ This product includes software from the CMake project * Copyright 2001-2009 Kitware, Inc. * Copyright 2012-2014 Continuum Analytics, Inc. * All rights reserved. + +This product includes software from https://github.com/matthew-brett/multibuild (BSD 2-clause) + * Copyright (c) 2013-2016, Matt Terry and Matthew Brett; all rights reserved. diff --git a/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 b/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 new file mode 100644 index 00000000000..714fa1a91b3 --- /dev/null +++ b/python/manylinux1/Dockerfile-parquet_arrow-base-x86_64 @@ -0,0 +1,40 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + +FROM arrow-base-x86_64 + +WORKDIR / +ADD http://zlib.net/zlib-1.2.8.tar.gz /zlib-1.2.8.tar.gz +RUN tar xf zlib-1.2.8.tar.gz +WORKDIR zlib-1.2.8 +RUN CFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=/usr -DCMAKE_BUILD_TYPE=Release . 
+RUN make -j5 install + +WORKDIR / +ADD https://github.com/google/snappy/releases/download/1.1.3/snappy-1.1.3.tar.gz /snappy-1.1.3.tar.gz +RUN tar xf snappy-1.1.3.tar.gz +WORKDIR /snappy-1.1.3 +RUN ./configure --with-pic --prefix=/usr +RUN make -j5 install + +WORKDIR / +ADD http://archive.apache.org/dist/thrift/0.9.1/thrift-0.9.1.tar.gz /thrift-0.9.1.tar.gz +RUN tar xf thrift-0.9.1.tar.gz +WORKDIR /thrift-0.9.1 +RUN ./configure LDFLAGS='-L/usr/lib64' CXXFLAGS='-fPIC' --without-qt4 --without-c_glib --without-csharp --without-java --without-erlang --without-nodejs --without-lua --without-python --without-perl --without-php --without-php_extension --without-ruby --without-haskell --without-go --without-d --without-tests --with-cpp --prefix=/usr --disable-shared --enable-static +RUN make -j5 install + +WORKDIR / +RUN git clone https://github.com/apache/parquet-cpp.git +WORKDIR /parquet-cpp +RUN ARROW_HOME=/usr THRIFT_HOME=/usr cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DPARQUET_BUILD_TESTS=OFF -DPARQUET_ARROW=ON . +RUN make -j5 install diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64 new file mode 100644 index 00000000000..e62a60111af --- /dev/null +++ b/python/manylinux1/Dockerfile-x86_64 @@ -0,0 +1,47 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + +FROM quay.io/pypa/manylinux1_x86_64:latest + +# Install dependencies +RUN yum install -y flex openssl-devel + +WORKDIR / +ADD http://downloads.sourceforge.net/project/boost/boost/1.60.0/boost_1_60_0.tar.gz /boost_1_60_0.tar.gz +RUN tar xf boost_1_60_0.tar.gz +WORKDIR /boost_1_60_0 +RUN ./bootstrap.sh +RUN ./bjam cxxflags=-fPIC cflags=-fPIC --prefix=/usr --with-filesystem --with-date_time --with-system install + +WORKDIR / +ADD https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz /cmake-3.5.2.tar.gz +RUN tar xf cmake-3.5.2.tar.gz +WORKDIR /cmake-3.5.2 +RUN ./configure --prefix=/usr +RUN make -j5 install + +WORKDIR / +ADD https://github.com/google/flatbuffers/archive/v1.3.0.tar.gz /flatbuffers-1.3.0.tar.gz +RUN tar xf flatbuffers-1.3.0.tar.gz +WORKDIR /flatbuffers-1.3.0 +RUN CXXFLAGS='-fPIC' cmake -DFLATBUFFERS_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr . +RUN make -j5 install + +WORKDIR / +RUN git clone https://github.com/matthew-brett/multibuild.git +WORKDIR /multibuild +RUN git checkout ffe59955ad8690c2f8bb74766cb7e9b0d0ee3963 + +ADD arrow /arrow +WORKDIR /arrow/cpp +RUN FLATBUFFERS_HOME=/usr cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr -DARROW_HDFS=ON -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=OFF . 
+RUN make -j5 install diff --git a/python/manylinux1/README.md b/python/manylinux1/README.md new file mode 100644 index 00000000000..8cd9f6db004 --- /dev/null +++ b/python/manylinux1/README.md @@ -0,0 +1,40 @@ + + +## Manylinux1 wheels for Apache Arrow + +This folder provides base Docker images and the infrastructure to build +`manylinux1` compatible Python wheels that should be installable on all +Linux distributions published in the last four years. + +The process is split into two parts: there are base Docker images that build +the native, Python-independent dependencies. For these you can select whether +to also build the dependencies used for Parquet support. Based on +these images, there is also a bash script that builds the pyarrow wheels +for all supported Python versions and places them in the `dist` folder. + +### Build instructions + +```bash +# Create a clean copy of the arrow source tree +git clone ../../ arrow +# Build the native base image +docker build -t arrow-base-x86_64 -f Dockerfile-x86_64 . +# (optionally) build parquet-cpp +docker build -t parquet_arrow-base-x86_64 -f Dockerfile-parquet_arrow-base-x86_64 . +# Build the python packages +docker run --rm -v $PWD:/io parquet_arrow-base-x86_64 /io/build_arrow.sh +# Now the new packages are located in the dist/ folder +ls -l dist/ +``` diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh new file mode 100755 index 00000000000..0786b6f490a --- /dev/null +++ b/python/manylinux1/build_arrow.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. +# +# Usage: +# docker run --rm -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh +# or with Parquet support +# docker run --rm -v $PWD:/io parquet_arrow-base-x86_64 /io/build_arrow.sh + +# Build upon the scripts in https://github.com/matthew-brett/manylinux-builds +# * Copyright (c) 2013-2016, Matt Terry and Matthew Brett (BSD 2-clause) + +PYTHON_VERSIONS="${PYTHON_VERSIONS:-2.7 3.4 3.5}" + +# Package index with only manylinux1 builds +MANYLINUX_URL=https://nipy.bic.berkeley.edu/manylinux + +source /multibuild/manylinux_utils.sh + +cd /arrow/python + +# PyArrow build configuration +export PYARROW_CMAKE_OPTIONS='-DCMAKE_BUILD_TYPE=Release' +# Needed, as otherwise arrow_io is sometimes not linked +export LDFLAGS="-Wl,--no-as-needed" +export ARROW_HOME="/usr" + +# Ensure the target directory exists +mkdir -p /io/dist +# Temporary directory to store the wheels that should be sent through auditwheel +rm_mkdir unfixed_wheels + +PY35_BIN=/opt/python/cp35-cp35m/bin +$PY35_BIN/pip install 'pyelftools<0.24' +$PY35_BIN/pip install 'git+https://github.com/xhochy/auditwheel.git@pyarrow-fixes' + +# Override repair_wheelhouse function +function repair_wheelhouse { + local in_dir=$1 + local out_dir=$2 + for whl in $in_dir/*.whl; do + if [[ $whl == *none-any.whl ]]; then + cp $whl $out_dir + else + # Store libraries directly in . not .libs to fix problems with libpyarrow.so linkage. + auditwheel -v repair -L .
$whl -w $out_dir/ + fi + done + chmod -R a+rwX $out_dir +} + +for PYTHON in ${PYTHON_VERSIONS}; do + PYTHON_INTERPRETER="$(cpython_path $PYTHON)/bin/python" + PIP="$(cpython_path $PYTHON)/bin/pip" + PIPI_IO="$PIP install -f $MANYLINUX_URL" + PATH="$PATH:$(cpython_path $PYTHON)" + + $PIPI_IO "numpy==1.9.0" + $PIPI_IO "cython==0.24" + + PATH="$PATH:$(cpython_path $PYTHON)/bin" $PYTHON_INTERPRETER setup.py bdist_wheel + + rm_mkdir fixed_wheels + repair_wheelhouse dist /io/dist +done + From 8520061d38c4aa407ac6453aff786833efa5cbaa Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 16 Oct 2016 16:23:04 -0400 Subject: [PATCH 173/210] ARROW-336: Run Apache Rat in Travis builds @julienledem Integrated the rat call in the cpp build. It should fail if licenses are not matching. We could also make a separate `lint` Travis build but for the moment this seemed overkill to me. Author: Uwe L. Korn Closes #174 from xhochy/ARROW-336 and squashes the following commits: 25f797c [Uwe L. Korn] Make run-rat executable 6b6221f [Uwe L. Korn] ARROW-336: Run Apache Rat in Travis builds --- ci/travis_script_cpp.sh | 4 +++ dev/release/02-source.sh | 37 ++++------------------------ dev/release/run-rat.sh | 53 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+), 32 deletions(-) create mode 100755 dev/release/run-rat.sh diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index c3bd3b5f207..d555cab3e64 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -16,6 +16,10 @@ set -e : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} +# Check licenses according to Apache policy +git archive HEAD -o arrow-src.tar.gz +./dev/release/run-rat.sh arrow-src.tar.gz + pushd $CPP_BUILD_DIR make lint diff --git a/dev/release/02-source.sh b/dev/release/02-source.sh index 1bbe2e92753..bdaa5cc9340 100644 --- a/dev/release/02-source.sh +++ b/dev/release/02-source.sh @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,6 +18,8 @@ # under the License. # +SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + if [ -z "$1" ]; then echo "Usage: $0 " exit @@ -56,36 +58,7 @@ tarball=$tag.tar.gz # archive (identical hashes) using the scm tag git archive $release_hash --prefix $tag/ -o $tarball -# download apache rat -curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.12/apache-rat-0.12.jar > apache-rat-0.12.jar - -RAT="java -jar apache-rat-0.12.jar -d " - -# generate the rat report -$RAT $tarball \ - -e ".*" \ - -e mman.h \ - -e "*_generated.h" \ - -e random.h \ - -e status.cc \ - -e status.h \ - -e asan_symbolize.py \ - -e cpplint.py \ - -e FindPythonLibsNew.cmake \ - -e pax_global_header \ - -e MANIFEST.in \ - -e __init__.pxd \ - -e __init__.py \ - -e requirements.txt \ - > rat.txt -UNAPPROVED=`cat rat.txt | grep "Unknown Licenses" | head -n 1 | cut -d " " -f 1` - -if [ "0" -eq "${UNAPPROVED}" ]; then - echo "No unnaproved licenses" -else - echo "${UNAPPROVED} unapproved licences. 
Check rat report: rat.txt" - exit -fi +${SOURCE_DIR}/run-rat.sh $tarball # sign the archive gpg --armor --output ${tarball}.asc --detach-sig $tarball diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh new file mode 100755 index 00000000000..d8ec6507fc4 --- /dev/null +++ b/dev/release/run-rat.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# download apache rat +curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.12/apache-rat-0.12.jar > apache-rat-0.12.jar + +RAT="java -jar apache-rat-0.12.jar -d " + +# generate the rat report +$RAT $1 \ + -e ".*" \ + -e mman.h \ + -e "*_generated.h" \ + -e random.h \ + -e status.cc \ + -e status.h \ + -e asan_symbolize.py \ + -e cpplint.py \ + -e FindPythonLibsNew.cmake \ + -e pax_global_header \ + -e MANIFEST.in \ + -e __init__.pxd \ + -e __init__.py \ + -e requirements.txt \ + > rat.txt +cat rat.txt +UNAPPROVED=`cat rat.txt | grep "Unknown Licenses" | head -n 1 | cut -d " " -f 1` + +if [ "0" -eq "${UNAPPROVED}" ]; then + echo "No unnaproved licenses" +else + echo "${UNAPPROVED} unapproved licences. Check rat report: rat.txt" + exit 1 +fi + + From 8e8b17f992aa3bb3a642a93b44beb9b87d589fea Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 16 Oct 2016 16:23:54 -0400 Subject: [PATCH 174/210] ARROW-97: API documentation via sphinx-apidoc Author: Uwe L. Korn Closes #175 from xhochy/ARROW-97 and squashes the following commits: 2ec3e11 [Uwe L. Korn] Add license headers d838e81 [Uwe L. 
Korn] ARROW-97: API documentation via sphinx-apidoc --- ci/travis_script_python.sh | 7 + python/README.md | 7 + python/doc/.gitignore | 3 + python/doc/Makefile | 237 +++++++++++++++++++++++ python/doc/conf.py | 369 ++++++++++++++++++++++++++++++++++++ python/doc/index.rst | 28 +++ python/doc/requirements.txt | 3 + python/pyarrow/parquet.pyx | 8 +- 8 files changed, 661 insertions(+), 1 deletion(-) create mode 100644 python/doc/.gitignore create mode 100644 python/doc/Makefile create mode 100644 python/doc/conf.py create mode 100644 python/doc/index.rst create mode 100644 python/doc/requirements.txt diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 97f0563240c..55cb2a76f6d 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -51,6 +51,13 @@ python_version_tests() { --inplace python -m pytest -vv -r sxX pyarrow + + # Build documentation once + if [[ "$PYTHON_VERSION" == "3.5" ]] + then + pip install -r doc/requirements.txt + python setup.py build_sphinx + fi } # run tests for python 2.7 and 3.5 diff --git a/python/README.md b/python/README.md index 6febcbcbcbf..e11f6456455 100644 --- a/python/README.md +++ b/python/README.md @@ -47,3 +47,10 @@ The Arrow C++ library must be built with all options enabled and installed with python setup.py build_ext --inplace py.test pyarrow ``` + +#### Build the documentation + +```bash +pip install -r doc/requirements.txt +python setup.py build_sphinx +``` diff --git a/python/doc/.gitignore b/python/doc/.gitignore new file mode 100644 index 00000000000..87d04134d6f --- /dev/null +++ b/python/doc/.gitignore @@ -0,0 +1,3 @@ +# auto-generated module documentation +pyarrow*.rst +modules.rst diff --git a/python/doc/Makefile b/python/doc/Makefile new file mode 100644 index 00000000000..72575839524 --- /dev/null +++ b/python/doc/Makefile @@ -0,0 +1,237 @@ + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
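+
+# All of the targets below are thin wrappers around sphinx-build; the
+# variables above can be overridden on the command line, e.g.
+# "make html SPHINXOPTS=-W" to turn Sphinx warnings into errors, or
+# "make html BUILDDIR=/tmp/pyarrow-docs" to build out of tree.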
+ +.PHONY: help +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +.PHONY: clean +clean: + rm -rf $(BUILDDIR)/* + +.PHONY: html +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/pyarrow.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/pyarrow.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/pyarrow" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/pyarrow" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +.PHONY: texinfo +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +.PHONY: info +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +.PHONY: gettext +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +.PHONY: changes +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +.PHONY: linkcheck +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +.PHONY: doctest +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +.PHONY: coverage +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +.PHONY: xml +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +.PHONY: pseudoxml +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
+ +.PHONY: dummy +dummy: + $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy + @echo + @echo "Build finished. Dummy builder generates no files." diff --git a/python/doc/conf.py b/python/doc/conf.py new file mode 100644 index 00000000000..99ac3512ec9 --- /dev/null +++ b/python/doc/conf.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import inspect +import os +import sys + +from sphinx import apidoc + +import sphinx_rtd_theme + + +__location__ = os.path.join(os.getcwd(), os.path.dirname( + inspect.getfile(inspect.currentframe()))) +output_dir = os.path.join(__location__) +module_dir = os.path.join(__location__, "..", "pyarrow") +cmd_line_template = "sphinx-apidoc -f -e -o {outputdir} {moduledir}" +cmd_line = cmd_line_template.format(outputdir=output_dir, moduledir=module_dir) +apidoc.main(cmd_line.split(" ")) + +sys.path.insert(0, os.path.abspath('..')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'numpydoc' +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +# +# source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'pyarrow' +copyright = u'2016 Apache Software Foundation' +author = u'Apache Software Foundation' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = u'' +# The full version, including alpha/beta/rc tags. +release = u'' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +# +# today = '' +# +# Else, today_fmt is used as the format for a strftime call. +# +# today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +# +# default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +# +# add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +# +# add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +# +# show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +# modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +# keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# The name for this set of Sphinx documents. +# " v documentation" by default. +# +# html_title = u'pyarrow v0.1.0' + +# A shorter title for the navigation bar. Default is the same as html_title. +# +# html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +# +# html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +# +# html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +# +# html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +# +# html_last_updated_fmt = None + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +# +# html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. 
+# +# html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +# +# html_additional_pages = {} + +# If false, no module index is generated. +# +# html_domain_indices = True + +# If false, no index is generated. +# +# html_use_index = True + +# If true, the index is split into individual pages for each letter. +# +# html_split_index = False + +# If true, links to the reST sources are added to the pages. +# +# html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +# +# html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +# +# html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +# +# html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +# html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' +# +# html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +# +# html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +# +# html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pyarrowdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pyarrow.tex', u'pyarrow Documentation', + u'Apache Arrow Team', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +# +# latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +# +# latex_use_parts = False + +# If true, show page references after internal links. +# +# latex_show_pagerefs = False + +# If true, show URL addresses after external links. +# +# latex_show_urls = False + +# Documents to append as an appendix to all manuals. +# +# latex_appendices = [] + +# It false, will not define \strong, \code, itleref, \crossref ... but only +# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added +# packages. +# +# latex_keep_old_macro_names = True + +# If false, no module index is generated. +# +# latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [
+    (master_doc, 'pyarrow', u'pyarrow Documentation',
+     [author], 1)
+]

+# If true, show URL addresses after external links.
+#
+# man_show_urls = False


+# -- Options for Texinfo output -------------------------------------------

+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'pyarrow', u'pyarrow Documentation',
+     author, 'pyarrow', 'One line description of project.',
+     'Miscellaneous'),
+]

+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []

+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True

+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'

+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/python/doc/index.rst b/python/doc/index.rst
new file mode 100644
index 00000000000..550e544eef9
--- /dev/null
+++ b/python/doc/index.rst
@@ -0,0 +1,28 @@
+Apache Arrow (Python)
+=====================
+
+Arrow is a columnar in-memory analytics layer designed to accelerate big data.
+It houses a set of canonical in-memory representations of flat and hierarchical
+data along with multiple language bindings for structure manipulation. It also
+provides IPC and common algorithm implementations.
+
+This is the documentation of the Python API of Apache Arrow. For more details
+on the format and other language bindings see
+`the main page for Arrow `_. Here we will only
+detail the usage of the Python API for Arrow and the leaf libraries that add
+additional functionality such as reading Apache Parquet files into Arrow
+structures.
+
+.. toctree::
+   :maxdepth: 4
+   :hidden:
+
+   Module Reference 
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
+
diff --git a/python/doc/requirements.txt b/python/doc/requirements.txt
new file mode 100644
index 00000000000..ce0793c31de
--- /dev/null
+++ b/python/doc/requirements.txt
@@ -0,0 +1,3 @@
+numpydoc
+sphinx
+sphinx_rtd_theme
diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx
index ca0176a7c04..2abe57b33ed 100644
--- a/python/pyarrow/parquet.pyx
+++ b/python/pyarrow/parquet.pyx
@@ -34,6 +34,10 @@ from pyarrow.io cimport NativeFile
 
 import six
 
+__all__ = [
+    'read_table',
+    'write_table'
+]
 
 cdef class ParquetReader:
     cdef:
@@ -76,9 +80,11 @@ cdef class ParquetReader:
 def read_table(source, columns=None):
     """
     Read a Table from Parquet format
+
     Returns
     -------
-    table: pyarrow.Table
+    pyarrow.table.Table
+        Content of the file as a table (of columns)
     """
     cdef ParquetReader reader = ParquetReader()

From 732a2059d0c4493e451c566160b9d5d01dfe87be Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Mon, 17 Oct 2016 13:44:34 -0400
Subject: [PATCH 175/210] ARROW-261: Refactor String/Binary code paths to
 reflect unnested (non-list-based) structure

Per discussions on the mailing list. This should in theory match the Java
implementation.

Author: Wes McKinney

Closes #176 from wesm/ARROW-261 and squashes the following commits:

dca39ce [Wes McKinney] Make binary/string constants static to avoid memory-access-related segfaults in third party libraries
1e65b01 [Wes McKinney] Deprecate pyarrow::Status in favor of just arrow::Status.
Conform pyarrow use of ArrayBuilder::Finish 9a1f77e [Wes McKinney] Add license header to index.rst bd70cab [Wes McKinney] Complete refactoring, fix up IPC tests for flattened string/binary buffer/metadata layout ae64f2e [Wes McKinney] Refactoring to reflect collaprsed list-like structure of Binary and String types. Not yet complete --- cpp/CMakeLists.txt | 1 - cpp/src/arrow/builder.h | 2 +- cpp/src/arrow/ipc/adapter.cc | 47 ++++---- cpp/src/arrow/ipc/test-common.h | 19 ++-- cpp/src/arrow/type.h | 20 +--- cpp/src/arrow/types/CMakeLists.txt | 1 - cpp/src/arrow/types/binary.h | 28 ----- cpp/src/arrow/types/construct.cc | 31 +----- cpp/src/arrow/types/construct.h | 8 -- cpp/src/arrow/types/json.cc | 37 ------- cpp/src/arrow/types/json.h | 36 ------ cpp/src/arrow/types/list-test.cc | 15 ++- cpp/src/arrow/types/list.cc | 42 ++++++- cpp/src/arrow/types/list.h | 49 ++------- cpp/src/arrow/types/primitive-test.cc | 26 +++-- cpp/src/arrow/types/primitive.cc | 31 ++++-- cpp/src/arrow/types/primitive.h | 31 +++--- cpp/src/arrow/types/string-test.cc | 33 +++--- cpp/src/arrow/types/string.cc | 101 ++++++++++++++--- cpp/src/arrow/types/string.h | 49 +++++---- cpp/src/arrow/types/struct-test.cc | 21 +++- cpp/src/arrow/types/struct.cc | 14 +++ cpp/src/arrow/types/struct.h | 17 +-- cpp/src/arrow/util/status.cc | 6 + cpp/src/arrow/util/status.h | 17 ++- python/CMakeLists.txt | 2 - python/doc/index.rst | 18 ++- python/pyarrow/error.pxd | 4 +- python/pyarrow/error.pyx | 10 +- python/pyarrow/includes/pyarrow.pxd | 35 ++---- python/pyarrow/io.pyx | 56 +++++----- python/pyarrow/ipc.pyx | 18 +-- python/pyarrow/parquet.pyx | 14 +-- python/src/pyarrow/adapters/builtin.cc | 39 ++++--- python/src/pyarrow/adapters/builtin.h | 9 +- python/src/pyarrow/adapters/pandas.cc | 32 +++--- python/src/pyarrow/adapters/pandas.h | 15 ++- python/src/pyarrow/api.h | 2 - python/src/pyarrow/common.cc | 12 +- python/src/pyarrow/common.h | 7 -- python/src/pyarrow/io.cc | 59 +++++----- python/src/pyarrow/status.cc | 92 ---------------- python/src/pyarrow/status.h | 146 ------------------------- 43 files changed, 484 insertions(+), 768 deletions(-) delete mode 100644 cpp/src/arrow/types/binary.h delete mode 100644 cpp/src/arrow/types/json.cc delete mode 100644 cpp/src/arrow/types/json.h delete mode 100644 python/src/pyarrow/status.cc delete mode 100644 python/src/pyarrow/status.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d682dc76f8c..6f954830b63 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -681,7 +681,6 @@ set(ARROW_SRCS src/arrow/types/construct.cc src/arrow/types/decimal.cc - src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc src/arrow/types/string.cc diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 646a6f24e9d..cef17e5aaba 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -93,7 +93,7 @@ class ARROW_EXPORT ArrayBuilder { // Creates new array object to hold the contents of the builder and transfers // ownership of the data. This resets all variables on the builder. 
- virtual std::shared_ptr Finish() = 0; + virtual Status Finish(std::shared_ptr* out) = 0; const std::shared_ptr& type() const { return type_; } diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index cd8ab53a31d..f84cb264f70 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -78,22 +78,6 @@ static bool IsPrimitive(const DataType* type) { } } -static bool IsListType(const DataType* type) { - DCHECK(type != nullptr); - switch (type->type) { - // TODO(emkornfield) grouping like this are used in a few places in the - // code consider using pattern like: - // http://stackoverflow.com/questions/26784685/c-macro-for-calling-function-based-on-enum-type - // - case Type::BINARY: - case Type::LIST: - case Type::STRING: - return true; - default: - return false; - } -} - // ---------------------------------------------------------------------- // Record batch write path @@ -115,7 +99,11 @@ Status VisitArray(const Array* arr, std::vector* field_nodes if (IsPrimitive(arr_type)) { const auto prim_arr = static_cast(arr); buffers->push_back(prim_arr->data()); - } else if (IsListType(arr_type)) { + } else if (arr->type_enum() == Type::STRING || arr->type_enum() == Type::BINARY) { + const auto binary_arr = static_cast(arr); + buffers->push_back(binary_arr->offsets()); + buffers->push_back(binary_arr->data()); + } else if (arr->type_enum() == Type::LIST) { const auto list_arr = static_cast(arr); buffers->push_back(list_arr->offset_buffer()); RETURN_NOT_OK(VisitArray( @@ -331,9 +319,21 @@ class RecordBatchReader::RecordBatchReaderImpl { } return MakePrimitiveArray( type, field_meta.length, data, field_meta.null_count, null_bitmap, out); - } + } else if (type->type == Type::STRING || type->type == Type::BINARY) { + std::shared_ptr offsets; + std::shared_ptr values; + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); + RETURN_NOT_OK(GetBuffer(buffer_index_++, &values)); - if (IsListType(type.get())) { + if (type->type == Type::STRING) { + *out = std::make_shared( + field_meta.length, offsets, values, field_meta.null_count, null_bitmap); + } else { + *out = std::make_shared( + field_meta.length, offsets, values, field_meta.null_count, null_bitmap); + } + return Status::OK(); + } else if (type->type == Type::LIST) { std::shared_ptr offsets; RETURN_NOT_OK(GetBuffer(buffer_index_++, &offsets)); const int num_children = type->num_children(); @@ -346,11 +346,10 @@ class RecordBatchReader::RecordBatchReaderImpl { std::shared_ptr values_array; RETURN_NOT_OK( NextArray(type->child(0).get(), max_recursion_depth - 1, &values_array)); - return MakeListArray(type, field_meta.length, offsets, values_array, - field_meta.null_count, null_bitmap, out); - } - - if (type->type == Type::STRUCT) { + *out = std::make_shared(type, field_meta.length, offsets, values_array, + field_meta.null_count, null_bitmap); + return Status::OK(); + } else if (type->type == Type::STRUCT) { const int num_children = type->num_children(); std::vector fields; fields.reserve(num_children); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 7d02bc302f4..13bbbebde8a 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -42,7 +42,7 @@ const auto kListInt32 = std::make_shared(kInt32); const auto kListListInt32 = std::make_shared(kListInt32); Status MakeRandomInt32Array( - int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { 
std::shared_ptr data; test::MakeRandomInt32PoolBuffer(length, pool, &data); const auto kInt32 = std::make_shared(); @@ -52,16 +52,14 @@ Status MakeRandomInt32Array( test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); RETURN_NOT_OK(builder.Append( reinterpret_cast(data->data()), length, valid_bytes->data())); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } RETURN_NOT_OK(builder.Append(reinterpret_cast(data->data()), length)); - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeRandomListArray(const std::shared_ptr& child_array, int num_lists, - bool include_nulls, MemoryPool* pool, std::shared_ptr* array) { + bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { // Create the null list values std::vector valid_lists(num_lists); const double null_percent = include_nulls ? 0.1 : 0; @@ -90,8 +88,8 @@ Status MakeRandomListArray(const std::shared_ptr& child_array, int num_li } ListBuilder builder(pool, child_array); RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); - *array = builder.Finish(); - return (*array)->Validate(); + RETURN_NOT_OK(builder.Finish(out)); + return (*out)->Validate(); } typedef Status MakeRecordBatch(std::shared_ptr* out); @@ -115,7 +113,7 @@ Status MakeIntRecordBatch(std::shared_ptr* out) { template Status MakeRandomBinaryArray( - const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* array) { + const TypePtr& type, int32_t length, MemoryPool* pool, ArrayPtr* out) { const std::vector values = { "", "", "abc", "123", "efg", "456!@#!@#", "12312"}; Builder builder(pool, type); @@ -130,8 +128,7 @@ Status MakeRandomBinaryArray( builder.Append(reinterpret_cast(value.data()), value.size())); } } - *array = builder.Finish(); - return Status::OK(); + return builder.Finish(out); } Status MakeStringTypesRecordBatch(std::shared_ptr* out) { diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index b4c3721a728..ea8516fc347 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -242,7 +242,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType { struct ARROW_EXPORT ListType : public DataType { // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) - : ListType(value_type, Type::LIST) {} + : ListType(std::make_shared("item", value_type)) {} explicit ListType(const std::shared_ptr& value_field) : DataType(Type::LIST) { children_ = {value_field}; @@ -255,26 +255,17 @@ struct ARROW_EXPORT ListType : public DataType { static char const* name() { return "list"; } std::string ToString() const override; - - protected: - // Constructor for classes that are implemented as List Arrays. - ListType(const std::shared_ptr& value_type, Type::type logical_type) - : DataType(logical_type) { - // TODO ARROW-187 this can technically fail, make a constructor method ? - children_ = {std::make_shared("item", value_type)}; - } }; // BinaryType type is reprsents lists of 1-byte values. -struct ARROW_EXPORT BinaryType : public ListType { +struct ARROW_EXPORT BinaryType : public DataType { BinaryType() : BinaryType(Type::BINARY) {} static char const* name() { return "binary"; } std::string ToString() const override; protected: // Allow subclasses to change the logical type. 
- explicit BinaryType(Type::type logical_type) - : ListType(std::shared_ptr(new UInt8Type()), logical_type) {} + explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} }; // UTF encoded strings @@ -284,9 +275,6 @@ struct ARROW_EXPORT StringType : public BinaryType { static char const* name() { return "string"; } std::string ToString() const override; - - protected: - explicit StringType(Type::type logical_type) : BinaryType(logical_type) {} }; struct ARROW_EXPORT StructType : public DataType { @@ -300,7 +288,7 @@ struct ARROW_EXPORT StructType : public DataType { // These will be defined elsewhere template -struct type_traits {}; +struct TypeTraits {}; } // namespace arrow diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 72a8e776646..9f781698982 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -25,7 +25,6 @@ install(FILES construct.h datetime.h decimal.h - json.h list.h primitive.h string.h diff --git a/cpp/src/arrow/types/binary.h b/cpp/src/arrow/types/binary.h deleted file mode 100644 index 201fbb6e795..00000000000 --- a/cpp/src/arrow/types/binary.h +++ /dev/null @@ -1,28 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPES_BINARY_H -#define ARROW_TYPES_BINARY_H - -#include -#include - -#include "arrow/type.h" - -namespace arrow {} // namespace arrow - -#endif // ARROW_TYPES_BINARY_H diff --git a/cpp/src/arrow/types/construct.cc b/cpp/src/arrow/types/construct.cc index 0b71ea96551..67245f8ea1f 100644 --- a/cpp/src/arrow/types/construct.cc +++ b/cpp/src/arrow/types/construct.cc @@ -59,6 +59,7 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr& type, BUILDER_CASE(DOUBLE, DoubleBuilder); BUILDER_CASE(STRING, StringBuilder); + BUILDER_CASE(BINARY, BinaryBuilder); case Type::LIST: { std::shared_ptr value_builder; @@ -105,10 +106,10 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, MAKE_PRIMITIVE_ARRAY_CASE(INT32, Int32Array); MAKE_PRIMITIVE_ARRAY_CASE(UINT64, UInt64Array); MAKE_PRIMITIVE_ARRAY_CASE(INT64, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); - MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(FLOAT, FloatArray); MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray); + MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array); + MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray); MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray); default: return Status::NotImplemented(type->ToString()); @@ -120,30 +121,4 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length, #endif } -Status MakeListArray(const TypePtr& type, int32_t length, - const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr& null_bitmap, ArrayPtr* out) { - switch (type->type) { - case Type::BINARY: - out->reset(new BinaryArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::LIST: - out->reset(new ListArray(type, length, offsets, values, null_count, null_bitmap)); - break; - - case Type::DECIMAL_TEXT: - case Type::STRING: - out->reset(new StringArray(type, length, offsets, values, null_count, null_bitmap)); - break; - default: - return Status::NotImplemented(type->ToString()); - } -#ifdef NDEBUG - return Status::OK(); -#else - return (*out)->Validate(); -#endif -} - } // namespace arrow diff --git a/cpp/src/arrow/types/construct.h b/cpp/src/arrow/types/construct.h index afdadbe0790..e18e946d1a6 100644 --- a/cpp/src/arrow/types/construct.h +++ b/cpp/src/arrow/types/construct.h @@ -42,14 +42,6 @@ Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr& type, int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap, std::shared_ptr* out); -// Create new list arrays for logical types that are backed by ListArrays (e.g. list of -// primitives and strings) -// TODO(emkornfield) split up string vs list? -Status ARROW_EXPORT MakeListArray(const std::shared_ptr& type, int32_t length, - const std::shared_ptr& offests, const std::shared_ptr& values, - int32_t null_count, const std::shared_ptr& null_bitmap, - std::shared_ptr* out); - } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/types/json.cc b/cpp/src/arrow/types/json.cc deleted file mode 100644 index 89240fc22bb..00000000000 --- a/cpp/src/arrow/types/json.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. 
The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "arrow/types/json.h" - -#include - -#include "arrow/type.h" -#include "arrow/types/union.h" - -namespace arrow { - -static const TypePtr Null(new NullType()); -static const TypePtr Int32(new Int32Type()); -static const TypePtr String(new StringType()); -static const TypePtr Double(new DoubleType()); -static const TypePtr Bool(new BooleanType()); - -static const std::vector kJsonTypes = {Null, Int32, String, Double, Bool}; -TypePtr JSONScalar::dense_type = TypePtr(new DenseUnionType(kJsonTypes)); -TypePtr JSONScalar::sparse_type = TypePtr(new SparseUnionType(kJsonTypes)); - -} // namespace arrow diff --git a/cpp/src/arrow/types/json.h b/cpp/src/arrow/types/json.h deleted file mode 100644 index 9de961f79a6..00000000000 --- a/cpp/src/arrow/types/json.h +++ /dev/null @@ -1,36 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPES_JSON_H -#define ARROW_TYPES_JSON_H - -#include "arrow/type.h" - -namespace arrow { - -struct JSONScalar : public DataType { - bool dense; - - static TypePtr dense_type; - static TypePtr sparse_type; - - explicit JSONScalar(bool dense = true) : DataType(Type::JSON_SCALAR), dense(dense) {} -}; - -} // namespace arrow - -#endif // ARROW_TYPES_JSON_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 2e41b4a61ca..12c539495a2 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -76,7 +76,11 @@ class TestListBuilder : public TestBuilder { builder_ = std::dynamic_pointer_cast(tmp); } - void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + void Done() { + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast(out); + } protected: TypePtr value_type_; @@ -98,14 +102,17 @@ TEST_F(TestListBuilder, Equality) { // setup two equal arrays ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&array)); ASSERT_OK(builder_->Append(equal_offsets.data(), equal_offsets.size())); ASSERT_OK(vb->Append(equal_values.data(), equal_values.size())); - equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&equal_array)); // now an unequal one ASSERT_OK(builder_->Append(unequal_offsets.data(), unequal_offsets.size())); ASSERT_OK(vb->Append(unequal_values.data(), unequal_values.size())); - unequal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_array)); // Test array equality EXPECT_TRUE(array->Equals(array)); diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 6334054caf8..ef2ec22cb53 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -25,7 +25,7 @@ bool ListArray::EqualsExact(const ListArray& other) const { if (null_count_ != other.null_count_) { return false; } bool equal_offsets = - offset_buf_->Equals(*other.offset_buf_, (length_ + 1) * sizeof(int32_t)); + offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); if (!equal_offsets) { return false; } bool equal_null_bitmap = true; if (null_count_ > 0) { @@ -72,10 +72,10 @@ bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st Status ListArray::Validate() const { if (length_ < 0) { return Status::Invalid("Length was negative"); } - if (!offset_buf_) { return Status::Invalid("offset_buf_ was null"); } - if (offset_buf_->size() / static_cast(sizeof(int32_t)) < length_) { + if (!offset_buffer_) { return Status::Invalid("offset_buffer_ was null"); } + if (offset_buffer_->size() / static_cast(sizeof(int32_t)) < length_) { std::stringstream ss; - ss << "offset buffer size (bytes): " << offset_buf_->size() + ss << "offset buffer size (bytes): " << offset_buffer_->size() << " isn't large enough for length: " << length_; return Status::Invalid(ss.str()); } @@ -121,4 +121,38 @@ Status ListArray::Validate() const { return Status::OK(); } +Status ListBuilder::Init(int32_t elements) { + DCHECK_LT(elements, std::numeric_limits::max()); + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more then requested for offsets + return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); +} + +Status ListBuilder::Resize(int32_t capacity) { + DCHECK_LT(capacity, std::numeric_limits::max()); + // one more then requested for offsets + 
RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status ListBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr items = values_; + if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); } + + RETURN_NOT_OK(offset_builder_.Append(items->length())); + std::shared_ptr offsets = offset_builder_.Finish(); + + *out = std::make_shared( + type_, length_, offsets, items, null_count_, null_bitmap_); + + Reset(); + + return Status::OK(); +} + +void ListBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index f3894510d09..9440ffed4bf 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -43,9 +43,9 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& values, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) : Array(type, length, null_count, null_bitmap) { - offset_buf_ = offsets; - offsets_ = offsets == nullptr ? nullptr - : reinterpret_cast(offset_buf_->data()); + offset_buffer_ = offsets; + offsets_ = offsets == nullptr ? nullptr : reinterpret_cast( + offset_buffer_->data()); values_ = values; } @@ -57,7 +57,7 @@ class ARROW_EXPORT ListArray : public Array { // with this array. const std::shared_ptr& values() const { return values_; } const std::shared_ptr offset_buffer() const { - return std::static_pointer_cast(offset_buf_); + return std::static_pointer_cast(offset_buffer_); } const std::shared_ptr& value_type() const { return values_->type(); } @@ -77,7 +77,7 @@ class ARROW_EXPORT ListArray : public Array { const ArrayPtr& arr) const override; protected: - std::shared_ptr offset_buf_; + std::shared_ptr offset_buffer_; const int32_t* offsets_; ArrayPtr values_; }; @@ -119,19 +119,9 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { virtual ~ListBuilder() {} - Status Init(int32_t elements) override { - DCHECK_LT(elements, std::numeric_limits::max()); - RETURN_NOT_OK(ArrayBuilder::Init(elements)); - // one more then requested for offsets - return offset_builder_.Resize((elements + 1) * sizeof(int32_t)); - } - - Status Resize(int32_t capacity) override { - DCHECK_LT(capacity, std::numeric_limits::max()); - // one more then requested for offsets - RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int32_t))); - return ArrayBuilder::Resize(capacity); - } + Status Init(int32_t elements) override; + Status Resize(int32_t capacity) override; + Status Finish(std::shared_ptr* out) override; // Vector append // @@ -145,27 +135,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { return Status::OK(); } - // The same as Finalize but allows for overridding the c++ type - template - std::shared_ptr Transfer() { - std::shared_ptr items = values_; - if (!items) { items = value_builder_->Finish(); } - - offset_builder_.Append(items->length()); - - const auto offsets_buffer = offset_builder_.Finish(); - auto result = std::make_shared( - type_, length_, offsets_buffer, items, null_count_, null_bitmap_); - - // TODO(emkornfield) make a reset method - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; - - return result; - } - - std::shared_ptr Finish() override { return Transfer(); } - // Start a new variable-length list slot // // This function should be called before beginning to append elements to the @@ -188,6 +157,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { BufferBuilder offset_builder_; std::shared_ptr 
value_builder_; std::shared_ptr values_; + + void Reset(); }; } // namespace arrow diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 5ac2867932d..121bd4794f2 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -123,8 +123,11 @@ class TestPrimitiveBuilder : public TestBuilder { auto expected = std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr result = - std::dynamic_pointer_cast(builder->Finish()); + + std::shared_ptr out; + ASSERT_OK(builder->Finish(&out)); + + std::shared_ptr result = std::dynamic_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -216,8 +219,10 @@ void TestPrimitiveBuilder::Check( auto expected = std::make_shared(size, ex_data, ex_null_count, ex_null_bitmap); - std::shared_ptr result = - std::dynamic_pointer_cast(builder->Finish()); + + std::shared_ptr out; + ASSERT_OK(builder->Finish(&out)); + std::shared_ptr result = std::dynamic_pointer_cast(out); // Builder is now reset ASSERT_EQ(0, builder->length()); @@ -254,7 +259,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { int n = 1000; ASSERT_OK(this->builder_->Reserve(n)); ASSERT_EQ(util::next_power2(n), this->builder_->capacity()); - ASSERT_EQ(util::next_power2(type_traits::bytes_required(n)), + ASSERT_EQ(util::next_power2(TypeTraits::bytes_required(n)), this->builder_->data()->size()); // unsure if this should go in all builder classes @@ -267,7 +272,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendNull) { ASSERT_OK(this->builder_->AppendNull()); } - auto result = this->builder_->Finish(); + std::shared_ptr result; + ASSERT_OK(this->builder_->Finish(&result)); for (int i = 0; i < size; ++i) { ASSERT_TRUE(result->IsNull(i)) << i; @@ -298,7 +304,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { } do { - std::shared_ptr result = this->builder_->Finish(); + std::shared_ptr result; + ASSERT_OK(this->builder_->Finish(&result)); } while (false); ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); @@ -315,8 +322,7 @@ Status MakeArray(const vector& valid_bytes, const vector& draws, int RETURN_NOT_OK(builder->AppendNull()); } } - *out = builder->Finish(); - return Status::OK(); + return builder->Finish(out); } TYPED_TEST(TestPrimitiveBuilder, Equality) { @@ -465,7 +471,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_OK(this->builder_->Reserve(cap)); ASSERT_EQ(cap, this->builder_->capacity()); - ASSERT_EQ(type_traits::bytes_required(cap), this->builder_->data()->size()); + ASSERT_EQ(TypeTraits::bytes_required(cap), this->builder_->data()->size()); ASSERT_EQ(util::bytes_for_bits(cap), this->builder_->null_bitmap()->size()); } diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 9ba2ebdcc2d..3a05ccfdf18 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -69,12 +69,25 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; +template class NumericArray; + template Status PrimitiveBuilder::Init(int32_t capacity) { RETURN_NOT_OK(ArrayBuilder::Init(capacity)); data_ = std::make_shared(pool_); - int64_t nbytes 
= type_traits<T>::bytes_required(capacity);
+  int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
   RETURN_NOT_OK(data_->Resize(nbytes));
   // TODO(emkornfield) valgrind complains without this
   memset(data_->mutable_data(), 0, nbytes);
@@ -93,10 +106,9 @@ Status PrimitiveBuilder<T>::Resize(int32_t capacity) {
   } else {
     RETURN_NOT_OK(ArrayBuilder::Resize(capacity));
     const int64_t old_bytes = data_->size();
-    const int64_t new_bytes = type_traits<T>::bytes_required(capacity);
+    const int64_t new_bytes = TypeTraits<T>::bytes_required(capacity);
     RETURN_NOT_OK(data_->Resize(new_bytes));
     raw_data_ = reinterpret_cast<value_type*>(data_->mutable_data());
-    memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
   }

   return Status::OK();
@@ -108,7 +120,7 @@ Status PrimitiveBuilder<T>::Append(
   RETURN_NOT_OK(Reserve(length));

   if (length > 0) {
-    memcpy(raw_data_ + length_, values, type_traits<T>::bytes_required(length));
+    memcpy(raw_data_ + length_, values, TypeTraits<T>::bytes_required(length));
   }

   // length_ is updated by these
@@ -118,13 +130,18 @@
 }

 template <typename T>
-std::shared_ptr<Array> PrimitiveBuilder<T>::Finish() {
-  std::shared_ptr<Array> result = std::make_shared<typename type_traits<T>::ArrayType>(
+Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
+  const int64_t bytes_required = TypeTraits<T>::bytes_required(length_);
+  if (bytes_required > 0 && bytes_required < data_->size()) {
+    // Trim buffers
+    RETURN_NOT_OK(data_->Resize(bytes_required));
+  }
+  *out = std::make_shared<typename TypeTraits<T>::ArrayType>(
       type_, length_, data_, null_count_, null_bitmap_);

   data_ = null_bitmap_ = nullptr;
   capacity_ = length_ = null_count_ = 0;
-  return result;
+  return Status::OK();
 }

 template <>
diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h
index c643783f681..f21470d96e4 100644
--- a/cpp/src/arrow/types/primitive.h
+++ b/cpp/src/arrow/types/primitive.h
@@ -91,7 +91,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
   value_type Value(int i) const { return raw_data()[i]; }
 };

-#define NUMERIC_ARRAY_DECL(NAME, TypeClass) using NAME = NumericArray<TypeClass>;
+#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \
+  using NAME = NumericArray<TypeClass>;     \
+  extern template class ARROW_EXPORT NumericArray<TypeClass>;

 NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type);
 NUMERIC_ARRAY_DECL(Int8Array, Int8Type);
@@ -139,8 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
   Status Append(
       const value_type* values, int32_t length, const uint8_t* valid_bytes = nullptr);

-  std::shared_ptr<Array> Finish() override;
-
+  Status Finish(std::shared_ptr<Array>* out) override;
   Status Init(int32_t capacity) override;

   // Increase the capacity of the builder to accommodate at least the indicated
@@ -183,77 +184,77 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder<T> {
 };

 template <>
-struct type_traits<UInt8Type> {
+struct TypeTraits<UInt8Type> {
   typedef UInt8Array ArrayType;

   static inline int bytes_required(int elements) { return elements; }
 };

 template <>
-struct type_traits<Int8Type> {
+struct TypeTraits<Int8Type> {
   typedef Int8Array ArrayType;

   static inline int bytes_required(int elements) { return elements; }
 };

 template <>
-struct type_traits<UInt16Type> {
+struct TypeTraits<UInt16Type> {
   typedef UInt16Array ArrayType;

   static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); }
 };

 template <>
-struct type_traits<Int16Type> {
+struct TypeTraits<Int16Type> {
   typedef Int16Array ArrayType;

   static inline int bytes_required(int elements) { return elements * sizeof(int16_t); }
 };

 template <>
-struct type_traits<UInt32Type> {
+struct TypeTraits<UInt32Type> {
   typedef UInt32Array ArrayType;

   static inline int bytes_required(int elements) { return elements *
sizeof(uint32_t); } }; template <> -struct type_traits { +struct TypeTraits { typedef Int32Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } }; template <> -struct type_traits { +struct TypeTraits { typedef UInt64Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } }; template <> -struct type_traits { +struct TypeTraits { typedef Int64Array ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; template <> -struct type_traits { +struct TypeTraits { typedef TimestampArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; template <> -struct type_traits { +struct TypeTraits { typedef FloatArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(float); } }; template <> -struct type_traits { +struct TypeTraits { typedef DoubleArray ArrayType; static inline int bytes_required(int elements) { return elements * sizeof(double); } @@ -293,7 +294,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { }; template <> -struct type_traits { +struct TypeTraits { typedef BooleanArray ArrayType; static inline int bytes_required(int elements) { diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 6807b00e8ca..d897e30a3c6 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -66,18 +66,13 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int nchars = chars_.size(); - value_buf_ = test::to_buffer(chars_); - values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = test::to_buffer(offsets_); - null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } protected: @@ -94,7 +89,6 @@ class TestStringContainer : public ::testing::Test { int null_count_; int length_; - ArrayPtr values_; std::shared_ptr strings_; }; @@ -122,7 +116,7 @@ TEST_F(TestStringContainer, TestListFunctions) { TEST_F(TestStringContainer, TestDestructor) { auto arr = std::make_shared( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } TEST_F(TestStringContainer, TestGetString) { @@ -147,7 +141,10 @@ class TestStringBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast(out); result_->Validate(); } @@ -178,7 +175,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) { ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; int32_t pos = 0; @@ -218,18 +215,14 @@ class TestBinaryContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - int nchars = chars_.size(); - value_buf_ = test::to_buffer(chars_); - values_ = ArrayPtr(new UInt8Array(nchars, value_buf_)); - offsets_buf_ = test::to_buffer(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); strings_ = std::make_shared( - length_, offsets_buf_, values_, null_count_, 
null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } protected: @@ -246,7 +239,6 @@ class TestBinaryContainer : public ::testing::Test { int null_count_; int length_; - ArrayPtr values_; std::shared_ptr strings_; }; @@ -274,7 +266,7 @@ TEST_F(TestBinaryContainer, TestListFunctions) { TEST_F(TestBinaryContainer, TestDestructor) { auto arr = std::make_shared( - length_, offsets_buf_, values_, null_count_, null_bitmap_); + length_, offsets_buf_, value_buf_, null_count_, null_bitmap_); } TEST_F(TestBinaryContainer, TestGetValue) { @@ -298,7 +290,10 @@ class TestBinaryBuilder : public TestBuilder { } void Done() { - result_ = std::dynamic_pointer_cast(builder_->Finish()); + std::shared_ptr out; + EXPECT_OK(builder_->Finish(&out)); + + result_ = std::dynamic_pointer_cast(out); result_->Validate(); } @@ -330,7 +325,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { ASSERT_OK(result_->Validate()); ASSERT_EQ(reps * N, result_->length()); ASSERT_EQ(reps, result_->null_count()); - ASSERT_EQ(reps * 6, result_->values()->length()); + ASSERT_EQ(reps * 6, result_->data()->size()); int32_t length; for (int i = 0; i < N * reps; ++i) { diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index 745ed8f7edb..d692e13773f 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -17,6 +17,7 @@ #include "arrow/types/string.h" +#include #include #include @@ -24,37 +25,77 @@ namespace arrow { -const std::shared_ptr BINARY(new BinaryType()); -const std::shared_ptr STRING(new StringType()); +static std::shared_ptr kBinary = std::make_shared(); +static std::shared_ptr kString = std::make_shared(); BinaryArray::BinaryArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count, + const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) - : BinaryArray(BINARY, length, offsets, values, null_count, null_bitmap) {} + : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {} BinaryArray::BinaryArray(const TypePtr& type, int32_t length, - const std::shared_ptr& offsets, const ArrayPtr& values, int32_t null_count, - const std::shared_ptr& null_bitmap) - : ListArray(type, length, offsets, values, null_count, null_bitmap), - bytes_(std::dynamic_pointer_cast(values).get()), - raw_bytes_(bytes_->raw_data()) { - // Check in case the dynamic cast fails. - DCHECK(bytes_); + const std::shared_ptr& offsets, const std::shared_ptr& data, + int32_t null_count, const std::shared_ptr& null_bitmap) + : Array(type, length, null_count, null_bitmap), + offset_buffer_(offsets), + offsets_(reinterpret_cast(offset_buffer_->data())), + data_buffer_(data), + data_(nullptr) { + if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); } } Status BinaryArray::Validate() const { - if (values()->null_count() > 0) { - std::stringstream ss; - ss << type()->ToString() << " can have null values in the value array"; - Status::Invalid(ss.str()); + // TODO(wesm): what to do here? 
+  return Status::OK();
+}
+
+bool BinaryArray::EqualsExact(const BinaryArray& other) const {
+  if (!Array::EqualsExact(other)) { return false; }
+
+  bool equal_offsets =
+      offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t));
+  if (!equal_offsets) { return false; }
+
+  return data_buffer_->Equals(*other.data_buffer_, data_buffer_->size());
+}
+
+bool BinaryArray::Equals(const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  return EqualsExact(*static_cast<const BinaryArray*>(arr.get()));
+}
+
+bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+    const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  const auto other = static_cast<const BinaryArray*>(arr.get());
+  for (int32_t i = start_idx, o_i = other_start_idx; i < end_idx; ++i, ++o_i) {
+    const bool is_null = IsNull(i);
+    if (is_null != arr->IsNull(o_i)) { return false; }
+    if (is_null) continue;
+    const int32_t begin_offset = offset(i);
+    const int32_t end_offset = offset(i + 1);
+    const int32_t other_begin_offset = other->offset(o_i);
+    const int32_t other_end_offset = other->offset(o_i + 1);
+    // Underlying can't be equal if the size isn't equal
+    if (end_offset - begin_offset != other_end_offset - other_begin_offset) {
+      return false;
+    }
+
+    if (std::memcmp(data_ + begin_offset, other->data_ + other_begin_offset,
+            end_offset - begin_offset)) {
+      return false;
+    }
   }
-  return ListArray::Validate();
+  return true;
 }

 StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
-    const ArrayPtr& values, int32_t null_count,
+    const std::shared_ptr<Buffer>& data, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap)
-    : StringArray(STRING, length, offsets, values, null_count, null_bitmap) {}
+    : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {}

 Status StringArray::Validate() const {
   // TODO(emkornfield) Validate proper UTF8 code points?
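The equality methods above compare variable-width slots through the offsets buffer: slot i occupies the byte range [offset(i), offset(i + 1)) of the data buffer, so two slots match exactly when their spans have equal length and equal bytes. A self-contained sketch of that layout and the per-slot comparison, with illustrative values and helper names rather than the Arrow API itself:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  // Three slots: "ab", a null (zero-length span), "cde".
  const char data[] = "abcde";
  const std::vector<int32_t> offsets = {0, 2, 2, 5};  // length + 1 entries
  const std::vector<bool> valid = {true, false, true};

  // Slot-wise comparison in the style of RangeEquals: compare spans, not
  // whole buffers, so arrays can be equal even when their data buffers
  // differ in unused trailing bytes.
  auto slot_equals = [&](int i, const char* other_data,
                         const std::vector<int32_t>& other_offsets) {
    const int32_t len = offsets[i + 1] - offsets[i];
    const int32_t other_len = other_offsets[i + 1] - other_offsets[i];
    if (len != other_len) return false;
    return std::memcmp(data + offsets[i], other_data + other_offsets[i], len) == 0;
  };

  assert(slot_equals(0, data, offsets));              // "ab" == "ab"
  assert(offsets[2] - offsets[1] == 0 && !valid[1]);  // null slot spans no bytes
  assert(slot_equals(2, data, offsets));              // "cde" == "cde"
  return 0;
}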
@@ -72,4 +113,28 @@ BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type) byte_builder_ = static_cast(value_builder_.get()); } +Status BinaryBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast(result); + auto values = std::dynamic_pointer_cast(list->values()); + + *out = std::make_shared(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + +Status StringBuilder::Finish(std::shared_ptr* out) { + std::shared_ptr result; + RETURN_NOT_OK(ListBuilder::Finish(&result)); + + const auto list = std::dynamic_pointer_cast(result); + auto values = std::dynamic_pointer_cast(list->values()); + + *out = std::make_shared(list->length(), list->offset_buffer(), + values->data(), list->null_count(), list->null_bitmap()); + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index bab0c58f617..aaba49c6023 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -35,15 +35,16 @@ namespace arrow { class Buffer; class MemoryPool; -class ARROW_EXPORT BinaryArray : public ListArray { +class ARROW_EXPORT BinaryArray : public Array { public: BinaryArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); + // Constructor that allows sub-classes/builders to propagate there logical type up the // class hierarchy. BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); // Return the pointer to the given elements bytes @@ -53,28 +54,38 @@ class ARROW_EXPORT BinaryArray : public ListArray { DCHECK(out_length); const int32_t pos = offsets_[i]; *out_length = offsets_[i + 1] - pos; - return raw_bytes_ + pos; + return data_ + pos; } + std::shared_ptr data() const { return data_buffer_; } + std::shared_ptr offsets() const { return offset_buffer_; } + + int32_t offset(int i) const { return offsets_[i]; } + + // Neither of these functions will perform boundschecking + int32_t value_offset(int i) const { return offsets_[i]; } + int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; } + + bool EqualsExact(const BinaryArray& other) const; + bool Equals(const std::shared_ptr& arr) const override; + bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, + const ArrayPtr& arr) const override; + Status Validate() const override; private: - UInt8Array* bytes_; - const uint8_t* raw_bytes_; + std::shared_ptr offset_buffer_; + const int32_t* offsets_; + + std::shared_ptr data_buffer_; + const uint8_t* data_; }; class ARROW_EXPORT StringArray : public BinaryArray { public: StringArray(int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, + const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); - // Constructor that allows overriding the logical type, so subclasses can propagate - // there - // up the class hierarchy. 
- StringArray(const TypePtr& type, int32_t length, const std::shared_ptr& offsets, - const ArrayPtr& values, int32_t null_count = 0, - const std::shared_ptr& null_bitmap = nullptr) - : BinaryArray(type, length, offsets, values, null_count, null_bitmap) {} // Construct a std::string // TODO: std::bad_alloc possibility @@ -98,9 +109,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return byte_builder_->Append(value, length); } - std::shared_ptr Finish() override { - return ListBuilder::Transfer(); - } + Status Finish(std::shared_ptr* out) override; protected: UInt8Builder* byte_builder_; @@ -112,6 +121,8 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : BinaryBuilder(pool, type) {} + Status Finish(std::shared_ptr* out) override; + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } Status Append(const char* value, int32_t length) { @@ -119,10 +130,6 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { } Status Append(const std::vector& values, uint8_t* null_bytes); - - std::shared_ptr Finish() override { - return ListBuilder::Transfer(); - } }; } // namespace arrow diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index ccf5a52dc83..8e82c389a94 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -119,7 +119,11 @@ class TestStructBuilder : public TestBuilder { ASSERT_EQ(2, static_cast(builder_->field_builders().size())); } - void Done() { result_ = std::dynamic_pointer_cast(builder_->Finish()); } + void Done() { + std::shared_ptr out; + ASSERT_OK(builder_->Finish(&out)); + result_ = std::dynamic_pointer_cast(out); + } protected: std::vector value_fields_; @@ -294,7 +298,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -308,7 +313,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - equal_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&equal_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -323,7 +329,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - unequal_bitmap_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_bitmap_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -339,7 +346,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : int_values) { int_vb->UnsafeAppend(value); } - unequal_offsets_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_offsets_array)); ASSERT_OK(builder_->Resize(list_lengths.size())); ASSERT_OK(char_vb->Resize(list_values.size())); @@ -354,7 +362,8 @@ TEST_F(TestStructBuilder, TestEquality) { for (int32_t value : unequal_int_values) { int_vb->UnsafeAppend(value); } - unequal_values_array = builder_->Finish(); + + ASSERT_OK(builder_->Finish(&unequal_values_array)); // Test array equality EXPECT_TRUE(array->Equals(array)); diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index e8176f08268..369c29d15ef 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -87,4 +87,18 @@ Status 
StructArray::Validate() const {
   return Status::OK();
 }

+Status StructBuilder::Finish(std::shared_ptr<Array>* out) {
+  std::vector<std::shared_ptr<Array>> fields(field_builders_.size());
+  for (size_t i = 0; i < field_builders_.size(); ++i) {
+    RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i]));
+  }
+
+  *out = std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_);
+
+  null_bitmap_ = nullptr;
+  capacity_ = length_ = null_count_ = 0;
+
+  return Status::OK();
+}
+
 } // namespace arrow
diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h
index 63955eb31bb..65b8daf214a 100644
--- a/cpp/src/arrow/types/struct.h
+++ b/cpp/src/arrow/types/struct.h
@@ -73,6 +73,8 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
     field_builders_ = field_builders;
   }

+  Status Finish(std::shared_ptr<Array>* out) override;
+
   // Null bitmap is of equal length to every child field, and any zero byte
   // will be considered as a null for that field, but users must use the app-
   // end methods or advance methods of the child builders independently to
@@ -83,21 +85,6 @@
     return Status::OK();
   }

-  std::shared_ptr<Array> Finish() override {
-    std::vector<ArrayPtr> fields;
-    for (auto it : field_builders_) {
-      fields.push_back(it->Finish());
-    }
-
-    auto result =
-        std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_);
-
-    null_bitmap_ = nullptr;
-    capacity_ = length_ = null_count_ = 0;
-
-    return result;
-  }
-
   // Append an element to the Struct. All child-builders' Append method must
   // be called independently to maintain data-structure consistency.
   Status Append(bool is_valid = true) {
diff --git a/cpp/src/arrow/util/status.cc b/cpp/src/arrow/util/status.cc
index 8dd07d0d064..08e9ae3946e 100644
--- a/cpp/src/arrow/util/status.cc
+++ b/cpp/src/arrow/util/status.cc
@@ -49,12 +49,18 @@ std::string Status::CodeAsString() const {
     case StatusCode::KeyError:
       type = "Key error";
       break;
+    case StatusCode::TypeError:
+      type = "Type error";
+      break;
     case StatusCode::Invalid:
       type = "Invalid";
       break;
     case StatusCode::IOError:
       type = "IOError";
       break;
+    case StatusCode::UnknownError:
+      type = "Unknown error";
+      break;
     case StatusCode::NotImplemented:
       type = "NotImplemented";
       break;
diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h
index d5585313c72..05f5b749b60 100644
--- a/cpp/src/arrow/util/status.h
+++ b/cpp/src/arrow/util/status.h
@@ -78,9 +78,10 @@ enum class StatusCode : char {
   OK = 0,
   OutOfMemory = 1,
   KeyError = 2,
-  Invalid = 3,
-  IOError = 4,
-
+  TypeError = 3,
+  Invalid = 4,
+  IOError = 5,
+  UnknownError = 9,
   NotImplemented = 10,
 };

@@ -106,6 +107,14 @@ class ARROW_EXPORT Status {
     return Status(StatusCode::KeyError, msg, -1);
   }

+  static Status TypeError(const std::string& msg) {
+    return Status(StatusCode::TypeError, msg, -1);
+  }
+
+  static Status UnknownError(const std::string& msg) {
+    return Status(StatusCode::UnknownError, msg, -1);
+  }
+
   static Status NotImplemented(const std::string& msg) {
     return Status(StatusCode::NotImplemented, msg, -1);
   }
@@ -125,6 +134,8 @@
   bool IsKeyError() const { return code() == StatusCode::KeyError; }
   bool IsInvalid() const { return code() == StatusCode::Invalid; }
   bool IsIOError() const { return code() == StatusCode::IOError; }
+
+  bool IsUnknownError() const { return code() == StatusCode::UnknownError; }
   bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; }

   // Return a string representation of this status suitable for printing.
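Every Finish implementation above propagates child failures through RETURN_NOT_OK rather than returning a bare pointer, and the Status class just gained TypeError and UnknownError codes. A minimal, hedged sketch of the idiom follows; this simplified Status holds a plain string, whereas Arrow's packs code and message into a single allocation (as the deleted pyarrow copy further down illustrates):

#include <iostream>
#include <string>

// Stand-in Status with the same factory-method surface as the real one.
struct Status {
  std::string msg;
  bool ok() const { return msg.empty(); }
  static Status OK() { return {}; }
  static Status Invalid(const std::string& m) { return {"Invalid: " + m}; }
  std::string ToString() const { return ok() ? "OK" : msg; }
};

// Evaluate once, return early on failure; the workhorse of every Finish().
#define RETURN_NOT_OK(s)         \
  do {                           \
    Status _s = (s);             \
    if (!_s.ok()) { return _s; } \
  } while (0)

Status CheckPositive(int x) {
  if (x <= 0) { return Status::Invalid("not positive"); }
  return Status::OK();
}

// Each step short-circuits, like StructBuilder::Finish finishing every
// child builder in turn and stopping at the first error.
Status Pipeline(int x) {
  RETURN_NOT_OK(CheckPositive(x));
  RETURN_NOT_OK(CheckPositive(x - 1));
  return Status::OK();
}

int main() {
  std::cout << Pipeline(1).ToString() << "\n";  // Invalid: not positive
  std::cout << Pipeline(2).ToString() << "\n";  // OK
  return 0;
}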
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 55f6d0543a1..4357fa05ff8 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -438,8 +438,6 @@ set(PYARROW_SRCS src/pyarrow/config.cc src/pyarrow/helpers.cc src/pyarrow/io.cc - src/pyarrow/status.cc - src/pyarrow/adapters/builtin.cc src/pyarrow/adapters/pandas.cc ) diff --git a/python/doc/index.rst b/python/doc/index.rst index 550e544eef9..88725badc1e 100644 --- a/python/doc/index.rst +++ b/python/doc/index.rst @@ -1,3 +1,20 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + Apache Arrow (Python) ===================== @@ -25,4 +42,3 @@ Indices and tables * :ref:`genindex` * :ref:`modindex` * :ref:`search` - diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index 891d1ac1c7e..4fb46c25faf 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -16,7 +16,5 @@ # under the License. from pyarrow.includes.libarrow cimport CStatus -from pyarrow.includes.pyarrow cimport PyStatus -cdef int check_cstatus(const CStatus& status) nogil except -1 -cdef int check_status(const PyStatus& status) nogil except -1 +cdef int check_status(const CStatus& status) nogil except -1 diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index a2c53fed8c6..b8a82b3754c 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -22,15 +22,7 @@ from pyarrow.compat import frombytes class ArrowException(Exception): pass -cdef int check_cstatus(const CStatus& status) nogil except -1: - if status.ok(): - return 0 - - cdef c_string c_message = status.ToString() - with gil: - raise ArrowException(frombytes(c_message)) - -cdef int check_status(const PyStatus& status) nogil except -1: +cdef int check_status(const CStatus& status) nogil except -1: if status.ok(): return 0 diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 7c47f21854e..e1da1914c57 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -25,36 +25,19 @@ cimport pyarrow.includes.libarrow_io as arrow_io cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: - # We can later add more of the common status factory methods as needed - cdef PyStatus PyStatus_OK "Status::OK"() - - cdef cppclass PyStatus "pyarrow::Status": - PyStatus() - - c_string ToString() - - c_bool ok() - c_bool IsOutOfMemory() - c_bool IsKeyError() - c_bool IsTypeError() - c_bool IsIOError() - c_bool IsValueError() - c_bool IsNotImplemented() - c_bool IsArrowError() - shared_ptr[CDataType] GetPrimitiveType(Type type) - PyStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) + CStatus ConvertPySequence(object obj, shared_ptr[CArray]* out) - PyStatus PandasToArrow(MemoryPool* pool, object ao, - shared_ptr[CArray]* out) - PyStatus 
PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, - shared_ptr[CArray]* out) + CStatus PandasToArrow(MemoryPool* pool, object ao, + shared_ptr[CArray]* out) + CStatus PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, + shared_ptr[CArray]* out) - PyStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, - object py_ref, PyObject** out) + CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, + object py_ref, PyObject** out) - PyStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) + CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, + object py_ref, PyObject** out) MemoryPool* get_memory_pool() diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 8970e06effd..16ebfa1138e 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -28,7 +28,7 @@ cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.libarrow_io cimport * from pyarrow.compat import frombytes, tobytes -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status cimport cpython as cp @@ -57,9 +57,9 @@ cdef class NativeFile: if self.is_open: with nogil: if self.is_readonly: - check_cstatus(self.rd_file.get().Close()) + check_status(self.rd_file.get().Close()) else: - check_cstatus(self.wr_file.get().Close()) + check_status(self.wr_file.get().Close()) self.is_open = False cdef read_handle(self, shared_ptr[ReadableFileInterface]* file): @@ -88,22 +88,22 @@ cdef class NativeFile: cdef int64_t size self._assert_readable() with nogil: - check_cstatus(self.rd_file.get().GetSize(&size)) + check_status(self.rd_file.get().GetSize(&size)) return size def tell(self): cdef int64_t position with nogil: if self.is_readonly: - check_cstatus(self.rd_file.get().Tell(&position)) + check_status(self.rd_file.get().Tell(&position)) else: - check_cstatus(self.wr_file.get().Tell(&position)) + check_status(self.wr_file.get().Tell(&position)) return position def seek(self, int64_t position): self._assert_readable() with nogil: - check_cstatus(self.rd_file.get().Seek(position)) + check_status(self.rd_file.get().Seek(position)) def write(self, data): """ @@ -116,7 +116,7 @@ cdef class NativeFile: cdef const uint8_t* buf = cp.PyBytes_AS_STRING(data) cdef int64_t bufsize = len(data) with nogil: - check_cstatus(self.wr_file.get().Write(buf, bufsize)) + check_status(self.wr_file.get().Write(buf, bufsize)) def read(self, int nbytes): cdef: @@ -127,8 +127,7 @@ cdef class NativeFile: self._assert_readable() with nogil: - check_cstatus(self.rd_file.get() - .ReadB(nbytes, &out)) + check_status(self.rd_file.get().ReadB(nbytes, &out)) result = cp.PyBytes_FromStringAndSize( out.get().data(), out.get().size()) @@ -223,7 +222,7 @@ cdef class InMemoryOutputStream(NativeFile): def get_result(self): cdef Buffer result = Buffer() - check_cstatus(self.wr_file.get().Close()) + check_status(self.wr_file.get().Close()) result.init( self.buffer) self.is_open = False @@ -270,7 +269,7 @@ except ImportError: def have_libhdfs(): try: - check_cstatus(ConnectLibHdfs()) + check_status(ConnectLibHdfs()) return True except: return False @@ -304,7 +303,7 @@ cdef class HdfsClient: def close(self): self._ensure_client() with nogil: - check_cstatus(self.client.get().Disconnect()) + check_status(self.client.get().Disconnect()) self.is_open = False cdef _ensure_client(self): @@ -341,8 +340,7 @@ cdef class HdfsClient: conf.user = tobytes(user) with nogil: - check_cstatus( - CHdfsClient.Connect(&conf, &out.client)) + check_status(CHdfsClient.Connect(&conf, 
&out.client)) out.is_open = True return out @@ -383,8 +381,8 @@ cdef class HdfsClient: self._ensure_client() with nogil: - check_cstatus(self.client.get() - .ListDirectory(c_path, &listing)) + check_status(self.client.get() + .ListDirectory(c_path, &listing)) cdef const HdfsPathInfo* info for i in range( listing.size()): @@ -422,8 +420,8 @@ cdef class HdfsClient: cdef c_string c_path = tobytes(path) with nogil: - check_cstatus(self.client.get() - .CreateDirectory(c_path)) + check_status(self.client.get() + .CreateDirectory(c_path)) def delete(self, path, bint recursive=False): """ @@ -439,8 +437,8 @@ cdef class HdfsClient: cdef c_string c_path = tobytes(path) with nogil: - check_cstatus(self.client.get() - .Delete(c_path, recursive)) + check_status(self.client.get() + .Delete(c_path, recursive)) def open(self, path, mode='rb', buffer_size=None, replication=None, default_block_size=None): @@ -473,7 +471,7 @@ cdef class HdfsClient: append = True with nogil: - check_cstatus( + check_status( self.client.get() .OpenWriteable(c_path, append, c_buffer_size, c_replication, c_default_block_size, @@ -484,8 +482,8 @@ cdef class HdfsClient: out.is_readonly = False else: with nogil: - check_cstatus(self.client.get() - .OpenReadable(c_path, &rd_handle)) + check_status(self.client.get() + .OpenReadable(c_path, &rd_handle)) out.rd_file = rd_handle out.is_readonly = True @@ -579,9 +577,9 @@ cdef class HdfsFile(NativeFile): try: with nogil: while total_bytes < nbytes: - check_cstatus(self.rd_file.get() - .Read(rpc_chunksize, &bytes_read, - buf + total_bytes)) + check_status(self.rd_file.get() + .Read(rpc_chunksize, &bytes_read, + buf + total_bytes)) total_bytes += bytes_read @@ -647,8 +645,8 @@ cdef class HdfsFile(NativeFile): try: while True: with nogil: - check_cstatus(self.rd_file.get() - .Read(self.buffer_size, &bytes_read, buf)) + check_status(self.rd_file.get() + .Read(self.buffer_size, &bytes_read, buf)) total_bytes += bytes_read diff --git a/python/pyarrow/ipc.pyx b/python/pyarrow/ipc.pyx index f8da3a70da8..46deb5ad0c3 100644 --- a/python/pyarrow/ipc.pyx +++ b/python/pyarrow/ipc.pyx @@ -26,7 +26,7 @@ from pyarrow.includes.libarrow_io cimport * from pyarrow.includes.libarrow_ipc cimport * cimport pyarrow.includes.pyarrow as pyarrow -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status from pyarrow.io cimport NativeFile from pyarrow.schema cimport Schema from pyarrow.table cimport RecordBatch @@ -89,8 +89,8 @@ cdef class ArrowFileWriter: get_writer(sink, &self.sink) with nogil: - check_cstatus(CFileWriter.Open(self.sink.get(), schema.sp_schema, - &self.writer)) + check_status(CFileWriter.Open(self.sink.get(), schema.sp_schema, + &self.writer)) self.closed = False @@ -101,12 +101,12 @@ cdef class ArrowFileWriter: def write_record_batch(self, RecordBatch batch): cdef CRecordBatch* bptr = batch.batch with nogil: - check_cstatus(self.writer.get() - .WriteRecordBatch(bptr.columns(), bptr.num_rows())) + check_status(self.writer.get() + .WriteRecordBatch(bptr.columns(), bptr.num_rows())) def close(self): with nogil: - check_cstatus(self.writer.get().Close()) + check_status(self.writer.get().Close()) self.closed = True @@ -124,9 +124,9 @@ cdef class ArrowFileReader: with nogil: if offset != 0: - check_cstatus(CFileReader.Open2(reader, offset, &self.reader)) + check_status(CFileReader.Open2(reader, offset, &self.reader)) else: - check_cstatus(CFileReader.Open(reader, &self.reader)) + check_status(CFileReader.Open(reader, &self.reader)) property num_dictionaries: @@ -147,7 +147,7 @@ 
cdef class ArrowFileReader: raise ValueError('Batch number {0} out of range'.format(i)) with nogil: - check_cstatus(self.reader.get().GetRecordBatch(i, &batch)) + check_status(self.reader.get().GetRecordBatch(i, &batch)) result = RecordBatch() result.init(batch) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 2abe57b33ed..019dd2c1de4 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -26,7 +26,7 @@ cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.compat import tobytes from pyarrow.error import ArrowException -from pyarrow.error cimport check_cstatus +from pyarrow.error cimport check_status from pyarrow.io import NativeFile from pyarrow.table cimport Table @@ -62,7 +62,7 @@ cdef class ParquetReader: cdef shared_ptr[ReadableFileInterface] cpp_handle file.read_handle(&cpp_handle) - check_cstatus(OpenFile(cpp_handle, &self.allocator, &self.reader)) + check_status(OpenFile(cpp_handle, &self.allocator, &self.reader)) def read_all(self): cdef: @@ -70,8 +70,8 @@ cdef class ParquetReader: shared_ptr[CTable] ctable with nogil: - check_cstatus(self.reader.get() - .ReadFlatTable(&ctable)) + check_status(self.reader.get() + .ReadFlatTable(&ctable)) table.init(ctable) return table @@ -80,7 +80,7 @@ cdef class ParquetReader: def read_table(source, columns=None): """ Read a Table from Parquet format - + Returns ------- pyarrow.table.Table @@ -176,5 +176,5 @@ def write_table(table, filename, chunk_size=None, version=None, sink.reset(new LocalFileOutputStream(tobytes(filename))) with nogil: - check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, - chunk_size_, properties_builder.build())) + check_status(WriteFlatTable(ctable_, default_memory_pool(), sink, + chunk_size_, properties_builder.build())) diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index 680f3a539b5..c034fbd9777 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -20,13 +20,14 @@ #include "pyarrow/adapters/builtin.h" -#include +#include "arrow/api.h" +#include "arrow/util/status.h" #include "pyarrow/helpers.h" -#include "pyarrow/status.h" using arrow::ArrayBuilder; using arrow::DataType; +using arrow::Status; using arrow::Type; namespace pyarrow { @@ -129,7 +130,7 @@ class SeqVisitor { PyObject* item = item_ref.obj(); if (PyList_Check(item)) { - PY_RETURN_NOT_OK(Visit(item, level + 1)); + RETURN_NOT_OK(Visit(item, level + 1)); } else if (PyDict_Check(item)) { return Status::NotImplemented("No type inference for dicts"); } else { @@ -164,9 +165,9 @@ class SeqVisitor { Status Validate() const { if (scalars_.total_count() > 0) { if (num_nesting_levels() > 1) { - return Status::ValueError("Mixed nesting levels not supported"); + return Status::Invalid("Mixed nesting levels not supported"); } else if (max_observed_level() < max_nesting_level_) { - return Status::ValueError("Mixed nesting levels not supported"); + return Status::Invalid("Mixed nesting levels not supported"); } } return Status::OK(); @@ -216,8 +217,8 @@ static Status InferArrowType(PyObject* obj, int64_t* size, } SeqVisitor seq_visitor; - PY_RETURN_NOT_OK(seq_visitor.Visit(obj)); - PY_RETURN_NOT_OK(seq_visitor.Validate()); + RETURN_NOT_OK(seq_visitor.Visit(obj)); + RETURN_NOT_OK(seq_visitor.Validate()); *out_type = seq_visitor.GetType(); @@ -259,7 +260,7 @@ class BoolConverter : public TypedConverter { public: Status AppendData(PyObject* seq) override { Py_ssize_t size = PySequence_Size(seq); - 
RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -281,7 +282,7 @@ class Int64Converter : public TypedConverter { Status AppendData(PyObject* seq) override { int64_t val; Py_ssize_t size = PySequence_Size(seq); - RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -301,7 +302,7 @@ class DoubleConverter : public TypedConverter { Status AppendData(PyObject* seq) override { double val; Py_ssize_t size = PySequence_Size(seq); - RETURN_ARROW_NOT_OK(typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { @@ -330,7 +331,7 @@ class StringConverter : public TypedConverter { OwnedRef holder(item); if (item == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + RETURN_NOT_OK(typed_builder_->AppendNull()); continue; } else if (PyUnicode_Check(item)) { tmp.reset(PyUnicode_AsUTF8String(item)); @@ -344,7 +345,7 @@ class StringConverter : public TypedConverter { // No error checking length = PyBytes_GET_SIZE(bytes_obj); bytes = PyBytes_AS_STRING(bytes_obj); - RETURN_ARROW_NOT_OK(typed_builder_->Append(bytes, length)); + RETURN_NOT_OK(typed_builder_->Append(bytes, length)); } return Status::OK(); } @@ -359,10 +360,10 @@ class ListConverter : public TypedConverter { for (int64_t i = 0; i < size; ++i) { OwnedRef item(PySequence_GetItem(seq, i)); if (item.obj() == Py_None) { - RETURN_ARROW_NOT_OK(typed_builder_->AppendNull()); + RETURN_NOT_OK(typed_builder_->AppendNull()); } else { typed_builder_->Append(); - PY_RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + RETURN_NOT_OK(value_converter_->AppendData(item.obj())); } } return Status::OK(); @@ -408,7 +409,7 @@ Status ListConverter::Init(const std::shared_ptr& builder) { Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { std::shared_ptr type; int64_t size; - PY_RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + RETURN_NOT_OK(InferArrowType(obj, &size, &type)); // Handle NA / NullType case if (type->type == Type::NA) { @@ -426,14 +427,12 @@ Status ConvertPySequence(PyObject* obj, std::shared_ptr* out) { // Give the sequence converter an array builder std::shared_ptr builder; - RETURN_ARROW_NOT_OK(arrow::MakeBuilder(get_memory_pool(), type, &builder)); + RETURN_NOT_OK(arrow::MakeBuilder(get_memory_pool(), type, &builder)); converter->Init(builder); - PY_RETURN_NOT_OK(converter->AppendData(obj)); + RETURN_NOT_OK(converter->AppendData(obj)); - *out = builder->Finish(); - - return Status::OK(); + return builder->Finish(out); } } // namespace pyarrow diff --git a/python/src/pyarrow/adapters/builtin.h b/python/src/pyarrow/adapters/builtin.h index 4e997e31dd6..2ddfdaaf441 100644 --- a/python/src/pyarrow/adapters/builtin.h +++ b/python/src/pyarrow/adapters/builtin.h @@ -30,14 +30,15 @@ #include "pyarrow/common.h" #include "pyarrow/visibility.h" -namespace arrow { class Array; } +namespace arrow { +class Array; +class Status; +} namespace pyarrow { -class Status; - PYARROW_EXPORT -Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); +arrow::Status ConvertPySequence(PyObject* obj, std::shared_ptr* out); } // namespace pyarrow diff --git a/python/src/pyarrow/adapters/pandas.cc 
b/python/src/pyarrow/adapters/pandas.cc index b2fcd37aec9..5902b834169 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -31,10 +31,10 @@ #include "arrow/api.h" #include "arrow/util/bit-util.h" +#include "arrow/util/status.h" #include "pyarrow/common.h" #include "pyarrow/config.h" -#include "pyarrow/status.h" namespace pyarrow { @@ -42,6 +42,8 @@ using arrow::Array; using arrow::Column; using arrow::Field; using arrow::DataType; +using arrow::Status; + namespace util = arrow::util; // ---------------------------------------------------------------------- @@ -149,7 +151,7 @@ class ArrowSerializer { int null_bytes = util::bytes_for_bits(length_); null_bitmap_ = std::make_shared(pool_); - RETURN_ARROW_NOT_OK(null_bitmap_->Resize(null_bytes)); + RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); null_bitmap_data_ = null_bitmap_->mutable_data(); memset(null_bitmap_data_, 0, null_bytes); @@ -171,9 +173,9 @@ class ArrowSerializer { PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); arrow::TypePtr string_type(new arrow::StringType()); arrow::StringBuilder string_builder(pool_, string_type); - RETURN_ARROW_NOT_OK(string_builder.Resize(length_)); + RETURN_NOT_OK(string_builder.Resize(length_)); - arrow::Status s; + Status s; PyObject* obj; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; @@ -187,18 +189,16 @@ class ArrowSerializer { s = string_builder.Append(PyBytes_AS_STRING(obj), length); Py_DECREF(obj); if (!s.ok()) { - return Status::ArrowError(s.ToString()); + return s; } } else if (PyBytes_Check(obj)) { const int32_t length = PyBytes_GET_SIZE(obj); - RETURN_ARROW_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); + RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); } else { string_builder.AppendNull(); } } - *out = std::shared_ptr(string_builder.Finish()); - - return Status::OK(); + return string_builder.Finish(out); } Status ConvertBooleans(std::shared_ptr* out) { @@ -208,7 +208,7 @@ class ArrowSerializer { int nbytes = util::bytes_for_bits(length_); auto data = std::make_shared(pool_); - RETURN_ARROW_NOT_OK(data->Resize(nbytes)); + RETURN_NOT_OK(data->Resize(nbytes)); uint8_t* bitmap = data->mutable_data(); memset(bitmap, 0, nbytes); @@ -305,7 +305,7 @@ inline Status ArrowSerializer::MakeDataType(std::shared_ptrreset(new arrow::TimestampType(unit)); @@ -330,7 +330,7 @@ inline Status ArrowSerializer::Convert(std::shared_ptr* out) { RETURN_NOT_OK(ConvertData()); std::shared_ptr type; RETURN_NOT_OK(MakeDataType(&type)); - RETURN_ARROW_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out)); + RETURN_NOT_OK(MakePrimitiveArray(type, length_, data_, null_count, null_bitmap_, out)); return Status::OK(); } @@ -389,7 +389,7 @@ template inline Status ArrowSerializer::ConvertData() { // TODO(wesm): strided arrays if (is_strided()) { - return Status::ValueError("no support for strided data yet"); + return Status::Invalid("no support for strided data yet"); } data_ = std::make_shared(arr_); @@ -399,12 +399,12 @@ inline Status ArrowSerializer::ConvertData() { template <> inline Status ArrowSerializer::ConvertData() { if (is_strided()) { - return Status::ValueError("no support for strided data yet"); + return Status::Invalid("no support for strided data yet"); } int nbytes = util::bytes_for_bits(length_); auto buffer = std::make_shared(pool_); - RETURN_ARROW_NOT_OK(buffer->Resize(nbytes)); + RETURN_NOT_OK(buffer->Resize(nbytes)); const uint8_t* values = reinterpret_cast(PyArray_DATA(arr_)); 
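The serializer hunks above size null bitmaps with bytes_for_bits and mark valid slots one bit at a time. A standalone sketch of that LSB-first bit packing; BytesForBits and SetBit are local stand-ins for the helpers in arrow/util/bit-util.h:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// One bit per slot, 1 = valid, least-significant bit first within a byte.
static int64_t BytesForBits(int64_t bits) { return (bits + 7) / 8; }
static void SetBit(uint8_t* bitmap, int64_t i) { bitmap[i / 8] |= 1 << (i % 8); }

int main() {
  const std::vector<bool> is_valid = {true, true, false, true, false};
  std::vector<uint8_t> bitmap(BytesForBits(is_valid.size()));
  std::memset(bitmap.data(), 0, bitmap.size());  // start all-null, as above

  int64_t null_count = 0;
  for (size_t i = 0; i < is_valid.size(); ++i) {
    if (is_valid[i]) {
      SetBit(bitmap.data(), i);
    } else {
      ++null_count;
    }
  }
  // Bits 0, 1, and 3 set -> 0b00001011 == 0x0B, with two nulls counted.
  assert(bitmap[0] == 0x0B && null_count == 2);
  return 0;
}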
@@ -446,7 +446,7 @@ Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, } if (PyArray_NDIM(arr) != 1) { - return Status::ValueError("only handle 1-dimensional arrays"); + return Status::Invalid("only handle 1-dimensional arrays"); } switch(PyArray_DESCR(arr)->type_num) { diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index 141d1219e64..532495dd792 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -32,27 +32,26 @@ namespace arrow { class Array; class Column; class MemoryPool; +class Status; } // namespace arrow namespace pyarrow { -class Status; - PYARROW_EXPORT -Status ConvertArrayToPandas(const std::shared_ptr& arr, PyObject* py_ref, - PyObject** out); +arrow::Status ConvertArrayToPandas(const std::shared_ptr& arr, + PyObject* py_ref, PyObject** out); PYARROW_EXPORT -Status ConvertColumnToPandas(const std::shared_ptr& col, PyObject* py_ref, - PyObject** out); +arrow::Status ConvertColumnToPandas(const std::shared_ptr& col, + PyObject* py_ref, PyObject** out); PYARROW_EXPORT -Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, +arrow::Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out); PYARROW_EXPORT -Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, +arrow::Status PandasToArrow(arrow::MemoryPool* pool, PyObject* ao, std::shared_ptr* out); } // namespace pyarrow diff --git a/python/src/pyarrow/api.h b/python/src/pyarrow/api.h index 72be6afe02c..6dbbc45d40c 100644 --- a/python/src/pyarrow/api.h +++ b/python/src/pyarrow/api.h @@ -18,8 +18,6 @@ #ifndef PYARROW_API_H #define PYARROW_API_H -#include "pyarrow/status.h" - #include "pyarrow/helpers.h" #include "pyarrow/adapters/builtin.h" diff --git a/python/src/pyarrow/common.cc b/python/src/pyarrow/common.cc index 09f3efb5a03..fa875f2b9ab 100644 --- a/python/src/pyarrow/common.cc +++ b/python/src/pyarrow/common.cc @@ -21,10 +21,10 @@ #include #include -#include -#include +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" -#include "pyarrow/status.h" +using arrow::Status; namespace pyarrow { @@ -33,18 +33,18 @@ class PyArrowMemoryPool : public arrow::MemoryPool { PyArrowMemoryPool() : bytes_allocated_(0) {} virtual ~PyArrowMemoryPool() {} - arrow::Status Allocate(int64_t size, uint8_t** out) override { + Status Allocate(int64_t size, uint8_t** out) override { std::lock_guard guard(pool_lock_); *out = static_cast(std::malloc(size)); if (*out == nullptr) { std::stringstream ss; ss << "malloc of size " << size << " failed"; - return arrow::Status::OutOfMemory(ss.str()); + return Status::OutOfMemory(ss.str()); } bytes_allocated_ += size; - return arrow::Status::OK(); + return Status::OK(); } int64_t bytes_allocated() const override { diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index 50c2577b93c..7f3131ef03d 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -29,13 +29,6 @@ namespace pyarrow { #define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2 -#define RETURN_ARROW_NOT_OK(s) do { \ - arrow::Status _s = (s); \ - if (!_s.ok()) { \ - return Status::ArrowError(s.ToString()); \ - } \ - } while (0); - class OwnedRef { public: OwnedRef() : obj_(nullptr) {} diff --git a/python/src/pyarrow/io.cc b/python/src/pyarrow/io.cc index 7bf32ffa8d2..e6dbc12d429 100644 --- a/python/src/pyarrow/io.cc +++ b/python/src/pyarrow/io.cc @@ -20,12 +20,13 @@ #include #include -#include -#include -#include 
+#include "arrow/io/memory.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" #include "pyarrow/common.h" -#include "pyarrow/status.h" + +using arrow::Status; namespace pyarrow { @@ -41,7 +42,7 @@ PythonFile::~PythonFile() { Py_DECREF(file_); } -static arrow::Status CheckPyError() { +static Status CheckPyError() { if (PyErr_Occurred()) { PyObject *exc_type, *exc_value, *traceback; PyErr_Fetch(&exc_type, &exc_value, &traceback); @@ -51,35 +52,35 @@ static arrow::Status CheckPyError() { Py_XDECREF(exc_value); Py_XDECREF(traceback); PyErr_Clear(); - return arrow::Status::IOError(message); + return Status::IOError(message); } - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PythonFile::Close() { +Status PythonFile::Close() { // whence: 0 for relative to start of file, 2 for end of file PyObject* result = PyObject_CallMethod(file_, "close", "()"); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PythonFile::Seek(int64_t position, int whence) { +Status PythonFile::Seek(int64_t position, int whence) { // whence: 0 for relative to start of file, 2 for end of file PyObject* result = PyObject_CallMethod(file_, "seek", "(ii)", position, whence); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PythonFile::Read(int64_t nbytes, PyObject** out) { +Status PythonFile::Read(int64_t nbytes, PyObject** out) { PyObject* result = PyObject_CallMethod(file_, "read", "(i)", nbytes); ARROW_RETURN_NOT_OK(CheckPyError()); *out = result; - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { +Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { PyObject* py_data = PyBytes_FromStringAndSize( reinterpret_cast(data), nbytes); ARROW_RETURN_NOT_OK(CheckPyError()); @@ -88,10 +89,10 @@ arrow::Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { Py_XDECREF(py_data); Py_XDECREF(result); ARROW_RETURN_NOT_OK(CheckPyError()); - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PythonFile::Tell(int64_t* position) { +Status PythonFile::Tell(int64_t* position) { PyObject* result = PyObject_CallMethod(file_, "tell", "()"); ARROW_RETURN_NOT_OK(CheckPyError()); @@ -101,7 +102,7 @@ arrow::Status PythonFile::Tell(int64_t* position) { // PyLong_AsLongLong can raise OverflowError ARROW_RETURN_NOT_OK(CheckPyError()); - return arrow::Status::OK(); + return Status::OK(); } // ---------------------------------------------------------------------- @@ -113,22 +114,22 @@ PyReadableFile::PyReadableFile(PyObject* file) { PyReadableFile::~PyReadableFile() {} -arrow::Status PyReadableFile::Close() { +Status PyReadableFile::Close() { PyGILGuard lock; return file_->Close(); } -arrow::Status PyReadableFile::Seek(int64_t position) { +Status PyReadableFile::Seek(int64_t position) { PyGILGuard lock; return file_->Seek(position, 0); } -arrow::Status PyReadableFile::Tell(int64_t* position) { +Status PyReadableFile::Tell(int64_t* position) { PyGILGuard lock; return file_->Tell(position); } -arrow::Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { +Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { PyGILGuard lock; PyObject* bytes_obj; ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); @@ -137,10 +138,10 @@ arrow::Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* 
std::memcpy(out, PyBytes_AS_STRING(bytes_obj), *bytes_read); Py_DECREF(bytes_obj); - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { +Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr* out) { PyGILGuard lock; PyObject* bytes_obj; @@ -149,10 +150,10 @@ arrow::Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr(bytes_obj); Py_DECREF(bytes_obj); - return arrow::Status::OK(); + return Status::OK(); } -arrow::Status PyReadableFile::GetSize(int64_t* size) { +Status PyReadableFile::GetSize(int64_t* size) { PyGILGuard lock; int64_t current_position;; @@ -167,7 +168,7 @@ arrow::Status PyReadableFile::GetSize(int64_t* size) { ARROW_RETURN_NOT_OK(file_->Seek(current_position, 0)); *size = file_size; - return arrow::Status::OK(); + return Status::OK(); } bool PyReadableFile::supports_zero_copy() const { @@ -183,17 +184,17 @@ PyOutputStream::PyOutputStream(PyObject* file) { PyOutputStream::~PyOutputStream() {} -arrow::Status PyOutputStream::Close() { +Status PyOutputStream::Close() { PyGILGuard lock; return file_->Close(); } -arrow::Status PyOutputStream::Tell(int64_t* position) { +Status PyOutputStream::Tell(int64_t* position) { PyGILGuard lock; return file_->Tell(position); } -arrow::Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { +Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { PyGILGuard lock; return file_->Write(data, nbytes); } diff --git a/python/src/pyarrow/status.cc b/python/src/pyarrow/status.cc deleted file mode 100644 index 1cd54f6a785..00000000000 --- a/python/src/pyarrow/status.cc +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Status encapsulates the result of an operation. It may indicate success, -// or it may indicate an error with an associated error message. -// -// Multiple threads can invoke const methods on a Status without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same Status must use -// external synchronization. 
- -#include "pyarrow/status.h" - -#include -#include -#include - -namespace pyarrow { - -Status::Status(StatusCode code, const std::string& msg, int16_t posix_code) { - assert(code != StatusCode::OK); - const uint32_t size = msg.size(); - char* result = new char[size + 7]; - memcpy(result, &size, sizeof(size)); - result[4] = static_cast(code); - memcpy(result + 5, &posix_code, sizeof(posix_code)); - memcpy(result + 7, msg.c_str(), msg.size()); - state_ = result; -} - -const char* Status::CopyState(const char* state) { - uint32_t size; - memcpy(&size, state, sizeof(size)); - char* result = new char[size + 7]; - memcpy(result, state, size + 7); - return result; -} - -std::string Status::CodeAsString() const { - if (state_ == NULL) { - return "OK"; - } - - const char* type; - switch (code()) { - case StatusCode::OK: - type = "OK"; - break; - case StatusCode::OutOfMemory: - type = "Out of memory"; - break; - case StatusCode::KeyError: - type = "Key error"; - break; - case StatusCode::TypeError: - type = "Value error"; - break; - case StatusCode::ValueError: - type = "Value error"; - break; - case StatusCode::IOError: - type = "IO error"; - break; - case StatusCode::NotImplemented: - type = "Not implemented"; - break; - case StatusCode::ArrowError: - type = "Arrow C++ error"; - break; - case StatusCode::UnknownError: - type = "Unknown error"; - break; - } - return std::string(type); -} - -std::string Status::ToString() const { - std::string result(CodeAsString()); - if (state_ == NULL) { - return result; - } - - result.append(": "); - - uint32_t length; - memcpy(&length, state_, sizeof(length)); - result.append(reinterpret_cast(state_ + 7), length); - return result; -} - -} // namespace pyarrow diff --git a/python/src/pyarrow/status.h b/python/src/pyarrow/status.h deleted file mode 100644 index 67cd66c58ee..00000000000 --- a/python/src/pyarrow/status.h +++ /dev/null @@ -1,146 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// -// A Status encapsulates the result of an operation. It may indicate success, -// or it may indicate an error with an associated error message. -// -// Multiple threads can invoke const methods on a Status without -// external synchronization, but if any of the threads may call a -// non-const method, all threads accessing the same Status must use -// external synchronization. - -#ifndef PYARROW_STATUS_H_ -#define PYARROW_STATUS_H_ - -#include -#include -#include - -#include "pyarrow/visibility.h" - -namespace pyarrow { - -#define PY_RETURN_NOT_OK(s) do { \ - Status _s = (s); \ - if (!_s.ok()) return _s; \ - } while (0); - -enum class StatusCode: char { - OK = 0, - OutOfMemory = 1, - KeyError = 2, - TypeError = 3, - ValueError = 4, - IOError = 5, - NotImplemented = 6, - - ArrowError = 7, - - UnknownError = 10 -}; - -class PYARROW_EXPORT Status { - public: - // Create a success status. - Status() : state_(NULL) { } - ~Status() { delete[] state_; } - - // Copy the specified status. - Status(const Status& s); - void operator=(const Status& s); - - // Return a success status. - static Status OK() { return Status(); } - - // Return error status of an appropriate type. 
- static Status OutOfMemory(const std::string& msg, int16_t posix_code = -1) { - return Status(StatusCode::OutOfMemory, msg, posix_code); - } - - static Status KeyError(const std::string& msg) { - return Status(StatusCode::KeyError, msg, -1); - } - - static Status TypeError(const std::string& msg) { - return Status(StatusCode::TypeError, msg, -1); - } - - static Status IOError(const std::string& msg) { - return Status(StatusCode::IOError, msg, -1); - } - - static Status ValueError(const std::string& msg) { - return Status(StatusCode::ValueError, msg, -1); - } - - static Status NotImplemented(const std::string& msg) { - return Status(StatusCode::NotImplemented, msg, -1); - } - - static Status UnknownError(const std::string& msg) { - return Status(StatusCode::UnknownError, msg, -1); - } - - static Status ArrowError(const std::string& msg) { - return Status(StatusCode::ArrowError, msg, -1); - } - - // Returns true iff the status indicates success. - bool ok() const { return (state_ == NULL); } - - bool IsOutOfMemory() const { return code() == StatusCode::OutOfMemory; } - bool IsKeyError() const { return code() == StatusCode::KeyError; } - bool IsIOError() const { return code() == StatusCode::IOError; } - bool IsTypeError() const { return code() == StatusCode::TypeError; } - bool IsValueError() const { return code() == StatusCode::ValueError; } - - bool IsUnknownError() const { return code() == StatusCode::UnknownError; } - - bool IsArrowError() const { return code() == StatusCode::ArrowError; } - - // Return a string representation of this status suitable for printing. - // Returns the string "OK" for success. - std::string ToString() const; - - // Return a string representation of the status code, without the message - // text or posix code information. - std::string CodeAsString() const; - - // Get the POSIX code associated with this Status, or -1 if there is none. - int16_t posix_code() const; - - private: - // OK status has a NULL state_. Otherwise, state_ is a new[] array - // of the following form: - // state_[0..3] == length of message - // state_[4] == code - // state_[5..6] == posix_code - // state_[7..] == message - const char* state_; - - StatusCode code() const { - return ((state_ == NULL) ? - StatusCode::OK : static_cast(state_[4])); - } - - Status(StatusCode code, const std::string& msg, int16_t posix_code); - static const char* CopyState(const char* s); -}; - -inline Status::Status(const Status& s) { - state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); -} - -inline void Status::operator=(const Status& s) { - // The following condition catches both aliasing (when this == &s), - // and the common case where both s and *this are ok. - if (state_ != s.state_) { - delete[] state_; - state_ = (s.state_ == NULL) ? NULL : CopyState(s.state_); - } -} - -} // namespace pyarrow - -#endif // PYARROW_STATUS_H_ From 676c32ccea6274c75b2750453c1ddbc5f645c037 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Oct 2016 21:18:30 -0400 Subject: [PATCH 176/210] ARROW-317: Add Slice, Copy methods to Buffer There's also a little bit of naming cleanup in `bit-util.h`, pardon the diff noise. 
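Per the commit message, Buffer gains Slice and Copy methods. The buffer.h/buffer.cc hunks are not reproduced above, so the following is only a guess at the shape of a zero-copy slice: the child aliases the parent's memory and retains a shared_ptr so the backing allocation outlives every slice. All names and signatures here are assumptions for illustration, not Arrow's actual definitions:

#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

class Buffer : public std::enable_shared_from_this<Buffer> {
 public:
  Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size) {}

  // Zero-copy: the slice points into this buffer's memory and keeps this
  // buffer alive through parent_; no bytes are copied.
  std::shared_ptr<Buffer> Slice(int64_t offset, int64_t length) {
    auto slice = std::make_shared<Buffer>(data_ + offset, length);
    slice->parent_ = shared_from_this();
    return slice;
  }

  const uint8_t* data() const { return data_; }
  int64_t size() const { return size_; }

 private:
  const uint8_t* data_;
  int64_t size_;
  std::shared_ptr<Buffer> parent_;  // null for a root buffer
};

int main() {
  auto bytes = std::make_shared<std::vector<uint8_t>>(
      std::vector<uint8_t>{1, 2, 3, 4, 5});
  auto buf = std::make_shared<Buffer>(bytes->data(), bytes->size());
  auto slice = buf->Slice(1, 3);  // views bytes {2, 3, 4}
  assert(slice->size() == 3 && slice->data()[0] == 2);
  return 0;
}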
Author: Wes McKinney Closes #177 from wesm/ARROW-317 and squashes the following commits: 0666b22 [Wes McKinney] Fix up pyarrow usage of BitUtil 3ab4e7a [Wes McKinney] Add Slice, Copy methods to Buffer cb9519d [Wes McKinney] Use more conforming names in bit-util.h --- cpp/src/arrow/array.cc | 3 +- cpp/src/arrow/array.h | 2 +- cpp/src/arrow/builder.cc | 12 ++++---- cpp/src/arrow/column-benchmark.cc | 2 +- cpp/src/arrow/ipc/adapter.cc | 5 ++-- cpp/src/arrow/ipc/test-common.h | 3 +- cpp/src/arrow/test-util.h | 6 ++-- cpp/src/arrow/types/list.cc | 2 +- cpp/src/arrow/types/primitive-test.cc | 16 +++++------ cpp/src/arrow/types/primitive.cc | 13 +++++---- cpp/src/arrow/types/primitive.h | 12 ++++---- cpp/src/arrow/util/bit-util-test.cc | 36 +++++++++++------------ cpp/src/arrow/util/bit-util.cc | 10 +++---- cpp/src/arrow/util/bit-util.h | 29 +++++++++---------- cpp/src/arrow/util/buffer-test.cc | 41 +++++++++++++++++++++++++++ cpp/src/arrow/util/buffer.cc | 28 +++++++++++++++++- cpp/src/arrow/util/buffer.h | 23 +++++++++++---- python/src/pyarrow/adapters/pandas.cc | 20 ++++++------- 18 files changed, 173 insertions(+), 90 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index d6b081f3155..e432a53781f 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -19,6 +19,7 @@ #include +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -43,7 +44,7 @@ bool Array::EqualsExact(const Array& other) const { return false; } if (null_count_ > 0) { - return null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + return null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); } return true; } diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index c7ffb23ca18..ff37323f605 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -45,7 +45,7 @@ class ARROW_EXPORT Array { // Determine if a slot is null. For inner loops. 
Does *not* boundscheck bool IsNull(int i) const { - return null_count_ > 0 && util::bit_not_set(null_bitmap_data_, i); + return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i); } int32_t length() const { return length_; } diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 1fba9616922..151b257a3d8 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -31,7 +31,7 @@ Status ArrayBuilder::AppendToBitmap(bool is_valid) { // TODO(emkornfield) doubling isn't great default allocation practice // see https://github.com/facebook/folly/blob/master/folly/docs/FBVector.md // fo discussion - RETURN_NOT_OK(Resize(util::next_power2(capacity_ + 1))); + RETURN_NOT_OK(Resize(BitUtil::NextPower2(capacity_ + 1))); } UnsafeAppendToBitmap(is_valid); return Status::OK(); @@ -45,7 +45,7 @@ Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int32_t length) } Status ArrayBuilder::Init(int32_t capacity) { - int32_t to_alloc = util::ceil_byte(capacity) / 8; + int32_t to_alloc = BitUtil::CeilByte(capacity) / 8; null_bitmap_ = std::make_shared(pool_); RETURN_NOT_OK(null_bitmap_->Resize(to_alloc)); // Buffers might allocate more then necessary to satisfy padding requirements @@ -58,7 +58,7 @@ Status ArrayBuilder::Init(int32_t capacity) { Status ArrayBuilder::Resize(int32_t new_bits) { if (!null_bitmap_) { return Init(new_bits); } - int32_t new_bytes = util::ceil_byte(new_bits) / 8; + int32_t new_bytes = BitUtil::CeilByte(new_bits) / 8; int32_t old_bytes = null_bitmap_->size(); RETURN_NOT_OK(null_bitmap_->Resize(new_bytes)); null_bitmap_data_ = null_bitmap_->mutable_data(); @@ -82,7 +82,7 @@ Status ArrayBuilder::Advance(int32_t elements) { Status ArrayBuilder::Reserve(int32_t elements) { if (length_ + elements > capacity_) { // TODO(emkornfield) power of 2 growth is potentially suboptimal - int32_t new_capacity = util::next_power2(length_ + elements); + int32_t new_capacity = BitUtil::NextPower2(length_ + elements); return Resize(new_capacity); } return Status::OK(); @@ -96,7 +96,7 @@ Status ArrayBuilder::SetNotNull(int32_t length) { void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) { if (is_valid) { - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); } else { ++null_count_; } @@ -118,7 +118,7 @@ void ArrayBuilder::UnsafeSetNotNull(int32_t length) { const int32_t new_length = length + length_; // TODO(emkornfield) Optimize for large values of length? 
for (int32_t i = length_; i < new_length; ++i) { - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(null_bitmap_data_, i); } length_ = new_length; } diff --git a/cpp/src/arrow/column-benchmark.cc b/cpp/src/arrow/column-benchmark.cc index edea0948860..f429a813c6f 100644 --- a/cpp/src/arrow/column-benchmark.cc +++ b/cpp/src/arrow/column-benchmark.cc @@ -29,7 +29,7 @@ std::shared_ptr MakePrimitive(int32_t length, int32_t null_count = 0) { auto data = std::make_shared(pool); auto null_bitmap = std::make_shared(pool); data->Resize(length * sizeof(typename ArrayType::value_type)); - null_bitmap->Resize(util::bytes_for_bits(length)); + null_bitmap->Resize(BitUtil::BytesForBits(length)); return std::make_shared(length, data, 10, null_bitmap); } } // anonymous namespace diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index f84cb264f70..74786bf85ff 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -37,6 +37,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" #include "arrow/util/status.h" @@ -49,7 +50,7 @@ namespace ipc { namespace { Status CheckMultipleOf64(int64_t size) { - if (util::is_multiple_of_64(size)) { return Status::OK(); } + if (BitUtil::IsMultipleOf64(size)) { return Status::OK(); } return Status::Invalid( "Attempted to write a buffer that " "wasn't a multiple of 64 bytes"); @@ -155,7 +156,7 @@ class RecordBatchWriter { // The buffer might be null if we are handling zero row lengths. if (buffer) { size = buffer->size(); - padding = util::RoundUpToMultipleOf64(size) - size; + padding = BitUtil::RoundUpToMultipleOf64(size) - size; } // TODO(wesm): We currently have no notion of shared memory page id's, diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 13bbbebde8a..784e238e977 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -31,6 +31,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" @@ -263,7 +264,7 @@ Status MakeStruct(std::shared_ptr* out) { std::vector null_bytes(list_batch->num_rows(), 1); null_bytes[0] = 0; std::shared_ptr null_bitmask; - RETURN_NOT_OK(util::bytes_to_bits(null_bytes, &null_bitmask)); + RETURN_NOT_OK(BitUtil::BytesToBits(null_bytes, &null_bitmask)); ArrayPtr with_nulls( new StructArray(type, list_batch->num_rows(), columns, 1, null_bitmask)); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index e632ffb1d89..ac56f5ed087 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -69,7 +69,7 @@ class TestBase : public ::testing::Test { auto data = std::make_shared(pool_); auto null_bitmap = std::make_shared(pool_); EXPECT_OK(data->Resize(length * sizeof(typename ArrayType::value_type))); - EXPECT_OK(null_bitmap->Resize(util::bytes_for_bits(length))); + EXPECT_OK(null_bitmap->Resize(BitUtil::BytesForBits(length))); return std::make_shared(length, data, 10, null_bitmap); } @@ -152,7 +152,7 @@ static inline int bitmap_popcount(const uint8_t* data, int length) { // versions of popcount but the code complexity is likely not worth it) const int loop_tail_index = fast_counts * pop_len; for (int i = loop_tail_index; i < length; ++i) { - if (util::get_bit(data, i)) { ++count; } + if (BitUtil::GetBit(data, i)) { 
++count; } } return count; @@ -170,7 +170,7 @@ std::shared_ptr bytes_to_null_buffer(const std::vector& bytes) std::shared_ptr out; // TODO(wesm): error checking - util::bytes_to_bits(bytes, &out); + BitUtil::BytesToBits(bytes, &out); return out; } diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index ef2ec22cb53..4b1e8214727 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -30,7 +30,7 @@ bool ListArray::EqualsExact(const ListArray& other) const { bool equal_null_bitmap = true; if (null_count_ > 0) { equal_null_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); } if (!equal_null_bitmap) { return false; } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 121bd4794f2..e47f6dc74fb 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -236,7 +236,7 @@ void TestPrimitiveBuilder::Check( for (int i = 0; i < result->length(); ++i) { if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; } - bool actual = util::get_bit(result->raw_data(), i); + bool actual = BitUtil::GetBit(result->raw_data(), i); ASSERT_EQ(static_cast(draws_[i]), actual) << i; } ASSERT_TRUE(result->EqualsExact(*expected.get())); @@ -258,8 +258,8 @@ TYPED_TEST(TestPrimitiveBuilder, TestInit) { int n = 1000; ASSERT_OK(this->builder_->Reserve(n)); - ASSERT_EQ(util::next_power2(n), this->builder_->capacity()); - ASSERT_EQ(util::next_power2(TypeTraits::bytes_required(n)), + ASSERT_EQ(BitUtil::NextPower2(n), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(TypeTraits::bytes_required(n)), this->builder_->data()->size()); // unsure if this should go in all builder classes @@ -409,10 +409,10 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) { } ASSERT_EQ(size, this->builder_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity()); ASSERT_EQ(size, this->builder_nn_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_nn_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_nn_->capacity()); this->Check(this->builder_, true); this->Check(this->builder_nn_, false); @@ -444,7 +444,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestAppendVector) { ASSERT_OK(this->builder_nn_->Append(draws.data() + K, size - K)); ASSERT_EQ(size, this->builder_->length()); - ASSERT_EQ(util::next_power2(size), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(size), this->builder_->capacity()); this->Check(this->builder_, true); this->Check(this->builder_nn_, false); @@ -472,7 +472,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestResize) { ASSERT_EQ(cap, this->builder_->capacity()); ASSERT_EQ(TypeTraits::bytes_required(cap), this->builder_->data()->size()); - ASSERT_EQ(util::bytes_for_bits(cap), this->builder_->null_bitmap()->size()); + ASSERT_EQ(BitUtil::BytesForBits(cap), this->builder_->null_bitmap()->size()); } TYPED_TEST(TestPrimitiveBuilder, TestReserve) { @@ -484,7 +484,7 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) { ASSERT_OK(this->builder_->Advance(100)); ASSERT_OK(this->builder_->Reserve(kMinBuilderCapacity)); - ASSERT_EQ(util::next_power2(kMinBuilderCapacity + 100), this->builder_->capacity()); + ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity()); } } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc 
b/cpp/src/arrow/types/primitive.cc index 3a05ccfdf18..d2288bafa71 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -19,6 +19,7 @@ #include +#include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" @@ -41,7 +42,7 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { if (null_count_ > 0) { bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::ceil_byte(length_) / 8); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8); if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; @@ -156,9 +157,9 @@ Status PrimitiveBuilder::Append( if ((valid_bytes != nullptr) && !valid_bytes[i]) continue; if (values[i] > 0) { - util::set_bit(raw_data_, length_ + i); + BitUtil::SetBit(raw_data_, length_ + i); } else { - util::clear_bit(raw_data_, length_ + i); + BitUtil::ClearBit(raw_data_, length_ + i); } } @@ -196,20 +197,20 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const { if (null_count_ > 0) { bool equal_bitmap = - null_bitmap_->Equals(*other.null_bitmap_, util::bytes_for_bits(length_)); + null_bitmap_->Equals(*other.null_bitmap_, BitUtil::BytesForBits(length_)); if (!equal_bitmap) { return false; } const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && util::get_bit(this_data, i) != util::get_bit(other_data, i)) { + if (!IsNull(i) && BitUtil::GetBit(this_data, i) != BitUtil::GetBit(other_data, i)) { return false; } } return true; } else { - return data_->Equals(*other.data_, util::bytes_for_bits(length_)); + return data_->Equals(*other.data_, BitUtil::BytesForBits(length_)); } } diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index f21470d96e4..c71df584ffe 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -173,7 +173,7 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { // Does not capacity-check; make sure to call Reserve beforehand void UnsafeAppend(value_type val) { - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); raw_data_[length_++] = val; } @@ -290,7 +290,7 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } - bool Value(int i) const { return util::get_bit(raw_data(), i); } + bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } }; template <> @@ -298,7 +298,7 @@ struct TypeTraits { typedef BooleanArray ArrayType; static inline int bytes_required(int elements) { - return util::bytes_for_bits(elements); + return BitUtil::BytesForBits(elements); } }; @@ -314,11 +314,11 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { // Scalar append Status Append(bool val) { Reserve(1); - util::set_bit(null_bitmap_data_, length_); + BitUtil::SetBit(null_bitmap_data_, length_); if (val) { - util::set_bit(raw_data_, length_); + BitUtil::SetBit(raw_data_, length_); } else { - util::clear_bit(raw_data_, length_); + BitUtil::ClearBit(raw_data_, length_); } ++length_; return Status::OK(); diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index e1d8a0808b4..cfdee04f6e2 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -22,33 +22,33 @@ namespace arrow { TEST(UtilTests, TestIsMultipleOf64) { - using util::is_multiple_of_64; - EXPECT_TRUE(is_multiple_of_64(64)); - 
EXPECT_TRUE(is_multiple_of_64(0)); - EXPECT_TRUE(is_multiple_of_64(128)); - EXPECT_TRUE(is_multiple_of_64(192)); - EXPECT_FALSE(is_multiple_of_64(23)); - EXPECT_FALSE(is_multiple_of_64(32)); + using BitUtil::IsMultipleOf64; + EXPECT_TRUE(IsMultipleOf64(64)); + EXPECT_TRUE(IsMultipleOf64(0)); + EXPECT_TRUE(IsMultipleOf64(128)); + EXPECT_TRUE(IsMultipleOf64(192)); + EXPECT_FALSE(IsMultipleOf64(23)); + EXPECT_FALSE(IsMultipleOf64(32)); } TEST(UtilTests, TestNextPower2) { - using util::next_power2; + using BitUtil::NextPower2; - ASSERT_EQ(8, next_power2(6)); - ASSERT_EQ(8, next_power2(8)); + ASSERT_EQ(8, NextPower2(6)); + ASSERT_EQ(8, NextPower2(8)); - ASSERT_EQ(1, next_power2(1)); - ASSERT_EQ(256, next_power2(131)); + ASSERT_EQ(1, NextPower2(1)); + ASSERT_EQ(256, NextPower2(131)); - ASSERT_EQ(1024, next_power2(1000)); + ASSERT_EQ(1024, NextPower2(1000)); - ASSERT_EQ(4096, next_power2(4000)); + ASSERT_EQ(4096, NextPower2(4000)); - ASSERT_EQ(65536, next_power2(64000)); + ASSERT_EQ(65536, NextPower2(64000)); - ASSERT_EQ(1LL << 32, next_power2((1LL << 32) - 1)); - ASSERT_EQ(1LL << 31, next_power2((1LL << 31) - 1)); - ASSERT_EQ(1LL << 62, next_power2((1LL << 62) - 1)); + ASSERT_EQ(1LL << 32, NextPower2((1LL << 32) - 1)); + ASSERT_EQ(1LL << 31, NextPower2((1LL << 31) - 1)); + ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index 475576e87ca..7e1cb186717 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -24,20 +24,20 @@ namespace arrow { -void util::bytes_to_bits(const std::vector& bytes, uint8_t* bits) { +void BitUtil::BytesToBits(const std::vector& bytes, uint8_t* bits) { for (size_t i = 0; i < bytes.size(); ++i) { - if (bytes[i] > 0) { set_bit(bits, i); } + if (bytes[i] > 0) { SetBit(bits, i); } } } -Status util::bytes_to_bits( +Status BitUtil::BytesToBits( const std::vector& bytes, std::shared_ptr* out) { - int bit_length = util::bytes_for_bits(bytes.size()); + int bit_length = BitUtil::BytesForBits(bytes.size()); auto buffer = std::make_shared(); RETURN_NOT_OK(buffer->Resize(bit_length)); memset(buffer->mutable_data(), 0, bit_length); - bytes_to_bits(bytes, buffer->mutable_data()); + BytesToBits(bytes, buffer->mutable_data()); *out = buffer; return Status::OK(); diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index c33ef272f05..13b7e19593d 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -30,39 +30,39 @@ namespace arrow { class Buffer; class Status; -namespace util { +namespace BitUtil { -static inline int64_t ceil_byte(int64_t size) { +static inline int64_t CeilByte(int64_t size) { return (size + 7) & ~7; } -static inline int64_t bytes_for_bits(int64_t size) { - return ceil_byte(size) / 8; +static inline int64_t BytesForBits(int64_t size) { + return CeilByte(size) / 8; } -static inline int64_t ceil_2bytes(int64_t size) { +static inline int64_t Ceil2Bytes(int64_t size) { return (size + 15) & ~15; } static constexpr uint8_t kBitmask[] = {1, 2, 4, 8, 16, 32, 64, 128}; -static inline bool get_bit(const uint8_t* bits, int i) { +static inline bool GetBit(const uint8_t* bits, int i) { return static_cast(bits[i / 8] & kBitmask[i % 8]); } -static inline bool bit_not_set(const uint8_t* bits, int i) { +static inline bool BitNotSet(const uint8_t* bits, int i) { return (bits[i / 8] & kBitmask[i % 8]) == 0; } -static inline void clear_bit(uint8_t* bits, int i) { +static inline void ClearBit(uint8_t* bits, int i) { 
bits[i / 8] &= ~kBitmask[i % 8]; } -static inline void set_bit(uint8_t* bits, int i) { +static inline void SetBit(uint8_t* bits, int i) { bits[i / 8] |= kBitmask[i % 8]; } -static inline int64_t next_power2(int64_t n) { +static inline int64_t NextPower2(int64_t n) { n--; n |= n >> 1; n |= n >> 2; @@ -74,7 +74,7 @@ static inline int64_t next_power2(int64_t n) { return n; } -static inline bool is_multiple_of_64(int64_t n) { +static inline bool IsMultipleOf64(int64_t n) { return (n & 63) == 0; } @@ -90,11 +90,10 @@ inline int64_t RoundUpToMultipleOf64(int64_t num) { return num; } -void bytes_to_bits(const std::vector& bytes, uint8_t* bits); -ARROW_EXPORT Status bytes_to_bits(const std::vector&, std::shared_ptr*); - -} // namespace util +void BytesToBits(const std::vector& bytes, uint8_t* bits); +ARROW_EXPORT Status BytesToBits(const std::vector&, std::shared_ptr*); +} // namespace BitUtil } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/cpp/src/arrow/util/buffer-test.cc b/cpp/src/arrow/util/buffer-test.cc index cc4ec98e4fb..095b07b7ab3 100644 --- a/cpp/src/arrow/util/buffer-test.cc +++ b/cpp/src/arrow/util/buffer-test.cc @@ -31,6 +31,18 @@ namespace arrow { class TestBuffer : public ::testing::Test {}; +TEST_F(TestBuffer, IsMutableFlag) { + Buffer buf(nullptr, 0); + + ASSERT_FALSE(buf.is_mutable()); + + MutableBuffer mbuf(nullptr, 0); + ASSERT_TRUE(mbuf.is_mutable()); + + PoolBuffer pbuf; + ASSERT_TRUE(pbuf.is_mutable()); +} + TEST_F(TestBuffer, Resize) { PoolBuffer buf; @@ -96,4 +108,33 @@ TEST_F(TestBuffer, EqualsWithSameBuffer) { pool->Free(rawBuffer, bufferSize); } +TEST_F(TestBuffer, Copy) { + std::string data_str = "some data to copy"; + + auto data = reinterpret_cast(data_str.c_str()); + + Buffer buf(data, data_str.size()); + + std::shared_ptr out; + + ASSERT_OK(buf.Copy(5, 4, &out)); + + Buffer expected(data + 5, 4); + ASSERT_TRUE(out->Equals(expected)); +} + +TEST_F(TestBuffer, SliceBuffer) { + std::string data_str = "some data to slice"; + + auto data = reinterpret_cast(data_str.c_str()); + + auto buf = std::make_shared(data, data_str.size()); + + std::shared_ptr out = SliceBuffer(buf, 5, 4); + Buffer expected(data + 5, 4); + ASSERT_TRUE(out->Equals(expected)); + + ASSERT_EQ(2, buf.use_count()); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/buffer.cc b/cpp/src/arrow/util/buffer.cc index 6faa048e4e5..a230259e593 100644 --- a/cpp/src/arrow/util/buffer.cc +++ b/cpp/src/arrow/util/buffer.cc @@ -36,6 +36,32 @@ Buffer::Buffer(const std::shared_ptr& parent, int64_t offset, int64_t si Buffer::~Buffer() {} +Status Buffer::Copy( + int64_t start, int64_t nbytes, MemoryPool* pool, std::shared_ptr* out) const { + // Sanity checks + DCHECK_LT(start, size_); + DCHECK_LE(nbytes, size_ - start); + + auto new_buffer = std::make_shared(pool); + RETURN_NOT_OK(new_buffer->Resize(nbytes)); + + std::memcpy(new_buffer->mutable_data(), data() + start, nbytes); + + *out = new_buffer; + return Status::OK(); +} + +Status Buffer::Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) const { + return Copy(start, nbytes, default_memory_pool(), out); +} + +std::shared_ptr SliceBuffer( + const std::shared_ptr& buffer, int64_t offset, int64_t length) { + DCHECK_LT(offset, buffer->size()); + DCHECK_LE(length, buffer->size() - offset); + return std::make_shared(buffer, offset, length); +} + std::shared_ptr MutableBuffer::GetImmutableView() { return std::make_shared(this->get_shared_ptr(), 0, size()); } @@ -52,7 +78,7 @@ PoolBuffer::~PoolBuffer() { Status PoolBuffer::Reserve(int64_t 
new_capacity) { if (!mutable_data_ || new_capacity > capacity_) { uint8_t* new_data; - new_capacity = util::RoundUpToMultipleOf64(new_capacity); + new_capacity = BitUtil::RoundUpToMultipleOf64(new_capacity); if (mutable_data_) { RETURN_NOT_OK(pool_->Allocate(new_capacity, &new_data)); memcpy(new_data, mutable_data_, size_); diff --git a/cpp/src/arrow/util/buffer.h b/cpp/src/arrow/util/buffer.h index bc0df86221c..04ad6c2dffd 100644 --- a/cpp/src/arrow/util/buffer.h +++ b/cpp/src/arrow/util/buffer.h @@ -43,7 +43,8 @@ class Status; // The following invariant is always true: Size < Capacity class ARROW_EXPORT Buffer : public std::enable_shared_from_this { public: - Buffer(const uint8_t* data, int64_t size) : data_(data), size_(size), capacity_(size) {} + Buffer(const uint8_t* data, int64_t size) + : is_mutable_(false), data_(data), size_(size), capacity_(size) {} virtual ~Buffer(); // An offset into data that is owned by another buffer, but we want to be @@ -57,6 +58,8 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { std::shared_ptr get_shared_ptr() { return shared_from_this(); } + bool is_mutable() const { return is_mutable_; } + // Return true if both buffers are the same size and contain the same bytes // up to the number of compared bytes bool Equals(const Buffer& other, int64_t nbytes) const { @@ -71,18 +74,22 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { (data_ == other.data_ || !memcmp(data_, other.data_, size_))); } + // Copy section of buffer into a new Buffer + Status Copy(int64_t start, int64_t nbytes, MemoryPool* pool, + std::shared_ptr* out) const; + + // Default memory pool + Status Copy(int64_t start, int64_t nbytes, std::shared_ptr* out) const; + int64_t capacity() const { return capacity_; } const uint8_t* data() const { return data_; } int64_t size() const { return size_; } - // Returns true if this Buffer is referencing memory (possibly) owned by some - // other buffer - bool is_shared() const { return static_cast(parent_); } - const std::shared_ptr parent() const { return parent_; } protected: + bool is_mutable_; const uint8_t* data_; int64_t size_; int64_t capacity_; @@ -94,10 +101,16 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this { DISALLOW_COPY_AND_ASSIGN(Buffer); }; +// Construct a view on passed buffer at the indicated offset and length. This +// function cannot fail and does not error checking (except in debug builds) +std::shared_ptr SliceBuffer( + const std::shared_ptr& buffer, int64_t offset, int64_t length); + // A Buffer whose contents can be mutated. May or may not own its data. 
class ARROW_EXPORT MutableBuffer : public Buffer { public: MutableBuffer(uint8_t* data, int64_t size) : Buffer(data, size) { + is_mutable_ = true; mutable_data_ = data; } diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 5902b834169..7e70be75da5 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -44,7 +44,7 @@ using arrow::Field; using arrow::DataType; using arrow::Status; -namespace util = arrow::util; +namespace BitUtil = arrow::BitUtil; // ---------------------------------------------------------------------- // Serialization @@ -148,7 +148,7 @@ class ArrowSerializer { } Status InitNullBitmap() { - int null_bytes = util::bytes_for_bits(length_); + int null_bytes = BitUtil::BytesForBits(length_); null_bitmap_ = std::make_shared(pool_); RETURN_NOT_OK(null_bitmap_->Resize(null_bytes)); @@ -206,7 +206,7 @@ class ArrowSerializer { PyObject** objects = reinterpret_cast(PyArray_DATA(arr_)); - int nbytes = util::bytes_for_bits(length_); + int nbytes = BitUtil::BytesForBits(length_); auto data = std::make_shared(pool_); RETURN_NOT_OK(data->Resize(nbytes)); uint8_t* bitmap = data->mutable_data(); @@ -215,12 +215,12 @@ class ArrowSerializer { int64_t null_count = 0; for (int64_t i = 0; i < length_; ++i) { if (objects[i] == Py_True) { - util::set_bit(bitmap, i); - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(bitmap, i); + BitUtil::SetBit(null_bitmap_data_, i); } else if (objects[i] != Py_False) { ++null_count; } else { - util::set_bit(null_bitmap_data_, i); + BitUtil::SetBit(null_bitmap_data_, i); } } @@ -253,7 +253,7 @@ static int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap if (mask_values[i]) { ++null_count; } else { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } } return null_count; @@ -272,7 +272,7 @@ static int64_t ValuesToBitmap(const void* data, int64_t length, uint8_t* bitmap) if (traits::isnull(values[i])) { ++null_count; } else { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } } @@ -402,7 +402,7 @@ inline Status ArrowSerializer::ConvertData() { return Status::Invalid("no support for strided data yet"); } - int nbytes = util::bytes_for_bits(length_); + int nbytes = BitUtil::BytesForBits(length_); auto buffer = std::make_shared(pool_); RETURN_NOT_OK(buffer->Resize(nbytes)); @@ -413,7 +413,7 @@ inline Status ArrowSerializer::ConvertData() { memset(bitmap, 0, nbytes); for (int i = 0; i < length_; ++i) { if (values[i] > 0) { - util::set_bit(bitmap, i); + BitUtil::SetBit(bitmap, i); } } From e2c0a18316504a0177129cb66b25a9dc54291587 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Oct 2016 22:46:44 -0400 Subject: [PATCH 177/210] ARROW-327: [Python] Remove conda builds from Travis CI setup We'll do these builds in conda-forge Author: Wes McKinney Closes #178 from wesm/ARROW-327 and squashes the following commits: 1303d6e [Wes McKinney] Remove conda builds --- .travis.yml | 18 ---------------- ci/travis_conda_build.sh | 45 ---------------------------------------- 2 files changed, 63 deletions(-) delete mode 100755 ci/travis_conda_build.sh diff --git a/.travis.yml b/.travis.yml index 97229b1ceb3..a53756c962e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -41,24 +41,6 @@ matrix: script: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh - - compiler: gcc - env: ARROW_TEST_GROUP=packaging - os: linux - before_script: - - export CC="gcc-4.9" - - export CXX="g++-4.9" - script: - - 
$TRAVIS_BUILD_DIR/ci/travis_conda_build.sh - - os: osx - env: ARROW_TEST_GROUP=packaging - language: objective-c - osx_image: xcode6.4 - compiler: clang - addons: - before_script: - before_install: - script: - - $TRAVIS_BUILD_DIR/ci/travis_conda_build.sh - language: java os: linux jdk: oraclejdk7 diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh deleted file mode 100755 index 17a33ae9717..00000000000 --- a/ci/travis_conda_build.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. See accompanying LICENSE file. - -set -ex - -source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh - -# Build libarrow - -cd $TRAVIS_BUILD_DIR/cpp - -conda build conda.recipe --channel apache/channel/dev -CONDA_PACKAGE=`conda build --output conda.recipe | grep bz2` - -if [ $TRAVIS_BRANCH == "master" ] && [ $TRAVIS_PULL_REQUEST == "false" ]; then - anaconda --token $ANACONDA_TOKEN upload $CONDA_PACKAGE --user apache --channel dev; -fi - -# Build pyarrow - -cd $TRAVIS_BUILD_DIR/python - -build_for_python_version() { - PY_VERSION=$1 - conda build conda.recipe --python $PY_VERSION --channel apache/channel/dev - CONDA_PACKAGE=`conda build --python $PY_VERSION --output conda.recipe | grep bz2` - - if [ $TRAVIS_BRANCH == "master" ] && [ $TRAVIS_PULL_REQUEST == "false" ]; then - anaconda --token $ANACONDA_TOKEN upload $CONDA_PACKAGE --user apache --channel dev; - fi -} - -build_for_python_version 2.7 -build_for_python_version 3.5 From 446ec9bd628244bf675887f5a030d3a94c07645e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 17 Oct 2016 22:49:56 -0400 Subject: [PATCH 178/210] ARROW-334: [Python] Remove INSTALL_RPATH_USE_LINK_PATH Will try to verify whether this resolves the issue. See https://travis-ci.org/conda-forge/staged-recipes/builds/166897102 Author: Wes McKinney Closes #171 from wesm/ARROW-334 and squashes the following commits: ed8fa39 [Wes McKinney] Switch by to xcode 6.4 b8224ce [Wes McKinney] Escape dollar sign in ORIGIN b76b7ac [Wes McKinney] Fix LD_LIBRARY_PATH 3c8d2dd [Wes McKinney] Clean up Travis CI scripts a bit. 
Put in LD_LIBRARY_PATH 30488d7 [Wes McKinney] Don't conda install arrow-cpp during Travis build afb1dc0 [Wes McKinney] Remove INSTALL_RPATH_USE_LINK_PATH --- .travis.yml | 1 - ci/travis_before_script_cpp.sh | 4 ---- ci/travis_script_python.sh | 15 ++++++--------- python/CMakeLists.txt | 4 +--- 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/.travis.yml b/.travis.yml index a53756c962e..052c22ccc37 100644 --- a/.travis.yml +++ b/.travis.yml @@ -32,7 +32,6 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh - compiler: clang - language: objective-c osx_image: xcode6.4 os: osx addons: diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 2d4224b3333..20307736e67 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -15,10 +15,6 @@ set -ex -source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh -conda install -y --channel apache/channel/dev parquet-cpp -export PARQUET_HOME=$MINICONDA - : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} mkdir $CPP_BUILD_DIR diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 55cb2a76f6d..179567b5954 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -14,12 +14,16 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh + PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$HOME/miniconda export PATH="$MINICONDA/bin:$PATH" -export PARQUET_HOME=$MINICONDA + +export ARROW_HOME=$ARROW_CPP_INSTALL +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ARROW_CPP_INSTALL/lib pushd $PYTHON_DIR @@ -38,17 +42,10 @@ python_version_tests() { # Expensive dependencies install from Continuum package repo conda install -y pip numpy pandas cython - # conda install -y parquet-cpp - - conda install -y arrow-cpp -c apache/channel/dev - # Other stuff pip install pip install -r requirements.txt - export ARROW_HOME=$ARROW_CPP_INSTALL - - python setup.py build_ext \ - --inplace + python setup.py build_ext --inplace python -m pytest -vv -r sxX pyarrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 4357fa05ff8..b8be8665af0 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -417,8 +417,6 @@ if (UNIX) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) endif() -SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) @@ -494,7 +492,7 @@ foreach(module ${CYTHON_EXTENSIONS}) if(APPLE) set(module_install_rpath "@loader_path") else() - set(module_install_rpath "$ORIGIN") + set(module_install_rpath "\$ORIGIN") endif() list(LENGTH directories i) while(${i} GREATER 0) From 2f84493371bd8fae30b8e042984c9d6ba5419c5f Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 21 Oct 2016 16:27:00 -0400 Subject: [PATCH 179/210] ARROW-342: Set Python version on release Author: Uwe L. Korn Closes #179 from xhochy/ARROW-342 and squashes the following commits: 15d0ce3 [Uwe L. Korn] ARROW-342: Set Python version on release --- dev/release/00-prepare.sh | 9 +++++++-- python/.gitignore | 1 + python/pyarrow/__init__.py | 1 + python/setup.py | 24 ++++++++++++++++++++---- 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/dev/release/00-prepare.sh b/dev/release/00-prepare.sh index 3c1fb9a0938..3423a3e6c5b 100644 --- a/dev/release/00-prepare.sh +++ b/dev/release/00-prepare.sh @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -43,4 +43,9 @@ mvn release:prepare -Dtag=${tag} -DreleaseVersion=${version} -DautoVersionSubmod cd - +cd "${SOURCE_DIR}/../../python" +sed -i "s/VERSION = '[^']*'/VERSION = '${version}'/g" setup.py +sed -i "s/ISRELEASED = False/ISRELEASED = True/g" setup.py +cd - + echo "Finish staging binary artifacts by running: sh dev/release/01-perform.sh" diff --git a/python/.gitignore b/python/.gitignore index 7e2e360557a..07f28355a25 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -25,6 +25,7 @@ MANIFEST # Generated sources *.c *.cpp +pyarrow/version.py # Python files # setup.py working directory diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 8b131aaa8f4..775ce7ec475 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -42,3 +42,4 @@ DataType, Field, Schema, schema) from pyarrow.table import Column, RecordBatch, Table, from_pandas_dataframe +from pyarrow.version import version as __version__ diff --git a/python/setup.py b/python/setup.py index d040ea7e892..99049777514 100644 --- a/python/setup.py +++ b/python/setup.py @@ -50,10 +50,25 @@ if Cython.__version__ < '0.19.1': raise Exception('Please upgrade to Cython 0.19.1 or newer') -MAJOR = 0 -MINOR = 1 -MICRO = 0 -VERSION = '%d.%d.%ddev' % (MAJOR, MINOR, MICRO) +VERSION = '0.1.0' +ISRELEASED = False + +if not ISRELEASED: + VERSION += '.dev' + +setup_dir = os.path.abspath(os.path.dirname(__file__)) + + +def write_version_py(filename=os.path.join(setup_dir, 'pyarrow/version.py')): + a = open(filename, 'w') + file_content = "\n".join(["", + "# THIS FILE IS GENERATED FROM SETUP.PY", + "version = '%(version)s'", + "isrelease = '%(isrelease)s'"]) + + a.write(file_content % {'version': VERSION, + 'isrelease': str(ISRELEASED)}) + a.close() class clean(_clean): @@ -238,6 +253,7 @@ def get_outputs(self): return [self._get_cmake_ext_path(name) for name in self.get_names()] +write_version_py() DESC = """\ Python library for Apache Arrow""" From 3d2e4df219d6b06a3d78821bbca6ba17188908c2 Mon Sep 17 00:00:00 2001 From: adeneche Date: Wed, 26 Oct 2016 12:09:26 -0700 Subject: [PATCH 180/210] =?UTF-8?q?ARROW-337:=20UnionListWriter.list()=20i?= =?UTF-8?q?s=20doing=20more=20than=20it=20should,=20this=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …can cause data corruption The general idea is to use the "inner" writer's position to update the offset. This involves making sure various writers do indeed update their positions. 
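To make the safe pattern concrete, here is a minimal sketch modeled on the
listListType2 test added below (the RootAllocator setup and the import paths
are assumptions, not part of the patch):

    import org.apache.arrow.memory.BufferAllocator;
    import org.apache.arrow.memory.RootAllocator;
    import org.apache.arrow.vector.complex.ListVector;
    import org.apache.arrow.vector.complex.impl.UnionListWriter;
    import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter;

    public class InnerWriterSketch {
      public static void main(String[] args) {
        try (BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE);
             ListVector listVector = new ListVector("list", allocator, null)) {
          listVector.allocateNew();
          UnionListWriter listWriter = new UnionListWriter(listVector);
          // Obtained once and reused across rows; before this fix, each
          // call to list() also bumped the offset vector, corrupting data.
          ListWriter inner = listWriter.list();
          for (int i = 0; i < 3; i++) {
            listWriter.startList();   // now positions the inner writer
            inner.startList();
            inner.integer().writeInt(i);
            inner.endList();
            listWriter.endList();     // offset now read from inner position
          }
          listWriter.setValueCount(3);
        }
      }
    }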
UnionListWriter.startList() should explicitly set the inner writer position in case setPosition() was called to move the union list writer's position Author: adeneche Closes #183 from adeneche/ARROW-337 and squashes the following commits: 1ae7e00 [adeneche] updated TestComplexWriter to ensure position is set properly by the various writers 7d5aefc [adeneche] ARROW-337: UnionListWriter.list() is doing more than it should, this can cause data corruption --- .../AbstractPromotableFieldWriter.java | 2 + .../main/codegen/templates/MapWriters.java | 1 + .../codegen/templates/UnionListWriter.java | 32 +-- .../apache/arrow/vector/TestListVector.java | 4 - .../complex/writer/TestComplexWriter.java | 201 +++++++++++++----- 5 files changed, 154 insertions(+), 86 deletions(-) diff --git a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java index d21dcd0f646..60dd0c7b7ad 100644 --- a/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java +++ b/java/vector/src/main/codegen/templates/AbstractPromotableFieldWriter.java @@ -58,6 +58,7 @@ public void start() { @Override public void end() { getWriter(MinorType.MAP).end(); + setPosition(idx() + 1); } @Override @@ -68,6 +69,7 @@ public void startList() { @Override public void endList() { getWriter(MinorType.LIST).endList(); + setPosition(idx() + 1); } <#list vv.types as type><#list type.minor as minor><#assign name = minor.class?cap_first /> diff --git a/java/vector/src/main/codegen/templates/MapWriters.java b/java/vector/src/main/codegen/templates/MapWriters.java index 51327b43af0..f41b60072c8 100644 --- a/java/vector/src/main/codegen/templates/MapWriters.java +++ b/java/vector/src/main/codegen/templates/MapWriters.java @@ -185,6 +185,7 @@ public void start() { @Override public void end() { + setPosition(idx()+1); } <#list vv.types as type><#list type.minor as minor> diff --git a/java/vector/src/main/codegen/templates/UnionListWriter.java b/java/vector/src/main/codegen/templates/UnionListWriter.java index 04531a72128..bb39fe8d294 100644 --- a/java/vector/src/main/codegen/templates/UnionListWriter.java +++ b/java/vector/src/main/codegen/templates/UnionListWriter.java @@ -101,11 +101,7 @@ public void setPosition(int index) { public ${name}Writer <#if uncappedName == "int">integer<#else>${uncappedName}(String name) { // assert inMap; mapName = name; - final int nextOffset = offsets.getAccessor().get(idx() + 1); - vector.getMutator().setNotNull(idx()); - writer.setPosition(nextOffset); - ${name}Writer ${uncappedName}Writer = writer.<#if uncappedName == "int">integer<#else>${uncappedName}(name); - return ${uncappedName}Writer; + return writer.<#if uncappedName == "int">integer<#else>${uncappedName}(name); } @@ -120,18 +116,11 @@ public MapWriter map() { @Override public ListWriter list() { - final int nextOffset = offsets.getAccessor().get(idx() + 1); - vector.getMutator().setNotNull(idx()); - offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); - writer.setPosition(nextOffset); return writer; } @Override public ListWriter list(String name) { - final int nextOffset = offsets.getAccessor().get(idx() + 1); - vector.getMutator().setNotNull(idx()); - writer.setPosition(nextOffset); ListWriter listWriter = writer.list(name); return listWriter; } @@ -145,30 +134,26 @@ public MapWriter map(String name) { @Override public void startList() { vector.getMutator().startNewValue(idx()); + writer.setPosition(offsets.getAccessor().get(idx() + 1)); } @Override 
public void endList() { - + offsets.getMutator().set(idx() + 1, writer.idx()); + setPosition(idx() + 1); } @Override public void start() { // assert inMap; - final int nextOffset = offsets.getAccessor().get(idx() + 1); - vector.getMutator().setNotNull(idx()); - offsets.getMutator().setSafe(idx() + 1, nextOffset); - writer.setPosition(nextOffset); writer.start(); } @Override public void end() { // if (inMap) { - writer.end(); - inMap = false; - final int nextOffset = offsets.getAccessor().get(idx() + 1); - offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); + writer.end(); + inMap = false; // } } @@ -181,11 +166,8 @@ public void end() { @Override public void write${name}(<#list fields as field>${field.type} ${field.name}<#if field_has_next>, ) { // assert !inMap; - final int nextOffset = offsets.getAccessor().get(idx() + 1); - vector.getMutator().setNotNull(idx()); - writer.setPosition(nextOffset); writer.write${name}(<#list fields as field>${field.name}<#if field_has_next>, ); - offsets.getMutator().setSafe(idx() + 1, nextOffset + 1); + writer.setPosition(writer.idx()+1); } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java index bb710336555..1f0baaed776 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java @@ -19,18 +19,14 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.vector.complex.ListVector; -import org.apache.arrow.vector.complex.impl.ComplexCopier; -import org.apache.arrow.vector.complex.impl.UnionListReader; import org.apache.arrow.vector.complex.impl.UnionListWriter; import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.types.pojo.Field; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; public class TestListVector { - private final static String EMPTY_SCHEMA_PATH = ""; private BufferAllocator allocator; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java index 9419f88de5b..6e0e617f299 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/complex/writer/TestComplexWriter.java @@ -65,10 +65,10 @@ public void simpleNestedTypes() { IntWriter intWriter = rootWriter.integer("int"); BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); for (int i = 0; i < COUNT; i++) { - intWriter.setPosition(i); + rootWriter.start(); intWriter.writeInt(i); - bigIntWriter.setPosition(i); bigIntWriter.writeBigInt(i); + rootWriter.end(); } writer.setValueCount(COUNT); MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); @@ -83,23 +83,52 @@ public void simpleNestedTypes() { @Test public void nullableMap() { - MapVector parent = new MapVector("parent", allocator, null); - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - for (int i = 0; i < COUNT; i++) { - rootWriter.setPosition(i); - rootWriter.start(); - if (i % 2 == 0) { - MapWriter mapWriter = rootWriter.map("map"); - mapWriter.setPosition(i); - mapWriter.start(); - mapWriter.bigInt("nested").writeBigInt(i); - mapWriter.end(); + try (MapVector mapVector = new MapVector("parent", allocator, null)) { + ComplexWriter 
writer = new ComplexWriterImpl("root", mapVector); + MapWriter rootWriter = writer.rootAsMap(); + for (int i = 0; i < COUNT; i++) { + rootWriter.start(); + if (i % 2 == 0) { + MapWriter mapWriter = rootWriter.map("map"); + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.bigInt("nested").writeBigInt(i); + mapWriter.end(); + } + rootWriter.end(); } - rootWriter.end(); + writer.setValueCount(COUNT); + checkNullableMap(mapVector); } - writer.setValueCount(COUNT); - MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + } + + /** + * This test is similar to {@link #nullableMap()} ()} but we get the inner map writer once at the beginning + */ + @Test + public void nullableMap2() { + try (MapVector mapVector = new MapVector("parent", allocator, null)) { + ComplexWriter writer = new ComplexWriterImpl("root", mapVector); + MapWriter rootWriter = writer.rootAsMap(); + MapWriter mapWriter = rootWriter.map("map"); + + for (int i = 0; i < COUNT; i++) { + rootWriter.start(); + if (i % 2 == 0) { + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.bigInt("nested").writeBigInt(i); + mapWriter.end(); + } + rootWriter.end(); + } + writer.setValueCount(COUNT); + checkNullableMap(mapVector); + } + } + + private void checkNullableMap(MapVector mapVector) { + MapReader rootReader = new SingleMapReaderImpl(mapVector).reader("root"); for (int i = 0; i < COUNT; i++) { rootReader.setPosition(i); assertTrue("index is set: " + i, rootReader.isSet()); @@ -113,11 +142,10 @@ public void nullableMap() { assertNull("index is not set: " + i, map.readObject()); } } - parent.close(); } @Test - public void listOfLists() { + public void testList() { MapVector parent = new MapVector("parent", allocator, null); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); @@ -129,7 +157,6 @@ public void listOfLists() { rootWriter.list("list").endList(); rootWriter.end(); - rootWriter.setPosition(1); rootWriter.start(); rootWriter.bigInt("int").writeBigInt(1); rootWriter.end(); @@ -152,7 +179,6 @@ public void listScalarType() { listVector.allocateNew(); UnionListWriter listWriter = new UnionListWriter(listVector); for (int i = 0; i < COUNT; i++) { - listWriter.setPosition(i); listWriter.startList(); for (int j = 0; j < i % 7; j++) { listWriter.writeInt(j); @@ -206,7 +232,6 @@ public void listMapType() { UnionListWriter listWriter = new UnionListWriter(listVector); MapWriter mapWriter = listWriter.map(); for (int i = 0; i < COUNT; i++) { - listWriter.setPosition(i); listWriter.startList(); for (int j = 0; j < i % 7; j++) { mapWriter.start(); @@ -230,23 +255,53 @@ public void listMapType() { @Test public void listListType() { - ListVector listVector = new ListVector("list", allocator, null); - listVector.allocateNew(); - UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - ListWriter innerListWriter = listWriter.list(); - innerListWriter.startList(); - for (int k = 0; k < i % 13; k++) { - innerListWriter.integer().writeInt(k); + try (ListVector listVector = new ListVector("list", allocator, null)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + 
innerListWriter.integer().writeInt(k); + } + innerListWriter.endList(); } - innerListWriter.endList(); + listWriter.endList(); } - listWriter.endList(); + listWriter.setValueCount(COUNT); + checkListOfLists(listVector); } - listWriter.setValueCount(COUNT); + } + + /** + * This test is similar to {@link #listListType()} but we get the inner list writer once at the beginning + */ + @Test + public void listListType2() { + try (ListVector listVector = new ListVector("list", allocator, null)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + ListWriter innerListWriter = listWriter.list(); + + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + innerListWriter.integer().writeInt(k); + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkListOfLists(listVector); + } + } + + private void checkListOfLists(final ListVector listVector) { UnionListReader listReader = new UnionListReader(listVector); for (int i = 0; i < COUNT; i++) { listReader.setPosition(i); @@ -259,32 +314,65 @@ public void listListType() { } } } - listVector.clear(); } @Test public void unionListListType() { - ListVector listVector = new ListVector("list", allocator, null); - listVector.allocateNew(); - UnionListWriter listWriter = new UnionListWriter(listVector); - for (int i = 0; i < COUNT; i++) { - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 7; j++) { - ListWriter innerListWriter = listWriter.list(); - innerListWriter.startList(); - for (int k = 0; k < i % 13; k++) { - if (k % 2 == 0) { - innerListWriter.integer().writeInt(k); - } else { - innerListWriter.bigInt().writeBigInt(k); + try (ListVector listVector = new ListVector("list", allocator, null)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + ListWriter innerListWriter = listWriter.list(); + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } } + innerListWriter.endList(); } - innerListWriter.endList(); + listWriter.endList(); } - listWriter.endList(); + listWriter.setValueCount(COUNT); + checkUnionList(listVector); } - listWriter.setValueCount(COUNT); + } + + /** + * This test is similar to {@link #unionListListType()} but we get the inner list writer once at the beginning + */ + @Test + public void unionListListType2() { + try (ListVector listVector = new ListVector("list", allocator, null)) { + listVector.allocateNew(); + UnionListWriter listWriter = new UnionListWriter(listVector); + ListWriter innerListWriter = listWriter.list(); + + for (int i = 0; i < COUNT; i++) { + listWriter.startList(); + for (int j = 0; j < i % 7; j++) { + innerListWriter.startList(); + for (int k = 0; k < i % 13; k++) { + if (k % 2 == 0) { + innerListWriter.integer().writeInt(k); + } else { + innerListWriter.bigInt().writeBigInt(k); + } + } + innerListWriter.endList(); + } + listWriter.endList(); + } + listWriter.setValueCount(COUNT); + checkUnionList(listVector); + } + } + + private void checkUnionList(ListVector listVector) { UnionListReader listReader = new UnionListReader(listVector); for (int i = 0; i < COUNT; i++) { listReader.setPosition(i); @@ -301,7 +389,6 @@ public 
void unionListListType() { } } } - listVector.clear(); } @Test @@ -384,8 +471,8 @@ public void promotableWriterSchema() { MapVector parent = new MapVector("parent", allocator, null); ComplexWriter writer = new ComplexWriterImpl("root", parent); MapWriter rootWriter = writer.rootAsMap(); - BigIntWriter bigIntWriter = rootWriter.bigInt("a"); - VarCharWriter varCharWriter = rootWriter.varChar("a"); + rootWriter.bigInt("a"); + rootWriter.varChar("a"); Field field = parent.getField().getChildren().get(0).getChildren().get(0); Assert.assertEquals("a", field.getName()); From 6178bf7b0f0cf66f52536f5d5fb5ee104e696f3c Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Fri, 28 Oct 2016 21:13:02 -0400 Subject: [PATCH 181/210] ARROW-350: Added Kerberos to HDFS client Author: Christopher C. Aycock Closes #185 from chrisaycock/ARROW-350 and squashes the following commits: c2a4e64 [Christopher C. Aycock] Renamed 'kerb' parameter to 'kerb_ticket' f1d63de [Christopher C. Aycock] ARROW-350: Added Kerberos to HDFS client 8f1052f [Christopher C. Aycock] ARROW-345: Proper locations of libhdfs and libjvm on Mac --- cpp/doc/HDFS.md | 22 ++++++- cpp/src/arrow/io/hdfs.cc | 16 ++++- cpp/src/arrow/io/hdfs.h | 9 +-- cpp/src/arrow/io/libhdfs_shim.cc | 87 ++++++++++++++++++------- python/pyarrow/includes/libarrow_io.pxd | 1 + python/pyarrow/io.pyx | 29 ++++++--- 6 files changed, 124 insertions(+), 40 deletions(-) diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md index 83311db2d2d..6b1bb8c4524 100644 --- a/cpp/doc/HDFS.md +++ b/cpp/doc/HDFS.md @@ -43,7 +43,7 @@ LD_LIBRARY_PATH), and relies on some environment variables. export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` ``` -#### Setting $JAVA_HOME automatically on OS X +### Mac Specifics The installed location of Java on OS X can vary, however the following snippet will set it automatically for you: @@ -51,3 +51,23 @@ will set it automatically for you: ```shell export JAVA_HOME=$(/usr/libexec/java_home) ``` + +Homebrew's Hadoop does not have native libs. Apache doesn't build these, so +users must build Hadoop to get the native libs. See this Stack Overflow +answer for details: + +http://stackoverflow.com/a/40051353/478288 + +Be sure to include the path to the native libs in `JAVA_LIBRARY_PATH`: + +```shell +export JAVA_LIBRARY_PATH=$HADOOP_HOME/lib/native:$JAVA_LIBRARY_PATH +``` + +If you get an error about needing to install Java 6, then add *BundledApp* and +*JNI* to the `JVMCapabilities` in `$JAVA_HOME/../Info.plist`. 
See + +https://oliverdowling.com.au/2015/10/09/oracles-jre-8-on-mac-os-x-el-capitan/ + +https://derflounder.wordpress.com/2015/08/08/modifying-oracles-java-sdk-to-run-java-applications-on-os-x/ + diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index b74f84604f1..6490a7574ee 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -287,12 +287,25 @@ class HdfsClient::HdfsClientImpl { Status Connect(const HdfsConnectionConfig* config) { RETURN_NOT_OK(ConnectLibHdfs()); - fs_ = hdfsConnectAsUser(config->host.c_str(), config->port, config->user.c_str()); + // connect to HDFS with the builder object + hdfsBuilder* builder = hdfsNewBuilder(); + if (!config->host.empty()) { + hdfsBuilderSetNameNode(builder, config->host.c_str()); + } + hdfsBuilderSetNameNodePort(builder, config->port); + if (!config->user.empty()) { + hdfsBuilderSetUserName(builder, config->user.c_str()); + } + if (!config->kerb_ticket.empty()) { + hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); + } + fs_ = hdfsBuilderConnect(builder); if (fs_ == nullptr) { return Status::IOError("HDFS connection failed"); } namenode_host_ = config->host; port_ = config->port; user_ = config->user; + kerb_ticket_ = config->kerb_ticket; return Status::OK(); } @@ -425,6 +438,7 @@ class HdfsClient::HdfsClientImpl { std::string namenode_host_; std::string user_; int port_; + std::string kerb_ticket_; hdfsFS fs_; }; diff --git a/cpp/src/arrow/io/hdfs.h b/cpp/src/arrow/io/hdfs.h index 4a4e3ec5f51..48699c91450 100644 --- a/cpp/src/arrow/io/hdfs.h +++ b/cpp/src/arrow/io/hdfs.h @@ -60,19 +60,16 @@ struct HdfsConnectionConfig { std::string host; int port; std::string user; - - // TODO: Kerberos, etc. + std::string kerb_ticket; }; class ARROW_EXPORT HdfsClient : public FileSystemClient { public: ~HdfsClient(); - // Connect to an HDFS cluster at indicated host, port, and as user + // Connect to an HDFS cluster given a configuration // - // @param host (in) - // @param port (in) - // @param user (in): user to identify as + // @param config (in): configuration for connecting // @param fs (out): the created client // @returns Status static Status Connect( diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index f256c31b4f4..07eb6250bbe 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -73,9 +73,17 @@ static HINSTANCE libjvm_handle = NULL; // NOTE(wesm): cpplint does not like use of short and other imprecise C types -static hdfsFS (*ptr_hdfsConnectAsUser)( - const char* host, tPort port, const char* user) = NULL; -static hdfsFS (*ptr_hdfsConnect)(const char* host, tPort port) = NULL; +static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL; +static void (*ptr_hdfsBuilderSetNameNode)( + hdfsBuilder* bld, const char* nn) = NULL; +static void (*ptr_hdfsBuilderSetNameNodePort)( + hdfsBuilder* bld, tPort port) = NULL; +static void (*ptr_hdfsBuilderSetUserName)( + hdfsBuilder* bld, const char* userName) = NULL; +static void (*ptr_hdfsBuilderSetKerbTicketCachePath)( + hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL; +static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL; + static int (*ptr_hdfsDisconnect)(hdfsFS fs) = NULL; static hdfsFile (*ptr_hdfsOpenFile)(hdfsFS fs, const char* path, int flags, @@ -149,18 +157,29 @@ static void* get_symbol(const char* symbol) { #endif } -hdfsFS hdfsConnectAsUser(const char* host, tPort port, const char* user) { - return ptr_hdfsConnectAsUser(host, port, user); +hdfsBuilder* 
hdfsNewBuilder(void) { + return ptr_hdfsNewBuilder(); } -// Returns NULL on failure -hdfsFS hdfsConnect(const char* host, tPort port) { - if (ptr_hdfsConnect) { - return ptr_hdfsConnect(host, port); - } else { - // TODO: error reporting when shim setup fails - return NULL; - } +void hdfsBuilderSetNameNode(hdfsBuilder* bld, const char* nn) { + ptr_hdfsBuilderSetNameNode(bld, nn); +} + +void hdfsBuilderSetNameNodePort(hdfsBuilder* bld, tPort port) { + ptr_hdfsBuilderSetNameNodePort(bld, port); +} + +void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) { + ptr_hdfsBuilderSetUserName(bld, userName); +} + +void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld, + const char* kerbTicketCachePath) { + ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath); +} + +hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) { + return ptr_hdfsBuilderConnect(bld); } int hdfsDisconnect(hdfsFS fs) { @@ -342,18 +361,36 @@ int hdfsUtime(hdfsFS fs, const char* path, tTime mtime, tTime atime) { } static std::vector get_potential_libhdfs_paths() { - std::vector libhdfs_potential_paths = { - // find one in the local directory - fs::path("./libhdfs.so"), fs::path("./hdfs.dll"), - // find a global libhdfs.so - fs::path("libhdfs.so"), fs::path("hdfs.dll"), + std::vector libhdfs_potential_paths; + std::string file_name; + + // OS-specific file name +#ifdef __WIN32 + file_name = "hdfs.dll"; +#elif __APPLE__ + file_name = "libhdfs.dylib"; +#else + file_name = "libhdfs.so"; +#endif + + // Common paths + std::vector search_paths = { + fs::path(""), + fs::path(".") }; + // Path from environment variable const char* hadoop_home = std::getenv("HADOOP_HOME"); if (hadoop_home != nullptr) { - auto path = fs::path(hadoop_home) / "lib/native/libhdfs.so"; - libhdfs_potential_paths.push_back(path); + auto path = fs::path(hadoop_home) / "lib/native"; + search_paths.push_back(path); } + + // All paths with file name + for (auto& path : search_paths) { + libhdfs_potential_paths.push_back(path / file_name); + } + return libhdfs_potential_paths; } @@ -371,7 +408,7 @@ static std::vector get_potential_libjvm_paths() { file_name = "jvm.dll"; #elif __APPLE__ search_prefixes = {""}; - search_suffixes = {""}; + search_suffixes = {"", "/jre/lib/server"}; file_name = "libjvm.dylib"; // SFrame uses /usr/libexec/java_home to find JAVA_HOME; for now we are @@ -513,8 +550,12 @@ Status ARROW_EXPORT ConnectLibHdfs() { return Status::IOError("Prior attempt to load libhdfs failed"); } - GET_SYMBOL_REQUIRED(hdfsConnect); - GET_SYMBOL_REQUIRED(hdfsConnectAsUser); + GET_SYMBOL_REQUIRED(hdfsNewBuilder); + GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNode); + GET_SYMBOL_REQUIRED(hdfsBuilderSetNameNodePort); + GET_SYMBOL_REQUIRED(hdfsBuilderSetUserName); + GET_SYMBOL_REQUIRED(hdfsBuilderSetKerbTicketCachePath); + GET_SYMBOL_REQUIRED(hdfsBuilderConnect); GET_SYMBOL_REQUIRED(hdfsCreateDirectory); GET_SYMBOL_REQUIRED(hdfsDelete); GET_SYMBOL_REQUIRED(hdfsDisconnect); diff --git a/python/pyarrow/includes/libarrow_io.pxd b/python/pyarrow/includes/libarrow_io.pxd index 8074915508f..77034159d2f 100644 --- a/python/pyarrow/includes/libarrow_io.pxd +++ b/python/pyarrow/includes/libarrow_io.pxd @@ -93,6 +93,7 @@ cdef extern from "arrow/io/hdfs.h" namespace "arrow::io" nogil: c_string host int port c_string user + c_string kerb_ticket cdef cppclass HdfsPathInfo: ObjectType kind; diff --git a/python/pyarrow/io.pyx b/python/pyarrow/io.pyx index 16ebfa1138e..0e6b81e9844 100644 --- a/python/pyarrow/io.pyx +++ b/python/pyarrow/io.pyx @@ -288,9 +288,6 @@ cdef class 
HdfsClient: shared_ptr[CHdfsClient] client cdef readonly: - object host - int port - object user bint is_open def __cinit__(self): @@ -301,6 +298,9 @@ cdef class HdfsClient: self.close() def close(self): + """ + Disconnect from the HDFS cluster + """ self._ensure_client() with nogil: check_status(self.client.get().Disconnect()) @@ -313,14 +313,21 @@ cdef class HdfsClient: raise IOError('HDFS client is closed') @classmethod - def connect(cls, host, port, user): + def connect(cls, host="default", port=0, user=None, kerb_ticket=None): """ + Connect to an HDFS cluster. All parameters are optional and should + only be set if the defaults need to be overridden. + + Authentication should be automatic if the HDFS cluster uses Kerberos. + However, if a username is specified, then the ticket cache will likely + be required. Parameters ---------- - host : - port : - user : + host : NameNode. Set to "default" for fs.defaultFS from core-site.xml. + port : NameNode's port. Set to 0 for default or logical (HA) nodes. + user : Username when connecting to HDFS; None implies login user. + kerb_ticket : Path to Kerberos ticket cache. Notes ----- @@ -335,9 +342,13 @@ cdef class HdfsClient: HdfsClient out = HdfsClient() HdfsConnectionConfig conf - conf.host = tobytes(host) + if host is not None: + conf.host = tobytes(host) conf.port = port - conf.user = tobytes(user) + if user is not None: + conf.user = tobytes(user) + if kerb_ticket is not None: + conf.kerb_ticket = tobytes(kerb_ticket) with nogil: check_status(CHdfsClient.Connect(&conf, &out.client)) From da24c1a0a2aba7ccd42cc3cbcf240eeb22d7ffb6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Sat, 29 Oct 2016 10:02:15 +0200 Subject: [PATCH 182/210] ARROW-339: Python 3 compatibility in merge_arrow_pr.py Author: Wes McKinney Closes #188 from wesm/ARROW-339 and squashes the following commits: 1f3617f [Wes McKinney] Remove cherry-picking cruft 6b99632 [Wes McKinney] Python 3 compatibility in merge_arrow_pr.py --- dev/merge_arrow_pr.py | 193 +++++++++++++++++++----------------------- 1 file changed, 88 insertions(+), 105 deletions(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index 8f47f93b26d..aa899edd62c 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -17,22 +17,24 @@ # limitations under the License. # -# Utility for creating well-formed pull request merges and pushing them to Apache. +# Utility for creating well-formed pull request merges and pushing them to +# Apache. # usage: ./apache-pr-merge.py (see config env vars below) # # This utility assumes you already have a local Arrow git clone and that you # have added remotes corresponding to both (i) the Github Apache Arrow mirror # and (ii) the apache git repo. 
-import json import os import re import subprocess import sys -import tempfile -import urllib2 +import requests import getpass +from six.moves import input +import six + try: import jira.client JIRA_IMPORTED = True @@ -42,8 +44,8 @@ # Location of your Arrow git clone ARROW_HOME = os.path.abspath(__file__).rsplit("/", 2)[0] PROJECT_NAME = ARROW_HOME.rsplit("/", 1)[1] -print "ARROW_HOME = " + ARROW_HOME -print "PROJECT_NAME = " + PROJECT_NAME +print("ARROW_HOME = " + ARROW_HOME) +print("PROJECT_NAME = " + PROJECT_NAME) # Remote name which points to the Gihub site PR_REMOTE_NAME = os.environ.get("PR_REMOTE_NAME", "apache-github") @@ -65,46 +67,38 @@ def get_json(url): - try: - from urllib2 import urlopen, Request - env_var = 'ARROW_GITHUB_API_TOKEN' - - if env_var in os.environ: - token = os.environ[env_var] - request = Request(url) - request.add_header('Authorization', 'token %s' % token) - response = urlopen(request) - else: - response = urlopen(url) - return json.load(response) - except urllib2.HTTPError as e: - print "Unable to fetch URL, exiting: %s" % url - sys.exit(-1) + req = requests.get(url) + return req.json() def fail(msg): - print msg + print(msg) clean_up() sys.exit(-1) def run_cmd(cmd): + if isinstance(cmd, six.string_types): + cmd = cmd.split(' ') + try: - if isinstance(cmd, list): - return subprocess.check_output(cmd) - else: - return subprocess.check_output(cmd.split(" ")) + output = subprocess.check_output(cmd) except subprocess.CalledProcessError as e: # this avoids hiding the stdout / stderr of failed processes - print 'Command failed: %s' % cmd - print 'With output:' - print '--------------' - print e.output - print '--------------' + print('Command failed: %s' % cmd) + print('With output:') + print('--------------') + print(e.output) + print('--------------') raise e + if isinstance(output, six.binary_type): + output = output.decode('utf-8') + return output + + def continue_maybe(prompt): - result = raw_input("\n%s (y/n): " % prompt) + result = input("\n%s (y/n): " % prompt) if result.lower() != "y": fail("Okay, exiting") @@ -113,38 +107,44 @@ def continue_maybe(prompt): def clean_up(): - print "Restoring head pointer to %s" % original_head + print("Restoring head pointer to %s" % original_head) run_cmd("git checkout %s" % original_head) branches = run_cmd("git branch").replace(" ", "").split("\n") - for branch in filter(lambda x: x.startswith(BRANCH_PREFIX), branches): - print "Deleting local branch %s" % branch + for branch in [x for x in branches if x.startswith(BRANCH_PREFIX)]: + print("Deleting local branch %s" % branch) run_cmd("git branch -D %s" % branch) # merge the requested PR and return the merge hash def merge_pr(pr_num, target_ref): pr_branch_name = "%s_MERGE_PR_%s" % (BRANCH_PREFIX, pr_num) - target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, target_ref.upper()) - run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, pr_branch_name)) - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, target_branch_name)) + target_branch_name = "%s_MERGE_PR_%s_%s" % (BRANCH_PREFIX, pr_num, + target_ref.upper()) + run_cmd("git fetch %s pull/%s/head:%s" % (PR_REMOTE_NAME, pr_num, + pr_branch_name)) + run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, target_ref, + target_branch_name)) run_cmd("git checkout %s" % target_branch_name) had_conflicts = False try: run_cmd(['git', 'merge', pr_branch_name, '--squash']) except Exception as e: - msg = "Error merging: %s\nWould you like to manually fix-up this merge?" 
% e + msg = ("Error merging: %s\nWould you like to " + "manually fix-up this merge?" % e) continue_maybe(msg) - msg = "Okay, please fix any conflicts and 'git add' conflicting files... Finished?" + msg = ("Okay, please fix any conflicts and 'git add' " + "conflicting files... Finished?") continue_maybe(msg) had_conflicts = True commit_authors = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, '--pretty=format:%an <%ae>']).split("\n") distinct_authors = sorted(set(commit_authors), - key=lambda x: commit_authors.count(x), reverse=True) + key=lambda x: commit_authors.count(x), + reverse=True) primary_author = distinct_authors[0] commits = run_cmd(['git', 'log', 'HEAD..%s' % pr_branch_name, '--pretty=format:%h [%an] %s']).split("\n\n") @@ -152,7 +152,7 @@ def merge_pr(pr_num, target_ref): merge_message_flags = [] merge_message_flags += ["-m", title] - if body != None: + if body is not None: merge_message_flags += ["-m", body] authors = "\n".join(["Author: %s" % a for a in distinct_authors]) @@ -162,14 +162,17 @@ def merge_pr(pr_num, target_ref): if had_conflicts: committer_name = run_cmd("git config --get user.name").strip() committer_email = run_cmd("git config --get user.email").strip() - message = "This patch had conflicts when merged, resolved by\nCommitter: %s <%s>" % ( - committer_name, committer_email) + message = ("This patch had conflicts when merged, " + "resolved by\nCommitter: %s <%s>" % + (committer_name, committer_email)) merge_message_flags += ["-m", message] - # The string "Closes #%s" string is required for GitHub to correctly close the PR + # The string "Closes #%s" string is required for GitHub to correctly close + # the PR merge_message_flags += [ "-m", - "Closes #%s from %s and squashes the following commits:" % (pr_num, pr_repo_desc)] + "Closes #%s from %s and squashes the following commits:" + % (pr_num, pr_repo_desc)] for c in commits: merge_message_flags += ["-m", c] @@ -182,7 +185,8 @@ def merge_pr(pr_num, target_ref): target_branch_name, PUSH_REMOTE_NAME)) try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, target_ref)) + run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, target_branch_name, + target_ref)) except Exception as e: clean_up() fail("Exception while pushing: %s" % e) @@ -194,65 +198,42 @@ def merge_pr(pr_num, target_ref): return merge_hash -def cherry_pick(pr_num, merge_hash, default_branch): - pick_ref = raw_input("Enter a branch name [%s]: " % default_branch) - if pick_ref == "": - pick_ref = default_branch - - pick_branch_name = "%s_PICK_PR_%s_%s" % (BRANCH_PREFIX, pr_num, pick_ref.upper()) - - run_cmd("git fetch %s %s:%s" % (PUSH_REMOTE_NAME, pick_ref, pick_branch_name)) - run_cmd("git checkout %s" % pick_branch_name) - run_cmd("git cherry-pick -sx %s" % merge_hash) - - continue_maybe("Pick complete (local ref %s). Push to %s?" % ( - pick_branch_name, PUSH_REMOTE_NAME)) - - try: - run_cmd('git push %s %s:%s' % (PUSH_REMOTE_NAME, pick_branch_name, pick_ref)) - except Exception as e: - clean_up() - fail("Exception while pushing: %s" % e) - - pick_hash = run_cmd("git rev-parse %s" % pick_branch_name)[:8] - clean_up() - - print("Pull request #%s picked into %s!" 
% (pr_num, pick_ref)) - print("Pick hash: %s" % pick_hash) - return pick_ref - - def fix_version_from_branch(branch, versions): - # Note: Assumes this is a sorted (newest->oldest) list of un-released versions + # Note: Assumes this is a sorted (newest->oldest) list of un-released + # versions if branch == "master": return versions[0] else: branch_ver = branch.replace("branch-", "") - return filter(lambda x: x.name.startswith(branch_ver), versions)[-1] + return [x for x in versions if x.name.startswith(branch_ver)][-1] + def exctract_jira_id(title): m = re.search(r'^(ARROW-[0-9]+)\b.*$', title) if m and m.groups > 0: return m.group(1) else: - fail("PR title should be prefixed by a jira id \"ARROW-XXX: ...\", found: \"%s\"" % title) + fail("PR title should be prefixed by a jira id " + "\"ARROW-XXX: ...\", found: \"%s\"" % title) + def check_jira(title): jira_id = exctract_jira_id(title) asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) try: - issue = asf_jira.issue(jira_id) + asf_jira.issue(jira_id) except Exception as e: fail("ASF JIRA could not find %s\n%s" % (jira_id, e)) + def resolve_jira(title, merge_branches, comment): asf_jira = jira.client.JIRA({'server': JIRA_API_BASE}, basic_auth=(JIRA_USERNAME, JIRA_PASSWORD)) default_jira_id = exctract_jira_id(title) - jira_id = raw_input("Enter a JIRA id [%s]: " % default_jira_id) + jira_id = input("Enter a JIRA id [%s]: " % default_jira_id) if jira_id == "": jira_id = default_jira_id @@ -271,30 +252,33 @@ def resolve_jira(title, merge_branches, comment): if cur_status == "Resolved" or cur_status == "Closed": fail("JIRA issue %s already has status '%s'" % (jira_id, cur_status)) - print ("=== JIRA %s ===" % jira_id) - print ("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%s\n" % ( - cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) + print("=== JIRA %s ===" % jira_id) + print("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%sf\n" + % (cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) resolve = filter(lambda a: a['name'] == "Resolve Issue", asf_jira.transitions(jira_id))[0] asf_jira.transition_issue(jira_id, resolve["id"], comment=comment) - print "Succesfully resolved %s!" % (jira_id) + print("Succesfully resolved %s!" % (jira_id)) if not JIRA_USERNAME: - JIRA_USERNAME = raw_input("Env JIRA_USERNAME not set, please enter your JIRA username:") + JIRA_USERNAME = input("Env JIRA_USERNAME not set, " + "please enter your JIRA username:") if not JIRA_PASSWORD: - JIRA_PASSWORD = getpass.getpass("Env JIRA_PASSWORD not set, please enter your JIRA password:") + JIRA_PASSWORD = getpass.getpass("Env JIRA_PASSWORD not set, please enter " + "your JIRA password:") branches = get_json("%s/branches" % GITHUB_API_BASE) -branch_names = filter(lambda x: x.startswith("branch-"), [x['name'] for x in branches]) +branch_names = [x['name'] for x in branches if x['name'].startswith('branch-')] + # Assumes branch names can be sorted lexicographically # Julien: I commented this out as we don't have any "branch-*" branch yet -#latest_branch = sorted(branch_names, reverse=True)[0] +# latest_branch = sorted(branch_names, reverse=True)[0] -pr_num = raw_input("Which pull request would you like to merge? (e.g. 34): ") +pr_num = input("Which pull request would you like to merge? (e.g. 
34): ") pr = get_json("%s/pulls/%s" % (GITHUB_API_BASE, pr_num)) url = pr["url"] @@ -307,42 +291,41 @@ def resolve_jira(title, merge_branches, comment): pr_repo_desc = "%s/%s" % (user_login, base_ref) if pr["merged"] is True: - print "Pull request %s has already been merged, assuming you want to backport" % pr_num + print("Pull request %s has already been merged, " + "assuming you want to backport" % pr_num) merge_commit_desc = run_cmd([ 'git', 'log', '--merges', '--first-parent', '--grep=pull request #%s' % pr_num, '--oneline']).split("\n")[0] if merge_commit_desc == "": - fail("Couldn't find any merge commit for #%s, you may need to update HEAD." % pr_num) + fail("Couldn't find any merge commit for #%s, " + "you may need to update HEAD." % pr_num) merge_hash = merge_commit_desc[:7] message = merge_commit_desc[8:] - print "Found: %s" % message - maybe_cherry_pick(pr_num, merge_hash, latest_branch) + print("Found: %s" % message) sys.exit(0) if not bool(pr["mergeable"]): - msg = "Pull request %s is not mergeable in its current form.\n" % pr_num + \ - "Continue? (experts only!)" + msg = ("Pull request %s is not mergeable in its current form.\n" + % pr_num + "Continue? (experts only!)") continue_maybe(msg) -print ("\n=== Pull Request #%s ===" % pr_num) -print ("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" % ( - title, pr_repo_desc, target_ref, url)) +print("\n=== Pull Request #%s ===" % pr_num) +print("title\t%s\nsource\t%s\ntarget\t%s\nurl\t%s" + % (title, pr_repo_desc, target_ref, url)) continue_maybe("Proceed with merging pull request #%s?" % pr_num) merged_refs = [target_ref] merge_hash = merge_pr(pr_num, target_ref) -pick_prompt = "Would you like to pick %s into another branch?" % merge_hash -while raw_input("\n%s (y/n): " % pick_prompt).lower() == "y": - merged_refs = merged_refs + [cherry_pick(pr_num, merge_hash, latest_branch)] - if JIRA_IMPORTED: continue_maybe("Would you like to update the associated JIRA?") - jira_comment = "Issue resolved by pull request %s\n[%s/%s]" % (pr_num, GITHUB_BASE, pr_num) + jira_comment = ("Issue resolved by pull request %s\n[%s/%s]" + % (pr_num, GITHUB_BASE, pr_num)) resolve_jira(title, merged_refs, jira_comment) else: - print "Could not find jira-python library. Run 'sudo pip install jira-python' to install." - print "Exiting without trying to close the associated JIRA." + print("Could not find jira-python library. " + "Run 'sudo pip install jira-python' to install.") + print("Exiting without trying to close the associated JIRA.") From d946e7917d55cb220becd6469ae93430f2e60764 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 29 Oct 2016 04:36:03 -0400 Subject: [PATCH 183/210] ARROW-354: Fix comparison of arrays of empty strings Author: Uwe L. Korn Closes #189 from xhochy/ARROW-354 and squashes the following commits: 8f75d78 [Uwe L. 
Korn] ARROW-354: Fix comparison of arrays of empty strings --- cpp/src/arrow/types/string-test.cc | 12 ++++++++++++ cpp/src/arrow/types/string.cc | 2 ++ 2 files changed, 14 insertions(+) diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index d897e30a3c6..af87a14a8b3 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -129,6 +129,18 @@ TEST_F(TestStringContainer, TestGetString) { } } +TEST_F(TestStringContainer, TestEmptyStringComparison) { + offsets_ = {0, 0, 0, 0, 0, 0}; + offsets_buf_ = test::to_buffer(offsets_); + length_ = offsets_.size() - 1; + + auto strings_a = std::make_shared( + length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + auto strings_b = std::make_shared( + length_, offsets_buf_, nullptr, null_count_, null_bitmap_); + ASSERT_TRUE(strings_a->Equals(strings_b)); +} + // ---------------------------------------------------------------------- // String builder tests diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index d692e13773f..f6d26df3167 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -56,6 +56,8 @@ bool BinaryArray::EqualsExact(const BinaryArray& other) const { offset_buffer_->Equals(*other.offset_buffer_, (length_ + 1) * sizeof(int32_t)); if (!equal_offsets) { return false; } + if (!data_buffer_ && !(other.data_buffer_)) { return true; } + return data_buffer_->Equals(*other.data_buffer_, data_buffer_->size()); } From 772bc6ea6e5d452ccff1df8d5e83299e434c0d04 Mon Sep 17 00:00:00 2001 From: Peter Hoffmann Date: Sun, 30 Oct 2016 11:11:28 +0100 Subject: [PATCH 184/210] ARROW-349: Add six as a requirement fixes https://issues.apache.org/jira/browse/ARROW-349 Author: Peter Hoffmann Closes #184 from hoffmann/patch-1 and squashes the following commits: 1bffc69 [Peter Hoffmann] Add six as a requirement --- python/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index 99049777514..cdfdc243e25 100644 --- a/python/setup.py +++ b/python/setup.py @@ -271,7 +271,7 @@ def get_outputs(self): 'clean': clean, 'build_ext': build_ext }, - install_requires=['cython >= 0.23', 'numpy >= 1.9'], + install_requires=['cython >= 0.23', 'numpy >= 1.9', 'six >= 1.0.0'], description=DESC, license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", From ca088dd19eb4283c71252de39782d811f985649a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 31 Oct 2016 21:16:29 -0400 Subject: [PATCH 185/210] ARROW-339: [Dev] Lingering Python 3 fixes I missed a couple Python 3 things. I'll leave this open until one of us successfully merged another patch with this before we merge it. 
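The pattern the script now follows (via the `six` shim that ARROW-349 added as a
dependency) is roughly the one sketched below. This is an illustrative
condensation of the diff, not new behaviour; `continue_maybe` is simplified
(the real script routes through `fail()` and `clean_up()`):

```python
# Condensed from the patch: py2/py3-portable input, subprocess output
# handling, and string-type checks via six.
from __future__ import print_function

import subprocess

import six
from six.moves import input  # raw_input on py2, builtin input on py3


def run_cmd(cmd):
    if isinstance(cmd, six.string_types):
        cmd = cmd.split(' ')
    output = subprocess.check_output(cmd)
    if isinstance(output, six.binary_type):
        # py3 returns bytes; decode so callers always see str
        output = output.decode('utf-8')
    return output


def continue_maybe(prompt):
    result = input("\n%s (y/n): " % prompt)
    if result.lower() != "y":
        # simplified; the script calls fail(), which also cleans up branches
        raise SystemExit("Okay, exiting")
```
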
Author: Wes McKinney Closes #191 from wesm/ARROW-339-2 and squashes the following commits: 78bf094 [Wes McKinney] Lingering Python 3 fixes --- dev/merge_arrow_pr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dev/merge_arrow_pr.py b/dev/merge_arrow_pr.py index aa899edd62c..f7e7a37c36e 100755 --- a/dev/merge_arrow_pr.py +++ b/dev/merge_arrow_pr.py @@ -210,7 +210,7 @@ def fix_version_from_branch(branch, versions): def exctract_jira_id(title): m = re.search(r'^(ARROW-[0-9]+)\b.*$', title) - if m and m.groups > 0: + if m: return m.group(1) else: fail("PR title should be prefixed by a jira id " @@ -256,8 +256,8 @@ def resolve_jira(title, merge_branches, comment): print("summary\t\t%s\nassignee\t%s\nstatus\t\t%s\nurl\t\t%s/%sf\n" % (cur_summary, cur_assignee, cur_status, JIRA_BASE, jira_id)) - resolve = filter(lambda a: a['name'] == "Resolve Issue", - asf_jira.transitions(jira_id))[0] + resolve = [x for x in asf_jira.transitions(jira_id) + if x['name'] == "Resolve Issue"][0] asf_jira.transition_issue(jira_id, resolve["id"], comment=comment) print("Succesfully resolved %s!" % (jira_id)) From d4148759a266d90dacd1ca2b7b7ff0df7e02578a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 1 Nov 2016 14:21:07 -0400 Subject: [PATCH 186/210] ARROW-348: [Python] Add build-type command line option to setup.py, build CMake extensions in a build type subdirectory This also resolves ARROW-230. Author: Wes McKinney Closes #187 from wesm/ARROW-348 and squashes the following commits: 3cdaeaf [Wes McKinney] Cast build_type to lowercase in case env variable is uppercase 74bfa71 [Wes McKinney] Pull default build type from environment variable d0b3154 [Wes McKinney] Tweak readme 6017948 [Wes McKinney] Add built-type command line option to setup.py, build extensions in release type subdirectory to avoid conflicts with setuptools --- python/CMakeLists.txt | 3 +-- python/README.md | 9 +++++++++ python/setup.py | 34 ++++++++++++++++------------------ 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b8be8665af0..179f02fbc9d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -203,8 +203,7 @@ if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR}) EXECUTE_PROCESS(COMMAND ln ${MORE_ARGS} -sf ${BUILD_OUTPUT_ROOT_DIRECTORY} ${CMAKE_CURRENT_BINARY_DIR}/build/latest) else() - set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}") - # set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/") + set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}/") endif() # where to put generated archives (.a files) diff --git a/python/README.md b/python/README.md index e11f6456455..2a3e1ba9542 100644 --- a/python/README.md +++ b/python/README.md @@ -48,6 +48,15 @@ python setup.py build_ext --inplace py.test pyarrow ``` +To change the build type, use the `--build-type` option: + +```bash +python setup.py build_ext --build-type=release --inplace +``` + +To pass through other build options to CMake, set the environment variable +`$PYARROW_CMAKE_OPTIONS`. + #### Build the documentation ```bash diff --git a/python/setup.py b/python/setup.py index cdfdc243e25..b3012e69424 100644 --- a/python/setup.py +++ b/python/setup.py @@ -39,14 +39,6 @@ # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 -# Check if this is a debug build of Python. 
-# if hasattr(sys, 'gettotalrefcount'): -# build_type = 'Debug' -# else: -# build_type = 'Release' - -build_type = 'Debug' - if Cython.__version__ < '0.19.1': raise Exception('Please upgrade to Cython 0.19.1 or newer') @@ -104,13 +96,14 @@ def run(self): # github.com/libdynd/dynd-python description = "Build the C-extensions for arrow" - user_options = ([('extra-cmake-args=', None, - 'extra arguments for CMake')] + - _build_ext.user_options) + user_options = ([('extra-cmake-args=', None, 'extra arguments for CMake'), + ('build-type=', None, 'build type (debug or release)')] + + _build_ext.user_options) def initialize_options(self): _build_ext.initialize_options(self) self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') + self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower() CYTHON_MODULE_NAMES = [ 'array', @@ -152,9 +145,12 @@ def _run_cmake(self): static_lib_option = '' build_tests_option = '' + build_type_option = '-DCMAKE_BUILD_TYPE={0}'.format(self.build_type) + if sys.platform != 'win32': cmake_command = ['cmake', self.extra_cmake_args, pyexe_option, build_tests_option, + build_type_option, static_lib_option, source] self.spawn(cmake_command) @@ -170,7 +166,8 @@ def _run_cmake(self): # Generate the build files extra_cmake_args = shlex.split(self.extra_cmake_args) cmake_command = (['cmake'] + extra_cmake_args + - [source, pyexe_option, + [source, + pyexe_option, static_lib_option, build_tests_option, '-G', cmake_generator]) @@ -179,7 +176,7 @@ def _run_cmake(self): self.spawn(cmake_command) # Do the build - self.spawn(['cmake', '--build', '.', '--config', build_type]) + self.spawn(['cmake', '--build', '.', '--config', self.build_type]) if self.inplace: # a bit hacky @@ -188,14 +185,15 @@ def _run_cmake(self): # Move the built libpyarrow library to the place expected by the Python # build if sys.platform != 'win32': - name, = glob.glob('libpyarrow.*') + name, = glob.glob(pjoin(self.build_type, 'libpyarrow.*')) try: os.makedirs(pjoin(build_lib, 'pyarrow')) except OSError: pass - shutil.move(name, pjoin(build_lib, 'pyarrow', name)) + shutil.move(name, + pjoin(build_lib, 'pyarrow', os.path.split(name)[1])) else: - shutil.move(pjoin(build_type, 'pyarrow.dll'), + shutil.move(pjoin(self.build_type, 'pyarrow.dll'), pjoin(build_lib, 'pyarrow', 'pyarrow.dll')) # Move the built C-extension to the place expected by the Python build @@ -239,10 +237,10 @@ def get_ext_built(self, name): if sys.platform == 'win32': head, tail = os.path.split(name) suffix = sysconfig.get_config_var('SO') - return pjoin(head, build_type, tail + suffix) + return pjoin(head, self.build_type, tail + suffix) else: suffix = sysconfig.get_config_var('SO') - return name + suffix + return pjoin(self.build_type, name + suffix) def get_names(self): return self._found_names From c7db80e729c4b3e984c3ef5630ccbff43f3042b8 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 1 Nov 2016 14:25:01 -0400 Subject: [PATCH 187/210] ARROW-355: Add tests for serialising arrays of empty strings to Parquet Depends on https://issues.apache.org/jira/browse/PARQUET-759 Author: Uwe L. Korn Closes #190 from xhochy/ARROW-355 and squashes the following commits: e5099ce [Uwe L. 
Korn] ARROW-355: Add tests for serialising arrays of empty strings to Parquet --- python/pyarrow/tests/test_parquet.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 0f9f2e40813..922ad3aa9ff 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -73,7 +73,8 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): 'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'), 'str': [str(x) for x in range(size)], - 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None] + 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], + 'empty_str': [''] * size }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True) @@ -98,7 +99,10 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): 'int64': np.arange(size, dtype=np.int64), 'float32': np.arange(size, dtype=np.float32), 'float64': np.arange(size, dtype=np.float64), - 'bool': np.random.randn(size) > 0 + 'bool': np.random.randn(size) > 0, + 'str': [str(x) for x in range(size)], + 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], + 'empty_str': [''] * size }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df) From e70d97dbc8dc86161083e94c45d5828f79211f6b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 2 Nov 2016 08:06:29 +0100 Subject: [PATCH 188/210] ARROW-358: Add explicit environment variable to locate libhdfs in one's environment Author: Wes McKinney Closes #195 from wesm/ARROW-358 and squashes the following commits: c00d251 [Wes McKinney] Add explicit environment variable to locate libhdfs in one's environment --- cpp/src/arrow/io/libhdfs_shim.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index 07eb6250bbe..1fee595d071 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -386,6 +386,11 @@ static std::vector get_potential_libhdfs_paths() { search_paths.push_back(path); } + const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR"); + if (libhdfs_dir != nullptr) { + search_paths.push_back(fs::path(libhdfs_dir)); + } + // All paths with file name for (auto& path : search_paths) { libhdfs_potential_paths.push_back(path / file_name); From 2a059bd277c58bca80412cbda258a253b801d1a4 Mon Sep 17 00:00:00 2001 From: "Christopher C. Aycock" Date: Wed, 2 Nov 2016 12:15:53 -0400 Subject: [PATCH 189/210] ARROW-359: Document ARROW_LIBHDFS_DIR Author: Christopher C. Aycock Closes #196 from chrisaycock/ARROW-359 and squashes the following commits: 52ec78e [Christopher C. Aycock] ARROW-359: Document ARROW_LIBHDFS_DIR --- cpp/doc/HDFS.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/doc/HDFS.md b/cpp/doc/HDFS.md index 6b1bb8c4524..180d31e54d5 100644 --- a/cpp/doc/HDFS.md +++ b/cpp/doc/HDFS.md @@ -33,16 +33,18 @@ interface to the Java Hadoop client. This library is loaded **at runtime** (rather than at link / library load time, since the library may not be in your LD_LIBRARY_PATH), and relies on some environment variables. -* `HADOOP_HOME`: the root of your installed Hadoop distribution. Check in the - `lib/native` directory to look for `libhdfs.so` if you have any questions - about which directory you're after. 
-* `JAVA_HOME`: the location of your Java SDK installation +* `HADOOP_HOME`: the root of your installed Hadoop distribution. Often has +`lib/native/libhdfs.so`. +* `JAVA_HOME`: the location of your Java SDK installation. * `CLASSPATH`: must contain the Hadoop jars. You can set these using: ```shell export CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath --glob` ``` +* `ARROW_LIBHDFS_DIR` (optional): explicit location of `libhdfs.so` if it is +installed somewhere other than `$HADOOP_HOME/lib/native`. + ### Mac Specifics The installed location of Java on OS X can vary, however the following snippet From 17c9ae7c4ceb328c897fb6c9025c763a879ebefa Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 2 Nov 2016 12:20:15 -0400 Subject: [PATCH 190/210] ARROW-357: Use a single RowGroup for Parquet files as default. This is not the optimal choice, we should rather have an option to optimise for the underlying block size of the filesystem but without the infrastructure for that in ``parquet-cpp``, writing a single RowGroup is the much better choice. Author: Uwe L. Korn Closes #192 from xhochy/ARROW-357 and squashes the following commits: 9eccefd [Uwe L. Korn] ARROW-357: Use a single RowGroup for Parquet files as default. --- python/pyarrow/parquet.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 019dd2c1de4..a56c1e1456d 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -106,7 +106,8 @@ def write_table(table, filename, chunk_size=None, version=None, table : pyarrow.Table filename : string chunk_size : int - The maximum number of rows in each Parquet RowGroup + The maximum number of rows in each Parquet RowGroup. As a default, + we will write a single RowGroup per file. 
version : {"1.0", "2.0"}, default "1.0" The Parquet format version, defaults to 1.0 use_dictionary : bool or list @@ -121,7 +122,7 @@ def write_table(table, filename, chunk_size=None, version=None, cdef WriterProperties.Builder properties_builder cdef int64_t chunk_size_ = 0 if chunk_size is None: - chunk_size_ = min(ctable_.num_rows(), int(2**16)) + chunk_size_ = ctable_.num_rows() else: chunk_size_ = chunk_size From 25e010607542aa7330bd881e145180fe606776c5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 3 Nov 2016 13:22:19 -0400 Subject: [PATCH 191/210] ARROW-323: [Python] Opt-in to pyarrow.parquet extension rather than attempting and failing silently Added a couple ways to do this, either via the `--with-parquet` command line option (preferred) or by passing through an option to CMake Author: Wes McKinney Closes #194 from wesm/ARROW-323 and squashes the following commits: 07c05cc [Wes McKinney] Update readme to illustrate proper use of with build_ext 3bd9a8d [Wes McKinney] Add --with-parquet option to setup.py 374e254 [Wes McKinney] Add to README about building the parquet extension cab55cb [Wes McKinney] Opt in to building the pyarrow.parquet extension, do not silently fail --- python/CMakeLists.txt | 8 +++++++- python/README.md | 20 +++++++++++++++++++- python/setup.py | 38 ++++++++++++++++++++++++-------------- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 179f02fbc9d..6ad55f8c9a7 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -50,6 +50,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PYARROW_BUILD_TESTS "Build the PyArrow C++ googletest unit tests" OFF) + option(PYARROW_BUILD_PARQUET + "Build the PyArrow Parquet integration" + OFF) endif() find_program(CCACHE_FOUND ccache) @@ -445,7 +448,10 @@ set(LINK_LIBS arrow_ipc ) -if(PARQUET_FOUND AND PARQUET_ARROW_FOUND) +if (PYARROW_BUILD_PARQUET) + if(NOT (PARQUET_FOUND AND PARQUET_ARROW_FOUND)) + message(FATAL_ERROR "Unable to locate Parquet libraries") + endif() ADD_THIRDPARTY_LIB(parquet_arrow SHARED_LIB ${PARQUET_ARROW_SHARED_LIB}) set(LINK_LIBS diff --git a/python/README.md b/python/README.md index 2a3e1ba9542..4fce0d26b28 100644 --- a/python/README.md +++ b/python/README.md @@ -48,7 +48,8 @@ python setup.py build_ext --inplace py.test pyarrow ``` -To change the build type, use the `--build-type` option: +To change the build type, use the `--build-type` option or set +`$PYARROW_BUILD_TYPE`: ```bash python setup.py build_ext --build-type=release --inplace @@ -57,9 +58,26 @@ python setup.py build_ext --build-type=release --inplace To pass through other build options to CMake, set the environment variable `$PYARROW_CMAKE_OPTIONS`. +#### Build the pyarrow Parquet file extension + +To build the integration with [parquet-cpp][1], pass `--with-parquet` to +the `build_ext` option in setup.py: + +``` +python setup.py build_ext --with-parquet install +``` + +Alternately, add `-DPYARROW_BUILD_PARQUET=on` to the general CMake options. 
+ +``` +export PYARROW_CMAKE_OPTIONS=-DPYARROW_BUILD_PARQUET=on +``` + #### Build the documentation ```bash pip install -r doc/requirements.txt python setup.py build_sphinx ``` + +[1]: https://github.com/apache/parquet-cpp \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index b3012e69424..341cc64aa2c 100644 --- a/python/setup.py +++ b/python/setup.py @@ -97,13 +97,15 @@ def run(self): description = "Build the C-extensions for arrow" user_options = ([('extra-cmake-args=', None, 'extra arguments for CMake'), - ('build-type=', None, 'build type (debug or release)')] - + _build_ext.user_options) + ('build-type=', None, 'build type (debug or release)'), + ('with-parquet', None, 'build the Parquet extension')] + + _build_ext.user_options) def initialize_options(self): _build_ext.initialize_options(self) self.extra_cmake_args = os.environ.get('PYARROW_CMAKE_OPTIONS', '') self.build_type = os.environ.get('PYARROW_BUILD_TYPE', 'debug').lower() + self.with_parquet = False CYTHON_MODULE_NAMES = [ 'array', @@ -116,8 +118,6 @@ def initialize_options(self): 'schema', 'table'] - CYTHON_ALLOWED_FAILURES = ['parquet'] - def _run_cmake(self): # The directory containing this setup.py source = osp.dirname(osp.abspath(__file__)) @@ -141,17 +141,24 @@ def _run_cmake(self): if (cachedir != build_temp): return - pyexe_option = '-DPYTHON_EXECUTABLE=%s' % sys.executable static_lib_option = '' build_tests_option = '' - build_type_option = '-DCMAKE_BUILD_TYPE={0}'.format(self.build_type) + cmake_options = [ + '-DPYTHON_EXECUTABLE=%s' % sys.executable, + static_lib_option, + build_tests_option, + ] + + if self.with_parquet: + cmake_options.append('-DPYARROW_BUILD_PARQUET=on') if sys.platform != 'win32': - cmake_command = ['cmake', self.extra_cmake_args, pyexe_option, - build_tests_option, - build_type_option, - static_lib_option, source] + cmake_options.append('-DCMAKE_BUILD_TYPE={0}' + .format(self.build_type)) + + cmake_command = (['cmake', self.extra_cmake_args] + + cmake_options + [source]) self.spawn(cmake_command) args = ['make', 'VERBOSE=1'] @@ -166,10 +173,8 @@ def _run_cmake(self): # Generate the build files extra_cmake_args = shlex.split(self.extra_cmake_args) cmake_command = (['cmake'] + extra_cmake_args + + cmake_options + [source, - pyexe_option, - static_lib_option, - build_tests_option, '-G', cmake_generator]) if "-G" in self.extra_cmake_args: cmake_command = cmake_command[:-2] @@ -202,7 +207,7 @@ def _run_cmake(self): built_path = self.get_ext_built(name) if not os.path.exists(built_path): print(built_path) - if name in self.CYTHON_ALLOWED_FAILURES: + if self._failure_permitted(name): print('Cython module {0} failure permitted'.format(name)) continue raise RuntimeError('libpyarrow C-extension failed to build:', @@ -219,6 +224,11 @@ def _run_cmake(self): os.chdir(saved_cwd) + def _failure_permitted(self, name): + if name == 'parquet' and not self.with_parquet: + return True + return False + def _get_inplace_dir(self): pass From e8bc1fe3ba7f94b39f38571a435f93f387e67d37 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Sun, 6 Nov 2016 12:10:06 +0100 Subject: [PATCH 192/210] ARROW-368: Added note for LD_LIBRARY_PATH in Python README Added note to use LD_LIBRARY_PATH env var to add $ARROW_HOME/lib path so PyArrow can locate Arrow-Cpp shared libs. 
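One way to make the requirement explicit in a launcher script (purely
illustrative, not part of the patch; the env-var names are the ones the
README documents):

```python
# Illustrative guard only: warn early if $ARROW_HOME/lib is missing from
# LD_LIBRARY_PATH, since pyarrow's C-extension will otherwise fail to load
# the Arrow-cpp shared libraries at import time.
import os

arrow_home = os.environ.get('ARROW_HOME')
ld_path = os.environ.get('LD_LIBRARY_PATH', '').split(':')
if arrow_home and os.path.join(arrow_home, 'lib') not in ld_path:
    raise RuntimeError("add $ARROW_HOME/lib to LD_LIBRARY_PATH "
                       "before importing pyarrow")

import pyarrow  # noqa: E402
```
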
Author: Bryan Cutler Closes #199 from BryanCutler/pyarrow-README-note-LD_LIBRARY_PATH-ARROW-368 and squashes the following commits: 15861c4 [Bryan Cutler] Added note for LD_LIBRARY_PATH in Python README --- python/README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/README.md b/python/README.md index 4fce0d26b28..88ab17e7173 100644 --- a/python/README.md +++ b/python/README.md @@ -33,12 +33,19 @@ These are the various projects that PyArrow depends on. 1. **g++ and gcc Version >= 4.8** 2. **cmake > 2.8.6** 3. **boost** -4. **Arrow-cpp and its dependencies*** +4. **Arrow-cpp and its dependencies** The Arrow C++ library must be built with all options enabled and installed with ``ARROW_HOME`` environment variable set to the installation location. Look at (https://github.com/apache/arrow/blob/master/cpp/README.md) for instructions. +Ensure PyArrow can locate the Arrow-cpp shared libraries by setting the +LD_LIBRARY_PATH environment variable. + +```bash +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ARROW_HOME/lib +``` + 5. **Python dependencies: numpy, pandas, cython, pytest** #### Build pyarrow and run the unit tests From 121e82682344b04bdb26edf16344a9fb2cee240c Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 6 Nov 2016 16:08:44 -0500 Subject: [PATCH 193/210] ARROW-361: Python: Support reading a column-selection from Parquet files Author: Uwe L. Korn Closes #197 from xhochy/ARROW-361 and squashes the following commits: c1fb939 [Uwe L. Korn] Cache column indices 0c32213 [Uwe L. Korn] ARROW-361: Python: Support reading a column-selection from Parquet files --- python/pyarrow/includes/parquet.pxd | 25 ++++++++++--- python/pyarrow/parquet.pyx | 53 +++++++++++++++++++++++++++- python/pyarrow/tests/test_parquet.py | 16 +++++++++ 3 files changed, 89 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index 754eeccecc8..57c35ba8944 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -18,7 +18,7 @@ # distutils: language = c++ from pyarrow.includes.common cimport * -from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool +from pyarrow.includes.libarrow cimport CArray, CSchema, CStatus, CTable, MemoryPool from pyarrow.includes.libarrow_io cimport ReadableFileInterface @@ -32,6 +32,9 @@ cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: cdef cppclass PrimitiveNode(Node): pass + cdef cppclass ColumnPath: + c_string ToDotString() + cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: enum ParquetVersion" parquet::ParquetVersion::type": PARQUET_1_0" parquet::ParquetVersion::PARQUET_1_0" @@ -44,13 +47,14 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: LZO" parquet::Compression::LZO" BROTLI" parquet::Compression::BROTLI" + cdef cppclass ColumnDescriptor: + shared_ptr[ColumnPath] path() + cdef cppclass SchemaDescriptor: + const ColumnDescriptor* Column(int i) shared_ptr[Node] schema() GroupNode* group() - cdef cppclass ColumnDescriptor: - pass - cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -80,10 +84,21 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass RowGroupReader: pass + cdef cppclass FileMetaData: + uint32_t size() + int num_columns() + int64_t num_rows() + int num_row_groups() + int32_t version() + const c_string created_by() + int num_schema_elements() + const SchemaDescriptor* schema() + cdef 
cppclass ParquetFileReader: # TODO: Some default arguments are missing @staticmethod unique_ptr[ParquetFileReader] OpenFile(const c_string& path) + const FileMetaData* metadata(); cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: @@ -124,7 +139,9 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: cdef cppclass FileReader: FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) + CStatus ReadFlatColumn(int i, shared_ptr[CArray]* out); CStatus ReadFlatTable(shared_ptr[CTable]* out); + const ParquetFileReader* parquet_reader(); cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index a56c1e1456d..2152f894741 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -24,6 +24,7 @@ from pyarrow.includes.parquet cimport * from pyarrow.includes.libarrow_io cimport ReadableFileInterface cimport pyarrow.includes.pyarrow as pyarrow +from pyarrow.array cimport Array from pyarrow.compat import tobytes from pyarrow.error import ArrowException from pyarrow.error cimport check_status @@ -43,6 +44,7 @@ cdef class ParquetReader: cdef: ParquetAllocator allocator unique_ptr[FileReader] reader + column_idx_map def __cinit__(self): self.allocator.set_pool(default_memory_pool()) @@ -76,11 +78,55 @@ cdef class ParquetReader: table.init(ctable) return table + def column_name_idx(self, column_name): + """ + Find the matching index of a column in the schema. + + Parameter + --------- + column_name: str + Name of the column, separation of nesting levels is done via ".". + + Returns + ------- + column_idx: int + Integer index of the position of the column + """ + cdef: + const FileMetaData* metadata = self.reader.get().parquet_reader().metadata() + int i = 0 + + if self.column_idx_map is None: + self.column_idx_map = {} + for i in range(0, metadata.num_columns()): + self.column_idx_map[str(metadata.schema().Column(i).path().get().ToDotString())] = i + + return self.column_idx_map[column_name] + + def read_column(self, int column_index): + cdef: + Array array = Array() + shared_ptr[CArray] carray + + with nogil: + check_status(self.reader.get().ReadFlatColumn(column_index, &carray)) + + array.init(carray) + return array + def read_table(source, columns=None): """ Read a Table from Parquet format + Parameters + ---------- + source: str or pyarrow.io.NativeFile + Readable source. For passing Python file objects or byte buffers, see + pyarrow.io.PythonFileInterface or pyarrow.io.BytesReader. + columns: list + If not None, only these columns will be read from the file. 
+ Returns ------- pyarrow.table.Table @@ -93,7 +139,12 @@ def read_table(source, columns=None): elif isinstance(source, NativeFile): reader.open_native_file(source) - return reader.read_all() + if columns is None: + return reader.read_all() + else: + column_idxs = [reader.column_name_idx(column) for column in columns] + arrays = [reader.read_column(column_idx) for column_idx in column_idxs] + return Table.from_arrays(columns, arrays) def write_table(table, filename, chunk_size=None, version=None, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 922ad3aa9ff..c1d44ce0d42 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -115,6 +115,22 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): pdt.assert_frame_equal(df, df_read) +@parquet +def test_pandas_column_selection(tmpdir): + size = 10000 + np.random.seed(0) + df = pd.DataFrame({ + 'uint8': np.arange(size, dtype=np.uint8), + 'uint16': np.arange(size, dtype=np.uint16) + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + A.parquet.write_table(arrow_table, filename.strpath) + table_read = pq.read_table(filename.strpath, columns=['uint8']) + df_read = table_read.to_pandas() + + pdt.assert_frame_equal(df[['uint8']], df_read) + @parquet def test_pandas_parquet_configuration_options(tmpdir): size = 10000 From 79344b335849c2eb43954b0751018051814019d6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 7 Nov 2016 13:52:32 -0500 Subject: [PATCH 194/210] ARROW-362: Fix memory leak in zero-copy arrow to NumPy/pandas conversion close #198 Author: Wes McKinney Author: Uwe L. Korn Closes #200 from wesm/ARROW-362 and squashes the following commits: 99df96b [Wes McKinney] Force gc to avoid non-deterministic failure d85228f [Wes McKinney] Be more careful about reference counts in zero-copy handoff, add pyarrow.Array.to_pandas method cc7a6b3 [Uwe L. 
Korn] ARROW-362: Remove redunant reference count --- python/pyarrow/array.pyx | 21 ++++++++++++++ python/pyarrow/includes/common.pxd | 7 +++++ python/pyarrow/includes/pyarrow.pxd | 4 +-- python/pyarrow/table.pyx | 18 ++++++++---- python/pyarrow/tests/test_array.py | 29 ++++++++++++++++++++ python/pyarrow/tests/test_convert_builtin.py | 4 +++ python/src/pyarrow/adapters/pandas.cc | 4 +-- 7 files changed, 76 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 84ab4a48c9b..fbe4e387906 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -22,6 +22,7 @@ import numpy as np from pyarrow.includes.libarrow cimport * +from pyarrow.includes.common cimport PyObject_to_object cimport pyarrow.includes.pyarrow as pyarrow import pyarrow.config @@ -35,6 +36,8 @@ from pyarrow.scalar import NA from pyarrow.schema cimport Schema import pyarrow.schema as schema +cimport cpython + def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.get_memory_pool() @@ -111,6 +114,24 @@ cdef class Array: def slice(self, start, end): pass + def to_pandas(self): + """ + Convert to an array object suitable for use in pandas + + See also + -------- + Column.to_pandas + Table.to_pandas + RecordBatch.to_pandas + """ + cdef: + PyObject* np_arr + + check_status(pyarrow.ConvertArrayToPandas( + self.sp_array, self, &np_arr)) + + return PyObject_to_object(np_arr) + cdef class NullArray(Array): pass diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 05c0123ee7b..f689bdc3fd8 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -47,3 +47,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_bool IsKeyError() c_bool IsNotImplemented() c_bool IsInvalid() + + +cdef inline object PyObject_to_object(PyObject* o): + # Cast to "object" increments reference count + cdef object result = o + cpython.Py_DECREF(result) + return result diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index e1da1914c57..a5444c236bc 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -34,10 +34,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: shared_ptr[CArray]* out) CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr, - object py_ref, PyObject** out) + PyObject* py_ref, PyObject** out) CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr, - object py_ref, PyObject** out) + PyObject* py_ref, PyObject** out) MemoryPool* get_memory_pool() diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 969571262ca..c71bc712bff 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -22,6 +22,7 @@ from cython.operator cimport dereference as deref from pyarrow.includes.libarrow cimport * +from pyarrow.includes.common cimport PyObject_to_object cimport pyarrow.includes.pyarrow as pyarrow import pyarrow.config @@ -32,6 +33,7 @@ from pyarrow.schema cimport box_data_type, box_schema from pyarrow.compat import frombytes, tobytes +cimport cpython cdef class ChunkedArray: ''' @@ -100,8 +102,10 @@ cdef class Column: import pandas as pd - check_status(pyarrow.ConvertColumnToPandas(self.sp_column, self, &arr)) - return pd.Series(arr, name=self.name) + check_status(pyarrow.ConvertColumnToPandas(self.sp_column, + self, &arr)) + + return pd.Series(PyObject_to_object(arr), name=self.name) cdef _check_nullptr(self): if self.column == NULL: @@ -248,9 +252,10 @@ cdef class RecordBatch: data = [] 
for i in range(self.batch.num_columns()): arr = self.batch.column(i) - check_status(pyarrow.ConvertArrayToPandas(arr, self, &np_arr)) + check_status(pyarrow.ConvertArrayToPandas(arr, self, + &np_arr)) names.append(frombytes(self.batch.column_name(i))) - data.append( np_arr) + data.append(PyObject_to_object(np_arr)) return pd.DataFrame(dict(zip(names, data)), columns=names) @@ -375,9 +380,10 @@ cdef class Table: for i in range(self.table.num_columns()): col = self.table.column(i) column = self.column(i) - check_status(pyarrow.ConvertColumnToPandas(col, column, &arr)) + check_status(pyarrow.ConvertColumnToPandas( + col, column, &arr)) names.append(frombytes(col.get().name())) - data.append( arr) + data.append(PyObject_to_object(arr)) return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 0a17f691ccd..ead17dbec4e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +import sys + import pyarrow import pyarrow.formatting as fmt @@ -71,3 +73,30 @@ def test_long_array_format(): 99 ]""" assert result == expected + + +def test_to_pandas_zero_copy(): + import gc + + arr = pyarrow.from_pylist(range(10)) + + for i in range(10): + np_arr = arr.to_pandas() + assert sys.getrefcount(np_arr) == 2 + np_arr = None # noqa + + assert sys.getrefcount(arr) == 2 + + for i in range(10): + arr = pyarrow.from_pylist(range(10)) + np_arr = arr.to_pandas() + arr = None + gc.collect() + + # Ensure base is still valid + + # Because of py.test's assert inspection magic, if you put getrefcount + # on the line being examined, it will be 1 higher than you expect + base_refcount = sys.getrefcount(np_arr.base) + assert base_refcount == 2 + np_arr.sum() diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 2beb6b39d73..8937f8db694 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -47,6 +47,10 @@ def test_integer(self): def test_garbage_collection(self): import gc + + # Force the cyclic garbage collector to run + gc.collect() + bytes_before = pyarrow.total_allocated_bytes() pyarrow.from_pylist([1, None, 3, None]) gc.collect() diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 7e70be75da5..6a3966b7488 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -628,8 +628,6 @@ class ArrowDeserializer { PyAcquireGIL lock; // Zero-Copy. We can pass the data pointer directly to NumPy. - Py_INCREF(py_ref_); - OwnedRef py_ref(py_ref_); npy_intp dims[1] = {col_->length()}; out_ = reinterpret_cast(PyArray_SimpleNewFromData(1, dims, type, data)); @@ -646,7 +644,7 @@ class ArrowDeserializer { return Status::OK(); } else { // PyArray_SetBaseObject steals our reference to py_ref_ - py_ref.release(); + Py_INCREF(py_ref_); } // Arrow data is immutable. 
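A minimal repro of the contract ARROW-362 establishes, condensed from the new
`test_to_pandas_zero_copy` test above (`pyarrow.from_pylist` is the
construction API at this point in the history):

```python
import gc
import sys

import pyarrow

arr = pyarrow.from_pylist(list(range(10)))
np_arr = arr.to_pandas()  # zero-copy: shares the Arrow buffer

arr = None
gc.collect()

# The NumPy array keeps its Arrow base alive; per the test, the base's
# refcount is exactly 2 (the base attribute plus getrefcount's argument).
assert sys.getrefcount(np_arr.base) == 2
np_arr.sum()  # the shared memory is still valid
```
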
From 6996c17f70dc13659c37dfaa39bc28e7777ca6a6 Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Tue, 8 Nov 2016 13:29:34 -0500 Subject: [PATCH 195/210] ARROW-312: [Java] IPC file round trip tool for integration testing Author: Julien Le Dem Author: Wes McKinney Closes #186 from wesm/roundtrip-tool and squashes the following commits: aee552a [Julien Le Dem] missing file 9d5c078 [Julien Le Dem] fix read-write bug 7f20b36 [Julien Le Dem] simple roundtrip a04091f [Wes McKinney] Drafting file round trip helper executable --- .../main/java/io/netty/buffer/ArrowBuf.java | 7 +- .../arrow/memory/TestBaseAllocator.java | 24 ++- java/pom.xml | 1 + java/tools/pom.xml | 73 ++++++++ .../org/apache/arrow/tools/FileRoundtrip.java | 135 +++++++++++++++ .../apache/arrow/tools/TestFileRoundtrip.java | 159 ++++++++++++++++++ java/vector/pom.xml | 32 ++-- .../templates/NullableValueVectors.java | 2 +- .../org/apache/arrow/vector/VectorLoader.java | 21 +-- .../apache/arrow/vector/VectorSchemaRoot.java | 140 +++++++++++++++ .../apache/arrow/vector/VectorUnloader.java | 13 +- .../arrow/vector/schema/ArrowBuffer.java | 6 + .../arrow/vector/schema/ArrowRecordBatch.java | 8 + .../arrow/vector/TestVectorUnloadLoad.java | 42 +++-- .../arrow/vector/file/TestArrowFile.java | 149 ++++++++-------- 15 files changed, 681 insertions(+), 131 deletions(-) create mode 100644 java/tools/pom.xml create mode 100644 java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java create mode 100644 java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java diff --git a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java index a5989c1518d..95d2be5a43a 100644 --- a/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java +++ b/java/memory/src/main/java/io/netty/buffer/ArrowBuf.java @@ -179,7 +179,10 @@ public ArrowBuf retain(BufferAllocator target) { historicalLog.recordEvent("retain(%s)", target.getName()); } final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target); - return otherLedger.newArrowBuf(offset, length, null); + ArrowBuf newArrowBuf = otherLedger.newArrowBuf(offset, length, null); + newArrowBuf.readerIndex(this.readerIndex); + newArrowBuf.writerIndex(this.writerIndex); + return newArrowBuf; } /** @@ -214,6 +217,8 @@ public TransferResult transferOwnership(BufferAllocator target) { final BufferLedger otherLedger = this.ledger.getLedgerForAllocator(target); final ArrowBuf newBuf = otherLedger.newArrowBuf(offset, length, null); + newBuf.readerIndex(this.readerIndex); + newBuf.writerIndex(this.writerIndex); final boolean allocationFit = this.ledger.transferBalance(otherLedger); return new TransferResult(allocationFit, newBuf); } diff --git a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java index aa6b70c5c74..3c96d57f4e6 100644 --- a/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java +++ b/java/memory/src/test/java/org/apache/arrow/memory/TestBaseAllocator.java @@ -22,16 +22,13 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -import io.netty.buffer.ArrowBuf; -import io.netty.buffer.ArrowBuf.TransferResult; -import org.apache.arrow.memory.AllocationReservation; -import org.apache.arrow.memory.BufferAllocator; -import 
org.apache.arrow.memory.OutOfMemoryException; -import org.apache.arrow.memory.RootAllocator; import org.junit.Ignore; import org.junit.Test; +import io.netty.buffer.ArrowBuf; +import io.netty.buffer.ArrowBuf.TransferResult; + public class TestBaseAllocator { // private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TestBaseAllocator.class); @@ -134,6 +131,7 @@ public void testAllocator_transferOwnership() throws Exception { final ArrowBuf arrowBuf1 = childAllocator1.buffer(MAX_ALLOCATION / 4); rootAllocator.verify(); TransferResult transferOwnership = arrowBuf1.transferOwnership(childAllocator2); + assertEquiv(arrowBuf1, transferOwnership.buffer); final boolean allocationFit = transferOwnership.allocationFit; rootAllocator.verify(); assertTrue(allocationFit); @@ -160,6 +158,7 @@ public void testAllocator_shareOwnership() throws Exception { rootAllocator.verify(); assertNotNull(arrowBuf2); assertNotEquals(arrowBuf2, arrowBuf1); + assertEquiv(arrowBuf1, arrowBuf2); // release original buffer (thus transferring ownership to allocator 2. (should leave allocator 1 in empty state) arrowBuf1.release(); @@ -172,6 +171,7 @@ public void testAllocator_shareOwnership() throws Exception { assertNotNull(arrowBuf3); assertNotEquals(arrowBuf3, arrowBuf1); assertNotEquals(arrowBuf3, arrowBuf2); + assertEquiv(arrowBuf1, arrowBuf3); rootAllocator.verify(); arrowBuf2.release(); @@ -452,8 +452,10 @@ public void testAllocator_transferSliced() throws Exception { rootAllocator.verify(); TransferResult result1 = arrowBuf2s.transferOwnership(childAllocator1); + assertEquiv(arrowBuf2s, result1.buffer); rootAllocator.verify(); TransferResult result2 = arrowBuf1s.transferOwnership(childAllocator2); + assertEquiv(arrowBuf1s, result2.buffer); rootAllocator.verify(); result1.buffer.release(); @@ -482,7 +484,9 @@ public void testAllocator_shareSliced() throws Exception { rootAllocator.verify(); final ArrowBuf arrowBuf2s1 = arrowBuf2s.retain(childAllocator1); + assertEquiv(arrowBuf2s, arrowBuf2s1); final ArrowBuf arrowBuf1s2 = arrowBuf1s.retain(childAllocator2); + assertEquiv(arrowBuf1s, arrowBuf1s2); rootAllocator.verify(); arrowBuf1s.release(); // releases arrowBuf1 @@ -512,11 +516,13 @@ public void testAllocator_transferShared() throws Exception { rootAllocator.verify(); assertNotNull(arrowBuf2); assertNotEquals(arrowBuf2, arrowBuf1); + assertEquiv(arrowBuf1, arrowBuf2); TransferResult result = arrowBuf1.transferOwnership(childAllocator3); allocationFit = result.allocationFit; final ArrowBuf arrowBuf3 = result.buffer; assertTrue(allocationFit); + assertEquiv(arrowBuf1, arrowBuf3); rootAllocator.verify(); // Since childAllocator3 now has childAllocator1's buffer, 1, can close @@ -533,6 +539,7 @@ public void testAllocator_transferShared() throws Exception { allocationFit = result.allocationFit; final ArrowBuf arrowBuf4 = result2.buffer; assertTrue(allocationFit); + assertEquiv(arrowBuf3, arrowBuf4); rootAllocator.verify(); arrowBuf3.release(); @@ -645,4 +652,9 @@ public void multiple() throws Exception { } } + + public void assertEquiv(ArrowBuf origBuf, ArrowBuf newBuf) { + assertEquals(origBuf.readerIndex(), newBuf.readerIndex()); + assertEquals(origBuf.writerIndex(), newBuf.writerIndex()); + } } diff --git a/java/pom.xml b/java/pom.xml index 0147de70357..7221a140d96 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -467,5 +467,6 @@ format memory vector + tools diff --git a/java/tools/pom.xml b/java/tools/pom.xml new file mode 100644 index 00000000000..84b0b5eb425 --- /dev/null +++ 
b/java/tools/pom.xml @@ -0,0 +1,73 @@ + + + + 4.0.0 + + org.apache.arrow + arrow-java-root + 0.1.1-SNAPSHOT + + arrow-tools + Arrow Tools + + + + org.apache.arrow + arrow-format + ${project.version} + + + org.apache.arrow + arrow-memory + ${project.version} + + + org.apache.arrow + arrow-vector + ${project.version} + + + org.apache.commons + commons-lang3 + 3.4 + + + commons-cli + commons-cli + 1.2 + + + + + + + maven-assembly-plugin + 2.6 + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + + + diff --git a/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java new file mode 100644 index 00000000000..db7a1c23f9c --- /dev/null +++ b/java/tools/src/main/java/org/apache/arrow/tools/FileRoundtrip.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.arrow.tools; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.PrintStream; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.file.ArrowBlock; +import org.apache.arrow.vector.file.ArrowFooter; +import org.apache.arrow.vector.file.ArrowReader; +import org.apache.arrow.vector.file.ArrowWriter; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FileRoundtrip { + private static final Logger LOGGER = LoggerFactory.getLogger(FileRoundtrip.class); + + public static void main(String[] args) { + System.exit(new FileRoundtrip(System.out, System.err).run(args)); + } + + private final Options options; + private final PrintStream out; + private final PrintStream err; + + FileRoundtrip(PrintStream out, PrintStream err) { + this.out = out; + this.err = err; + this.options = new Options(); + this.options.addOption("i", "in", true, "input file"); + this.options.addOption("o", "out", true, "output file"); + + } + + private File validateFile(String type, String fileName) { + if (fileName == null) { + throw new IllegalArgumentException("missing " + type + " file parameter"); + } + File f = new File(fileName); + if (!f.exists() || f.isDirectory()) { + throw new 
IllegalArgumentException(type + " file not found: " + f.getAbsolutePath()); + } + return f; + } + + int run(String[] args) { + try { + CommandLineParser parser = new PosixParser(); + CommandLine cmd = parser.parse(options, args, false); + + String inFileName = cmd.getOptionValue("in"); + String outFileName = cmd.getOptionValue("out"); + + File inFile = validateFile("input", inFileName); + File outFile = validateFile("output", outFileName); + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); // TODO: close + try( + FileInputStream fileInputStream = new FileInputStream(inFile); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), allocator);) { + + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + LOGGER.debug("Input file size: " + inFile.length()); + LOGGER.debug("Found schema: " + schema); + + try ( + FileOutputStream fileOutputStream = new FileOutputStream(outFile); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ) { + + // initialize vectors + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch inRecordBatch = arrowReader.readRecordBatch(rbBlock); + VectorSchemaRoot root = new VectorSchemaRoot(schema, allocator);) { + + VectorLoader vectorLoader = new VectorLoader(root); + vectorLoader.load(inRecordBatch); + + VectorUnloader vectorUnloader = new VectorUnloader(root); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + arrowWriter.writeRecordBatch(recordBatch); + } + } + } + LOGGER.debug("Output file size: " + outFile.length()); + } + } catch (ParseException e) { + return fatalError("Invalid parameters", e); + } catch (IOException e) { + return fatalError("Error accessing files", e); + } + return 0; + } + + private int fatalError(String message, Throwable e) { + err.println(message); + LOGGER.error(message, e); + return 1; + } + +} diff --git a/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java b/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java new file mode 100644 index 00000000000..339725e5af1 --- /dev/null +++ b/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java @@ -0,0 +1,159 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.arrow.tools; + +import static org.junit.Assert.assertEquals; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.file.ArrowBlock; +import org.apache.arrow.vector.file.ArrowFooter; +import org.apache.arrow.vector.file.ArrowReader; +import org.apache.arrow.vector.file.ArrowWriter; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +public class TestFileRoundtrip { + private static final int COUNT = 10; + + @Rule + public TemporaryFolder testFolder = new TemporaryFolder(); + + private BufferAllocator allocator; + + @Before + public void init() { + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void tearDown() { + allocator.close(); + } + + private void writeData(int count, MapVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + } + + @Test + public void test() throws Exception { + File testInFile = testFolder.newFile("testIn.arrow"); + File testOutFile = testFolder.newFile("testOut.arrow"); + + writeInput(testInFile); + + String[] args = { "-i", testInFile.getAbsolutePath(), "-o", testOutFile.getAbsolutePath()}; + int result = new FileRoundtrip(System.out, System.err).run(args); + assertEquals(0, result); + + validateOutput(testOutFile); + } + + private void validateOutput(File testOutFile) throws Exception { + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(testOutFile); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + ) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + + // initialize vectors + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, readerAllocator)) { + VectorLoader vectorLoader = new VectorLoader(root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + 
validateContent(COUNT, root); + } + } + } + } + + private void validateContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + for (int i = 0; i < count; i++) { + Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + } + } + + public void writeInput(File testInFile) throws FileNotFoundException, IOException { + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null)) { + writeData(count, parent); + write(parent.getChild("root"), testInFile); + } + } + + private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { + Schema schema = new Schema(parent.getField().getChildren()); + int valueCount = parent.getAccessor().getValueCount(); + List fields = parent.getChildrenFromFields(); + VectorUnloader vectorUnloader = new VectorUnloader(schema, valueCount, fields); + try ( + FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + ) { + arrowWriter.writeRecordBatch(recordBatch); + } + } + +} diff --git a/java/vector/pom.xml b/java/vector/pom.xml index 1d06bdece01..64b68bf8a15 100644 --- a/java/vector/pom.xml +++ b/java/vector/pom.xml @@ -1,13 +1,13 @@ - 4.0.0 @@ -56,8 +56,6 @@ commons-lang3 3.4 - - @@ -72,13 +70,13 @@ false - - + + - ${basedir}/src/main/codegen codegen @@ -129,7 +127,7 @@ - org.eclipse.m2e @@ -160,8 +158,8 @@ - - + + diff --git a/java/vector/src/main/codegen/templates/NullableValueVectors.java b/java/vector/src/main/codegen/templates/NullableValueVectors.java index bafa3176020..48af7a2bafe 100644 --- a/java/vector/src/main/codegen/templates/NullableValueVectors.java +++ b/java/vector/src/main/codegen/templates/NullableValueVectors.java @@ -145,7 +145,7 @@ public List getChildrenFromFields() { @Override public void loadFieldBuffers(ArrowFieldNode fieldNode, List ownBuffers) { org.apache.arrow.vector.BaseDataValueVector.load(getFieldInnerVectors(), ownBuffers); - // TODO: do something with the sizes in fieldNode? 
+ bits.valueCount = fieldNode.getLength(); } public List getFieldBuffers() { diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java index b7040da9d82..4afd82315d9 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorLoader.java @@ -27,7 +27,6 @@ import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.schema.VectorLayout; import org.apache.arrow.vector.types.pojo.Field; -import org.apache.arrow.vector.types.pojo.Schema; import com.google.common.collect.Iterators; @@ -37,22 +36,16 @@ * Loads buffers into vectors */ public class VectorLoader { - private final List fieldVectors; - private final List fields; + private final VectorSchemaRoot root; /** * will create children in root based on schema * @param schema the expected schema * @param root the root to add vectors to based on schema */ - public VectorLoader(Schema schema, FieldVector root) { + public VectorLoader(VectorSchemaRoot root) { super(); - this.fields = schema.getFields(); - root.initializeChildrenFromFields(fields); - this.fieldVectors = root.getChildrenFromFields(); - if (this.fieldVectors.size() != fields.size()) { - throw new IllegalArgumentException("The root vector did not create the right number of children. found " + fieldVectors.size() + " expected " + fields.size()); - } + this.root = root; } /** @@ -63,16 +56,19 @@ public VectorLoader(Schema schema, FieldVector root) { public void load(ArrowRecordBatch recordBatch) { Iterator buffers = recordBatch.getBuffers().iterator(); Iterator nodes = recordBatch.getNodes().iterator(); + List fields = root.getSchema().getFields(); for (int i = 0; i < fields.size(); ++i) { Field field = fields.get(i); - FieldVector fieldVector = fieldVectors.get(i); + FieldVector fieldVector = root.getVector(field.getName()); loadBuffers(fieldVector, field, buffers, nodes); } + root.setRowCount(recordBatch.getLength()); if (nodes.hasNext() || buffers.hasNext()) { throw new IllegalArgumentException("not all nodes and buffers where consumed. nodes: " + Iterators.toString(nodes) + " buffers: " + Iterators.toString(buffers)); } } + private void loadBuffers(FieldVector vector, Field field, Iterator buffers, Iterator nodes) { checkArgument(nodes.hasNext(), "no more field nodes for for field " + field + " and vector " + vector); @@ -85,7 +81,7 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf try { vector.loadFieldBuffers(fieldNode, ownBuffers); } catch (RuntimeException e) { - throw new IllegalArgumentException("Could not load buffers for field " + field); + throw new IllegalArgumentException("Could not load buffers for field " + field, e); } List children = field.getChildren(); if (children.size() > 0) { @@ -98,4 +94,5 @@ private void loadBuffers(FieldVector vector, Field field, Iterator buf } } } + } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java new file mode 100644 index 00000000000..1cbe18787ef --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorSchemaRoot.java @@ -0,0 +1,140 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.types.Types; +import org.apache.arrow.vector.types.Types.MinorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +public class VectorSchemaRoot implements AutoCloseable { + + private final Schema schema; + private int rowCount; + private final List fieldVectors; + private final Map fieldVectorsMap = new HashMap<>(); + + public VectorSchemaRoot(FieldVector parent) { + this.schema = new Schema(parent.getField().getChildren()); + this.rowCount = parent.getAccessor().getValueCount(); + this.fieldVectors = parent.getChildrenFromFields(); + for (int i = 0; i < schema.getFields().size(); ++i) { + Field field = schema.getFields().get(i); + FieldVector vector = fieldVectors.get(i); + fieldVectorsMap.put(field.getName(), vector); + } + } + + public VectorSchemaRoot(Schema schema, BufferAllocator allocator) { + super(); + this.schema = schema; + List fieldVectors = new ArrayList<>(); + for (Field field : schema.getFields()) { + MinorType minorType = Types.getMinorTypeForArrowType(field.getType()); + FieldVector vector = minorType.getNewVector(field.getName(), allocator, null); + vector.initializeChildrenFromFields(field.getChildren()); + fieldVectors.add(vector); + fieldVectorsMap.put(field.getName(), vector); + } + this.fieldVectors = Collections.unmodifiableList(fieldVectors); + if (this.fieldVectors.size() != schema.getFields().size()) { + throw new IllegalArgumentException("The root vector did not create the right number of children. 
found " + fieldVectors.size() + " expected " + schema.getFields().size()); + } + } + + public List getFieldVectors() { + return fieldVectors; + } + + public FieldVector getVector(String name) { + return fieldVectorsMap.get(name); + } + + public Schema getSchema() { + return schema; + } + + public int getRowCount() { + return rowCount; + } + + public void setRowCount(int rowCount) { + this.rowCount = rowCount; + } + + @Override + public void close() { + RuntimeException ex = null; + for (FieldVector fieldVector : fieldVectors) { + try { + fieldVector.close(); + } catch (RuntimeException e) { + ex = chain(ex, e); + } + } + if (ex!= null) { + throw ex; + } + } + + private RuntimeException chain(RuntimeException root, RuntimeException e) { + if (root == null) { + root = e; + } else { + root.addSuppressed(e); + } + return root; + } + + private void printRow(StringBuilder sb, List row) { + boolean first = true; + for (Object v : row) { + if (first) { + first = false; + } else { + sb.append("\t"); + } + sb.append(v); + } + sb.append("\n"); + } + + public String contentToTSVString() { + StringBuilder sb = new StringBuilder(); + List row = new ArrayList<>(schema.getFields().size()); + for (Field field : schema.getFields()) { + row.add(field.getName()); + } + printRow(sb, row); + for (int i = 0; i < rowCount; i++) { + row.clear(); + for (FieldVector v : fieldVectors) { + row.add(v.getAccessor().getObject(i)); + } + printRow(sb, row); + } + return sb.toString(); + } +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java index 3375a7d5c31..e2462180ffa 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java @@ -34,11 +34,15 @@ public class VectorUnloader { private final int valueCount; private final List vectors; - public VectorUnloader(FieldVector parent) { + public VectorUnloader(Schema schema, int valueCount, List vectors) { super(); - this.schema = new Schema(parent.getField().getChildren()); - this.valueCount = parent.getAccessor().getValueCount(); - this.vectors = parent.getChildrenFromFields(); + this.schema = schema; + this.valueCount = valueCount; + this.vectors = vectors; + } + + public VectorUnloader(VectorSchemaRoot root) { + this(root.getSchema(), root.getRowCount(), root.getFieldVectors()); } public Schema getSchema() { @@ -77,4 +81,5 @@ private void appendNodes(FieldVector vector, List nodes, List fields = root.getChildrenFromFields(); + return new VectorUnloader(schema, valueCount, fields); + } + @AfterClass public static void afterClass() { allocator.close(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index 0f28d53295c..e97bc14d169 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -17,6 +17,8 @@ */ package org.apache.arrow.vector.file; +import static org.apache.arrow.vector.TestVectorUnloadLoad.newVectorUnloader; + import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; @@ -29,12 +31,12 @@ import org.apache.arrow.vector.FieldVector; import org.apache.arrow.vector.ValueVector.Accessor; import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; import 
org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; -import org.apache.arrow.vector.complex.impl.SingleMapReaderImpl; -import org.apache.arrow.vector.complex.reader.BaseReader.MapReader; +import org.apache.arrow.vector.complex.reader.FieldReader; import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; @@ -43,7 +45,6 @@ import org.apache.arrow.vector.holders.NullableTimeStampHolder; import org.apache.arrow.vector.schema.ArrowBuffer; import org.apache.arrow.vector.schema.ArrowRecordBatch; -import org.apache.arrow.vector.types.Types.MinorType; import org.apache.arrow.vector.types.pojo.Schema; import org.joda.time.DateTimeZone; import org.junit.After; @@ -94,8 +95,9 @@ public void testWriteComplex() throws IOException { BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null)) { writeComplexData(count, parent); - validateComplexContent(count, parent); - write(parent.getChild("root"), file); + FieldVector root = parent.getChild("root"); + validateComplexContent(count, new VectorSchemaRoot(root)); + write(root, file); } } @@ -174,33 +176,31 @@ public void testWriteRead() throws IOException { // initialize vectors - NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); - - VectorLoader vectorLoader = new VectorLoader(schema, root); - - List recordBatches = footer.getRecordBatches(); - for (ArrowBlock rbBlock : recordBatches) { - Assert.assertEquals(0, rbBlock.getOffset() % 8); - Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); - try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { - List buffersLayout = recordBatch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, vectorAllocator)) { + VectorLoader vectorLoader = new VectorLoader(root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + Assert.assertEquals(0, rbBlock.getOffset() % 8); + Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + List buffersLayout = recordBatch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + vectorLoader.load(recordBatch); } - vectorLoader.load(recordBatch); - } - validateContent(count, parent); + validateContent(count, root); + } } } } - private void validateContent(int count, MapVector parent) { - MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + private void validateContent(int count, VectorSchemaRoot root) { for (int i = 0; i < count; i++) { - rootReader.setPosition(i); - Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); - Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); + Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); } } @@ -231,15 +231,15 @@ public void testWriteReadComplex() 
throws IOException { // initialize vectors - NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); - VectorLoader vectorLoader = new VectorLoader(schema, root); - - List recordBatches = footer.getRecordBatches(); - for (ArrowBlock rbBlock : recordBatches) { - try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { - vectorLoader.load(recordBatch); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, vectorAllocator)) { + VectorLoader vectorLoader = new VectorLoader(root); + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + validateComplexContent(count, root); } - validateComplexContent(count, parent); } } } @@ -255,23 +255,23 @@ public void printVectors(List vectors) { } } - private void validateComplexContent(int count, NullableMapVector parent) { - printVectors(parent.getChildrenFromFields()); - - MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + private void validateComplexContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + printVectors(root.getFieldVectors()); for (int i = 0; i < count; i++) { - rootReader.setPosition(i); - Assert.assertEquals(i, rootReader.reader("int").readInteger().intValue()); - Assert.assertEquals(i, rootReader.reader("bigInt").readLong().longValue()); - Assert.assertEquals(i % 3, rootReader.reader("list").size()); + Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + Assert.assertEquals(i % 3, ((List)root.getVector("list").getAccessor().getObject(i)).size()); NullableTimeStampHolder h = new NullableTimeStampHolder(); - rootReader.reader("map").reader("timestamp").read(h); + FieldReader mapReader = root.getVector("map").getReader(); + mapReader.setPosition(i); + mapReader.reader("timestamp").read(h); Assert.assertEquals(i, h.value); } } private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { - VectorUnloader vectorUnloader = new VectorUnloader(parent); + VectorUnloader vectorUnloader = newVectorUnloader(parent); Schema schema = vectorUnloader.getSchema(); LOGGER.debug("writing schema: " + schema); try ( @@ -294,7 +294,7 @@ public void testWriteReadMultipleRBs() throws IOException { MapVector parent = new MapVector("parent", originalVectorAllocator, null); FileOutputStream fileOutputStream = new FileOutputStream(file);) { writeData(count, parent); - VectorUnloader vectorUnloader = new VectorUnloader(parent.getChild("root")); + VectorUnloader vectorUnloader = newVectorUnloader(parent.getChild("root")); Schema schema = vectorUnloader.getSchema(); Assert.assertEquals(2, schema.getFields().size()); try (ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema);) { @@ -320,20 +320,21 @@ public void testWriteReadMultipleRBs() throws IOException { ArrowFooter footer = arrowReader.readFooter(); Schema schema = footer.getSchema(); LOGGER.debug("reading schema: " + schema); - NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); - VectorLoader vectorLoader = new VectorLoader(schema, root); - List recordBatches = footer.getRecordBatches(); - Assert.assertEquals(2, recordBatches.size()); - for (ArrowBlock rbBlock : recordBatches) { - Assert.assertEquals(0, rbBlock.getOffset() % 8); 
- Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); - try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { - List buffersLayout = recordBatch.getBuffersLayout(); - for (ArrowBuffer arrowBuffer : buffersLayout) { - Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, vectorAllocator);) { + VectorLoader vectorLoader = new VectorLoader(root); + List recordBatches = footer.getRecordBatches(); + Assert.assertEquals(2, recordBatches.size()); + for (ArrowBlock rbBlock : recordBatches) { + Assert.assertEquals(0, rbBlock.getOffset() % 8); + Assert.assertEquals(0, rbBlock.getMetadataLength() % 8); + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + List buffersLayout = recordBatch.getBuffersLayout(); + for (ArrowBuffer arrowBuffer : buffersLayout) { + Assert.assertEquals(0, arrowBuffer.getOffset() % 8); + } + vectorLoader.load(recordBatch); + validateContent(count, root); } - vectorLoader.load(recordBatch); - validateContent(count, parent); } } } @@ -351,7 +352,7 @@ public void testWriteReadUnion() throws IOException { printVectors(parent.getChildrenFromFields()); - validateUnionData(count, parent); + validateUnionData(count, new VectorSchemaRoot(parent.getChild("root"))); write(parent.getChild("root"), file); } @@ -361,44 +362,42 @@ public void testWriteReadUnion() throws IOException { FileInputStream fileInputStream = new FileInputStream(file); ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null) ) { ArrowFooter footer = arrowReader.readFooter(); Schema schema = footer.getSchema(); LOGGER.debug("reading schema: " + schema); // initialize vectors - - NullableMapVector root = parent.addOrGet("root", MinorType.MAP, NullableMapVector.class); - VectorLoader vectorLoader = new VectorLoader(schema, root); - - List recordBatches = footer.getRecordBatches(); - for (ArrowBlock rbBlock : recordBatches) { - try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { - vectorLoader.load(recordBatch); + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, vectorAllocator);) { + VectorLoader vectorLoader = new VectorLoader(root); + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + validateUnionData(count, root); } - validateUnionData(count, parent); } } } - public void validateUnionData(int count, MapVector parent) { - MapReader rootReader = new SingleMapReaderImpl(parent).reader("root"); + public void validateUnionData(int count, VectorSchemaRoot root) { + FieldReader unionReader = root.getVector("union").getReader(); for (int i = 0; i < count; i++) { - rootReader.setPosition(i); + unionReader.setPosition(i); switch (i % 4) { case 0: - Assert.assertEquals(i, rootReader.reader("union").readInteger().intValue()); + Assert.assertEquals(i, unionReader.readInteger().intValue()); break; case 1: - Assert.assertEquals(i, rootReader.reader("union").readLong().longValue()); + Assert.assertEquals(i, unionReader.readLong().longValue()); break; case 2: - Assert.assertEquals(i % 3, rootReader.reader("union").size()); + Assert.assertEquals(i % 3, unionReader.size()); break; case 3: NullableTimeStampHolder h = new 
NullableTimeStampHolder(); - rootReader.reader("union").reader("timestamp").read(h); + unionReader.reader("timestamp").read(h); Assert.assertEquals(i, h.value); break; } From 4fa7ac4f6ca30c34a73fb84d9d56d54aed96491b Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Wed, 9 Nov 2016 08:55:51 -0800 Subject: [PATCH 196/210] ARROW-372: json vector serialization format This format serializes the vectors in JSON. It is not a generic JSON to arrow converter but rather a human readable version of the vectors to help with tests. Author: Julien Le Dem Closes #201 from julienledem/json_file and squashes the following commits: 2e63bec [Julien Le Dem] add missing license 5588729 [Julien Le Dem] refactor tests, improve format 5ef5356 [Julien Le Dem] improve format to allow empty column name 746430c [Julien Le Dem] ARROW-372: Create JSON arrow file format for integration tests --- .../vector/file/json/JsonFileReader.java | 223 ++++++++++++++++++ .../vector/file/json/JsonFileWriter.java | 167 +++++++++++++ .../arrow/vector/file/BaseFileTest.java | 220 +++++++++++++++++ .../arrow/vector/file/TestArrowFile.java | 200 +--------------- .../arrow/vector/file/json/TestJSONFile.java | 120 ++++++++++ 5 files changed, 741 insertions(+), 189 deletions(-) create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java create mode 100644 java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java create mode 100644 java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java new file mode 100644 index 00000000000..859a3a0e80a --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -0,0 +1,223 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ +package org.apache.arrow.vector.file.json; + +import static com.fasterxml.jackson.core.JsonToken.END_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.END_OBJECT; +import static com.fasterxml.jackson.core.JsonToken.START_ARRAY; +import static com.fasterxml.jackson.core.JsonToken.START_OBJECT; +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.BigIntVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.Float4Vector; +import org.apache.arrow.vector.Float8Vector; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.SmallIntVector; +import org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.TinyIntVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ValueVector.Mutator; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.fasterxml.jackson.core.JsonParseException; +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonToken; +import com.fasterxml.jackson.databind.MappingJsonFactory; +import com.google.common.base.Objects; + +public class JsonFileReader { + private final File inputFile; + private final JsonParser parser; + private final BufferAllocator allocator; + private Schema schema; + + public JsonFileReader(File inputFile, BufferAllocator allocator) throws JsonParseException, IOException { + super(); + this.inputFile = inputFile; + this.allocator = allocator; + MappingJsonFactory jsonFactory = new MappingJsonFactory(); + this.parser = jsonFactory.createParser(inputFile); + } + + public Schema start() throws JsonParseException, IOException { + readToken(START_OBJECT); + { + this.schema = readNextField("schema", Schema.class); + nextFieldIs("batches"); + readToken(START_ARRAY); + return schema; + } + } + + public VectorSchemaRoot read() throws IOException { + VectorSchemaRoot recordBatch = new VectorSchemaRoot(schema, allocator); + readToken(START_OBJECT); + { + int count = readNextField("count", Integer.class); + recordBatch.setRowCount(count); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : schema.getFields()) { + FieldVector vector = recordBatch.getVector(field.getName()); + readVector(field, vector); + } + } + readToken(END_ARRAY); + } + readToken(END_OBJECT); + return recordBatch; + } + + private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException { + List vectorTypes = field.getTypeLayout().getVectorTypes(); + List fieldInnerVectors = vector.getFieldInnerVectors(); + if (vectorTypes.size() != fieldInnerVectors.size()) { + throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size()); + } + readToken(START_OBJECT); + { + String name = readNextField("name", String.class); + 
if (!Objects.equal(field.getName(), name)) { + throw new IllegalArgumentException("Expected field " + field.getName() + " but got " + name); + } + int count = readNextField("count", Integer.class); + for (int v = 0; v < vectorTypes.size(); v++) { + ArrowVectorType vectorType = vectorTypes.get(v); + BufferBacked innerVector = fieldInnerVectors.get(v); + nextFieldIs(vectorType.getName()); + readToken(START_ARRAY); + ValueVector valueVector = (ValueVector)innerVector; + valueVector.allocateNew(); + Mutator mutator = valueVector.getMutator(); + mutator.setValueCount(count); + for (int i = 0; i < count; i++) { + parser.nextToken(); + setValueFromParser(valueVector, i); + } + readToken(END_ARRAY); + } + // if children + List fields = field.getChildren(); + if (!fields.isEmpty()) { + List vectorChildren = vector.getChildrenFromFields(); + if (fields.size() != vectorChildren.size()) { + throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + vectorChildren.size()); + } + nextFieldIs("children"); + readToken(START_ARRAY); + for (int i = 0; i < fields.size(); i++) { + Field childField = fields.get(i); + FieldVector childVector = vectorChildren.get(i); + readVector(childField, childVector); + } + readToken(END_ARRAY); + } + } + readToken(END_OBJECT); + } + + private void setValueFromParser(ValueVector valueVector, int i) throws IOException { + switch (valueVector.getMinorType()) { + case BIT: + ((BitVector)valueVector).getMutator().set(i, parser.readValueAs(Boolean.class) ? 1 : 0); + break; + case TINYINT: + ((TinyIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case SMALLINT: + ((SmallIntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case INT: + ((IntVector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case BIGINT: + ((BigIntVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case UINT1: + ((UInt1Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT2: + ((UInt2Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT4: + ((UInt4Vector)valueVector).getMutator().set(i, parser.readValueAs(Integer.class)); + break; + case UINT8: + ((UInt8Vector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + case FLOAT4: + ((Float4Vector)valueVector).getMutator().set(i, parser.readValueAs(Float.class)); + break; + case FLOAT8: + ((Float8Vector)valueVector).getMutator().set(i, parser.readValueAs(Double.class)); + break; + case VARCHAR: + ((VarCharVector)valueVector).getMutator().setSafe(i, parser.readValueAs(String.class).getBytes(UTF_8)); + break; + case TIMESTAMP: + ((TimeStampVector)valueVector).getMutator().set(i, parser.readValueAs(Long.class)); + break; + default: + throw new UnsupportedOperationException("minor type: " + valueVector.getMinorType()); + } + } + + public void close() throws IOException { + readToken(END_ARRAY); + readToken(END_OBJECT); + parser.close(); + } + + private T readNextField(String expectedFieldName, Class c) throws IOException, JsonParseException { + nextFieldIs(expectedFieldName); + parser.nextToken(); + return parser.readValueAs(c); + } + + private void nextFieldIs(String expectedFieldName) throws IOException, JsonParseException { + String name = parser.nextFieldName(); + if (name == null || !name.equals(expectedFieldName)) { + throw new IllegalStateException("Expected " + 
expectedFieldName + " but got " + name); + } + } + + private void readToken(JsonToken expected) throws JsonParseException, IOException { + JsonToken t = parser.nextToken(); + if (t != expected) { + throw new IllegalStateException("Expected " + expected + " but got " + t); + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java new file mode 100644 index 00000000000..47c1a7dabef --- /dev/null +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -0,0 +1,167 @@ +/******************************************************************************* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ +package org.apache.arrow.vector.file.json; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.BufferBacked; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.TimeStampVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.schema.ArrowVectorType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; + +import com.fasterxml.jackson.core.JsonEncoding; +import com.fasterxml.jackson.core.JsonGenerator; +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; +import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter; +import com.fasterxml.jackson.databind.MappingJsonFactory; + +public class JsonFileWriter { + + public static final class JSONWriteConfig { + private final boolean pretty; + private JSONWriteConfig(boolean pretty) { + this.pretty = pretty; + } + private JSONWriteConfig() { + this.pretty = false; + } + public JSONWriteConfig pretty(boolean pretty) { + return new JSONWriteConfig(pretty); + } + } + + public static JSONWriteConfig config() { + return new JSONWriteConfig(); + } + + private final JsonGenerator generator; + private Schema schema; + + public JsonFileWriter(File outputFile) throws IOException { + this(outputFile, config()); + } + + public JsonFileWriter(File outputFile, JSONWriteConfig config) throws IOException { + MappingJsonFactory jsonFactory = new MappingJsonFactory(); + this.generator = jsonFactory.createGenerator(outputFile, JsonEncoding.UTF8); + if (config.pretty) { + DefaultPrettyPrinter prettyPrinter = new DefaultPrettyPrinter(); + prettyPrinter.indentArraysWith(NopIndenter.instance); + this.generator.setPrettyPrinter(prettyPrinter); + } + } + + public void 
start(Schema schema) throws IOException { + this.schema = schema; + generator.writeStartObject(); + generator.writeObjectField("schema", schema); + generator.writeArrayFieldStart("batches"); + } + + public void write(VectorSchemaRoot recordBatch) throws IOException { + if (!recordBatch.getSchema().equals(schema)) { + throw new IllegalArgumentException("record batches must have the same schema: " + schema); + } + generator.writeStartObject(); + { + generator.writeObjectField("count", recordBatch.getRowCount()); + generator.writeArrayFieldStart("columns"); + for (Field field : schema.getFields()) { + FieldVector vector = recordBatch.getVector(field.getName()); + writeVector(field, vector); + } + generator.writeEndArray(); + } + generator.writeEndObject(); + } + + private void writeVector(Field field, FieldVector vector) throws IOException { + List vectorTypes = field.getTypeLayout().getVectorTypes(); + List fieldInnerVectors = vector.getFieldInnerVectors(); + if (vectorTypes.size() != fieldInnerVectors.size()) { + throw new IllegalArgumentException("vector types and inner vectors are not the same size: " + vectorTypes.size() + " != " + fieldInnerVectors.size()); + } + generator.writeStartObject(); + { + generator.writeObjectField("name", field.getName()); + int valueCount = vector.getAccessor().getValueCount(); + generator.writeObjectField("count", valueCount); + for (int v = 0; v < vectorTypes.size(); v++) { + ArrowVectorType vectorType = vectorTypes.get(v); + BufferBacked innerVector = fieldInnerVectors.get(v); + generator.writeArrayFieldStart(vectorType.getName()); + ValueVector valueVector = (ValueVector)innerVector; + for (int i = 0; i < valueCount; i++) { + writeValueToGenerator(valueVector, i); + } + generator.writeEndArray(); + } + List fields = field.getChildren(); + List children = vector.getChildrenFromFields(); + if (fields.size() != children.size()) { + throw new IllegalArgumentException("fields and children are not the same size: " + fields.size() + " != " + children.size()); + } + if (fields.size() > 0) { + generator.writeArrayFieldStart("children"); + for (int i = 0; i < fields.size(); i++) { + Field childField = fields.get(i); + FieldVector childVector = children.get(i); + writeVector(childField, childVector); + } + generator.writeEndArray(); + } + } + generator.writeEndObject(); + } + + private void writeValueToGenerator(ValueVector valueVector, int i) throws IOException { + switch (valueVector.getMinorType()) { + case TIMESTAMP: + generator.writeNumber(((TimeStampVector)valueVector).getAccessor().get(i)); + break; + case BIT: + generator.writeNumber(((BitVector)valueVector).getAccessor().get(i)); + break; + default: + // TODO: each type + Accessor accessor = valueVector.getAccessor(); + Object value = accessor.getObject(i); + if (value instanceof Number || value instanceof Boolean) { + generator.writeObject(value); + } else { + generator.writeObject(value.toString()); + } + break; + } + } + + public void close() throws IOException { + generator.writeEndArray(); + generator.writeEndObject(); + generator.close(); + } + +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java new file mode 100644 index 00000000000..6e577b500a6 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/BaseFileTest.java @@ -0,0 +1,220 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.arrow.vector.file; + +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.ValueVector.Accessor; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.reader.FieldReader; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.holders.NullableTimeStampHolder; +import org.joda.time.DateTimeZone; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.netty.buffer.ArrowBuf; + +/** + * Helps testing the file formats + */ +public class BaseFileTest { + private static final Logger LOGGER = LoggerFactory.getLogger(BaseFileTest.class); + protected static final int COUNT = 10; + protected BufferAllocator allocator; + + private DateTimeZone defaultTimezone = DateTimeZone.getDefault(); + + @Before + public void init() { + DateTimeZone.setDefault(DateTimeZone.forOffsetHours(2)); + allocator = new RootAllocator(Integer.MAX_VALUE); + } + + @After + public void tearDown() { + allocator.close(); + DateTimeZone.setDefault(defaultTimezone); + } + + protected void validateContent(int count, VectorSchemaRoot root) { + for (int i = 0; i < count; i++) { + Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + } + } + + protected void writeComplexData(int count, MapVector parent) { + ArrowBuf varchar = allocator.buffer(3); + varchar.readerIndex(0); + varchar.setByte(0, 'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + ListWriter listWriter = rootWriter.list("list"); + MapWriter mapWriter = rootWriter.map("map"); + for (int i = 0; i < count; i++) { + if (i % 5 != 3) { + intWriter.setPosition(i); + intWriter.writeInt(i); + } + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + 
listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.timeStamp("timestamp").writeTimeStamp(i); + mapWriter.end(); + } + writer.setValueCount(count); + varchar.release(); + } + + public void printVectors(List vectors) { + for (FieldVector vector : vectors) { + LOGGER.debug(vector.getField().getName()); + Accessor accessor = vector.getAccessor(); + int valueCount = accessor.getValueCount(); + for (int i = 0; i < valueCount; i++) { + LOGGER.debug(String.valueOf(accessor.getObject(i))); + } + } + } + + protected void validateComplexContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + printVectors(root.getFieldVectors()); + for (int i = 0; i < count; i++) { + Object intVal = root.getVector("int").getAccessor().getObject(i); + if (i % 5 != 3) { + Assert.assertEquals(i, intVal); + } else { + Assert.assertNull(intVal); + } + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + Assert.assertEquals(i % 3, ((List)root.getVector("list").getAccessor().getObject(i)).size()); + NullableTimeStampHolder h = new NullableTimeStampHolder(); + FieldReader mapReader = root.getVector("map").getReader(); + mapReader.setPosition(i); + mapReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + } + } + + protected void writeData(int count, MapVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + } + + public void validateUnionData(int count, VectorSchemaRoot root) { + FieldReader unionReader = root.getVector("union").getReader(); + for (int i = 0; i < count; i++) { + unionReader.setPosition(i); + switch (i % 4) { + case 0: + Assert.assertEquals(i, unionReader.readInteger().intValue()); + break; + case 1: + Assert.assertEquals(i, unionReader.readLong().longValue()); + break; + case 2: + Assert.assertEquals(i % 3, unionReader.size()); + break; + case 3: + NullableTimeStampHolder h = new NullableTimeStampHolder(); + unionReader.reader("timestamp").read(h); + Assert.assertEquals(i, h.value); + break; + } + } + } + + public void writeUnionData(int count, NullableMapVector parent) { + ArrowBuf varchar = allocator.buffer(3); + varchar.readerIndex(0); + varchar.setByte(0, 'a'); + varchar.setByte(1, 'b'); + varchar.setByte(2, 'c'); + varchar.writerIndex(3); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("union"); + BigIntWriter bigIntWriter = rootWriter.bigInt("union"); + ListWriter listWriter = rootWriter.list("union"); + MapWriter mapWriter = rootWriter.map("union"); + for (int i = 0; i < count; i++) { + switch (i % 4) { + case 0: + intWriter.setPosition(i); + intWriter.writeInt(i); + break; + case 1: + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + break; + case 2: + listWriter.setPosition(i); + listWriter.startList(); + for (int j = 0; j < i % 3; j++) { + listWriter.varChar().writeVarChar(0, 3, varchar); + } + listWriter.endList(); + break; + case 3: + mapWriter.setPosition(i); + mapWriter.start(); + mapWriter.timeStamp("timestamp").writeTimeStamp(i); + 
mapWriter.end(); + break; + } + } + writer.setValueCount(count); + varchar.release(); + } +} diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java index e97bc14d169..c9e60ee047b 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/TestArrowFile.java @@ -27,53 +27,22 @@ import java.util.List; import org.apache.arrow.memory.BufferAllocator; -import org.apache.arrow.memory.RootAllocator; import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.ValueVector.Accessor; import org.apache.arrow.vector.VectorLoader; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.VectorUnloader; import org.apache.arrow.vector.complex.MapVector; import org.apache.arrow.vector.complex.NullableMapVector; -import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; -import org.apache.arrow.vector.complex.reader.FieldReader; -import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; -import org.apache.arrow.vector.complex.writer.BaseWriter.ListWriter; -import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.complex.writer.BigIntWriter; -import org.apache.arrow.vector.complex.writer.IntWriter; -import org.apache.arrow.vector.holders.NullableTimeStampHolder; import org.apache.arrow.vector.schema.ArrowBuffer; import org.apache.arrow.vector.schema.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; -import org.joda.time.DateTimeZone; -import org.junit.After; import org.junit.Assert; -import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import io.netty.buffer.ArrowBuf; - -public class TestArrowFile { +public class TestArrowFile extends BaseFileTest { private static final Logger LOGGER = LoggerFactory.getLogger(TestArrowFile.class); - private static final int COUNT = 10; - private BufferAllocator allocator; - - private DateTimeZone defaultTimezone = DateTimeZone.getDefault(); - - @Before - public void init() { - DateTimeZone.setDefault(DateTimeZone.forOffsetHours(2)); - allocator = new RootAllocator(Integer.MAX_VALUE); - } - - @After - public void tearDown() { - allocator.close(); - DateTimeZone.setDefault(defaultTimezone); - } @Test public void testWrite() throws IOException { @@ -101,54 +70,6 @@ public void testWriteComplex() throws IOException { } } - private void writeComplexData(int count, MapVector parent) { - ArrowBuf varchar = allocator.buffer(3); - varchar.readerIndex(0); - varchar.setByte(0, 'a'); - varchar.setByte(1, 'b'); - varchar.setByte(2, 'c'); - varchar.writerIndex(3); - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - IntWriter intWriter = rootWriter.integer("int"); - BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); - ListWriter listWriter = rootWriter.list("list"); - MapWriter mapWriter = rootWriter.map("map"); - for (int i = 0; i < count; i++) { - intWriter.setPosition(i); - intWriter.writeInt(i); - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 3; j++) { - listWriter.varChar().writeVarChar(0, 3, varchar); - } - listWriter.endList(); - mapWriter.setPosition(i); - mapWriter.start(); - mapWriter.timeStamp("timestamp").writeTimeStamp(i); - mapWriter.end(); - } - 
writer.setValueCount(count); - varchar.release(); - } - - - private void writeData(int count, MapVector parent) { - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - IntWriter intWriter = rootWriter.integer("int"); - BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); - for (int i = 0; i < count; i++) { - intWriter.setPosition(i); - intWriter.writeInt(i); - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - } - writer.setValueCount(count); - } - @Test public void testWriteRead() throws IOException { File file = new File("target/mytest.arrow"); @@ -197,13 +118,6 @@ public void testWriteRead() throws IOException { } } - private void validateContent(int count, VectorSchemaRoot root) { - for (int i = 0; i < count; i++) { - Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - } - } - @Test public void testWriteReadComplex() throws IOException { File file = new File("target/mytest_complex.arrow"); @@ -244,45 +158,6 @@ public void testWriteReadComplex() throws IOException { } } - public void printVectors(List vectors) { - for (FieldVector vector : vectors) { - LOGGER.debug(vector.getField().getName()); - Accessor accessor = vector.getAccessor(); - int valueCount = accessor.getValueCount(); - for (int i = 0; i < valueCount; i++) { - LOGGER.debug(String.valueOf(accessor.getObject(i))); - } - } - } - - private void validateComplexContent(int count, VectorSchemaRoot root) { - Assert.assertEquals(count, root.getRowCount()); - printVectors(root.getFieldVectors()); - for (int i = 0; i < count; i++) { - Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - Assert.assertEquals(i % 3, ((List)root.getVector("list").getAccessor().getObject(i)).size()); - NullableTimeStampHolder h = new NullableTimeStampHolder(); - FieldReader mapReader = root.getVector("map").getReader(); - mapReader.setPosition(i); - mapReader.reader("timestamp").read(h); - Assert.assertEquals(i, h.value); - } - } - - private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { - VectorUnloader vectorUnloader = newVectorUnloader(parent); - Schema schema = vectorUnloader.getSchema(); - LOGGER.debug("writing schema: " + schema); - try ( - FileOutputStream fileOutputStream = new FileOutputStream(file); - ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); - ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); - ) { - arrowWriter.writeRecordBatch(recordBatch); - } - } - @Test public void testWriteReadMultipleRBs() throws IOException { File file = new File("target/mytest_multiple.arrow"); @@ -381,69 +256,16 @@ public void testWriteReadUnion() throws IOException { } } - public void validateUnionData(int count, VectorSchemaRoot root) { - FieldReader unionReader = root.getVector("union").getReader(); - for (int i = 0; i < count; i++) { - unionReader.setPosition(i); - switch (i % 4) { - case 0: - Assert.assertEquals(i, unionReader.readInteger().intValue()); - break; - case 1: - Assert.assertEquals(i, unionReader.readLong().longValue()); - break; - case 2: - Assert.assertEquals(i % 3, unionReader.size()); - break; - case 3: - NullableTimeStampHolder h = new NullableTimeStampHolder(); - unionReader.reader("timestamp").read(h); - Assert.assertEquals(i, h.value); - break; - } - } 
- } - - public void writeUnionData(int count, NullableMapVector parent) { - ArrowBuf varchar = allocator.buffer(3); - varchar.readerIndex(0); - varchar.setByte(0, 'a'); - varchar.setByte(1, 'b'); - varchar.setByte(2, 'c'); - varchar.writerIndex(3); - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - IntWriter intWriter = rootWriter.integer("union"); - BigIntWriter bigIntWriter = rootWriter.bigInt("union"); - ListWriter listWriter = rootWriter.list("union"); - MapWriter mapWriter = rootWriter.map("union"); - for (int i = 0; i < count; i++) { - switch (i % 4) { - case 0: - intWriter.setPosition(i); - intWriter.writeInt(i); - break; - case 1: - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - break; - case 2: - listWriter.setPosition(i); - listWriter.startList(); - for (int j = 0; j < i % 3; j++) { - listWriter.varChar().writeVarChar(0, 3, varchar); - } - listWriter.endList(); - break; - case 3: - mapWriter.setPosition(i); - mapWriter.start(); - mapWriter.timeStamp("timestamp").writeTimeStamp(i); - mapWriter.end(); - break; - } + private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { + VectorUnloader vectorUnloader = newVectorUnloader(parent); + Schema schema = vectorUnloader.getSchema(); + LOGGER.debug("writing schema: " + schema); + try ( + FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + ) { + arrowWriter.writeRecordBatch(recordBatch); } - writer.setValueCount(count); - varchar.release(); } } diff --git a/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java new file mode 100644 index 00000000000..7d25003f8b3 --- /dev/null +++ b/java/vector/src/test/java/org/apache/arrow/vector/file/json/TestJSONFile.java @@ -0,0 +1,120 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.arrow.vector.file.json; + +import java.io.File; +import java.io.IOException; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.NullableMapVector; +import org.apache.arrow.vector.file.BaseFileTest; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestJSONFile extends BaseFileTest { + private static final Logger LOGGER = LoggerFactory.getLogger(TestJSONFile.class); + + @Test + public void testWriteReadComplexJSON() throws IOException { + File file = new File("target/mytest_complex.json"); + int count = COUNT; + + // write + try ( + BufferAllocator originalVectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", originalVectorAllocator, null)) { + writeComplexData(count, parent); + writeJSON(file, new VectorSchemaRoot(parent.getChild("root"))); + } + + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + ) { + JsonFileReader reader = new JsonFileReader(file, readerAllocator); + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateComplexContent(count, root); + } + reader.close(); + } + } + + @Test + public void testWriteComplexJSON() throws IOException { + File file = new File("target/mytest_write_complex.json"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null)) { + writeComplexData(count, parent); + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateComplexContent(root.getRowCount(), root); + writeJSON(file, root); + } + } + + public void writeJSON(File file, VectorSchemaRoot root) throws IOException { + JsonFileWriter writer = new JsonFileWriter(file, JsonFileWriter.config().pretty(true)); + writer.start(root.getSchema()); + writer.write(root); + writer.close(); + } + + + @Test + public void testWriteReadUnionJSON() throws IOException { + File file = new File("target/mytest_write_union.json"); + int count = COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + NullableMapVector parent = new NullableMapVector("parent", vectorAllocator, null)) { + + writeUnionData(count, parent); + + printVectors(parent.getChildrenFromFields()); + + VectorSchemaRoot root = new VectorSchemaRoot(parent.getChild("root")); + validateUnionData(count, root); + + writeJSON(file, root); + } + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + ) { + JsonFileReader reader = new JsonFileReader(file, readerAllocator); + Schema schema = reader.start(); + LOGGER.debug("reading schema: " + schema); + + // initialize vectors + try (VectorSchemaRoot root = reader.read();) { + validateUnionData(count, root); + } + } + } + +} From 7f048a4b8bdc6a20cd8f6eeca928ecbb6db7dd96 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Fri, 11 Nov 2016 14:18:09 -0500 Subject: [PATCH 197/210] ARROW-356: Add documentation about reading Parquet Assumes #192. Author: Uwe L. Korn Closes #193 from xhochy/ARROW-356 and squashes the following commits: 530484f [Uwe L. Korn] Mention new setup instructions 06b2f9c [Uwe L. Korn] Add tables describing dtype support 0467e0e [Uwe L. Korn] Move installation instructions into Sphinx docs 744202a [Uwe L. Korn] Document Pandas<->Arrow conversion b5b4df5 [Uwe L. Korn] ARROW-356: Add documentation about reading Parquet --- python/doc/INSTALL.md | 101 -------------------------- python/doc/index.rst | 16 +++-- python/doc/install.rst | 151 +++++++++++++++++++++++++++++++++++++++ python/doc/pandas.rst | 114 +++++++++++++++++++++++++++++ python/doc/parquet.rst | 66 +++++++++++++++++ python/pyarrow/table.pyx | 15 ++++ 6 files changed, 355 insertions(+), 108 deletions(-) delete mode 100644 python/doc/INSTALL.md create mode 100644 python/doc/install.rst create mode 100644 python/doc/pandas.rst create mode 100644 python/doc/parquet.rst diff --git a/python/doc/INSTALL.md b/python/doc/INSTALL.md deleted file mode 100644 index 81eed565d91..00000000000 --- a/python/doc/INSTALL.md +++ /dev/null @@ -1,101 +0,0 @@ - - -## Building pyarrow (Apache Arrow Python library) - -First, clone the master git repository: - -```bash -git clone https://github.com/apache/arrow.git arrow -``` - -#### System requirements - -Building pyarrow requires: - -* A C++11 compiler - - * Linux: gcc >= 4.8 or clang >= 3.5 - * OS X: XCode 6.4 or higher preferred - -* [cmake][1] - -#### Python requirements - -You will need Python (CPython) 2.7, 3.4, or 3.5 installed. Earlier releases and -are not being targeted. - -> This library targets CPython only due to an emphasis on interoperability with -> pandas and NumPy, which are only available for CPython. - -The build requires NumPy, Cython, and a few other Python dependencies: - -```bash -pip install cython -cd arrow/python -pip install -r requirements.txt -``` - -#### Installing Arrow C++ library - -First, you should choose an installation location for Arrow C++. In the future -using the default system install location will work, but for now we are being -explicit: - -```bash -export ARROW_HOME=$HOME/local -``` - -Now, we build Arrow: - -```bash -cd arrow/cpp - -mkdir dev-build -cd dev-build - -cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME .. - -make - -# Use sudo here if $ARROW_HOME requires it -make install -``` - -#### Install `pyarrow` - -```bash -cd arrow/python - -python setup.py install -``` - -> On XCode 6 and prior there are some known OS X `@rpath` issues. If you are -> unable to import pyarrow, upgrading XCode may be the solution. - - -```python -In [1]: import pyarrow - -In [2]: pyarrow.from_pylist([1,2,3]) -Out[2]: - -[ - 1, - 2, - 3 -] -``` - -[1]: https://cmake.org/ diff --git a/python/doc/index.rst b/python/doc/index.rst index 88725badc1e..6725ae707d9 100644 --- a/python/doc/index.rst +++ b/python/doc/index.rst @@ -31,14 +31,16 @@ additional functionality such as reading Apache Parquet files into Arrow structures. .. toctree:: - :maxdepth: 4 - :hidden: + :maxdepth: 2 + :caption: Getting Started + Installing pyarrow + Pandas Module Reference -Indices and tables -================== +.. 
toctree::
+   :maxdepth: 2
+   :caption: Additional Features
+
+   Parquet format <parquet>
 
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/python/doc/install.rst b/python/doc/install.rst
new file mode 100644
index 00000000000..1bab0173016
--- /dev/null
+++ b/python/doc/install.rst
@@ -0,0 +1,151 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Install PyArrow
+===============
+
+Conda
+-----
+
+To install the latest version of PyArrow from conda-forge using conda:
+
+.. code-block:: bash
+
+    conda install -c conda-forge pyarrow
+
+Pip
+---
+
+Install the latest version from PyPI:
+
+.. code-block:: bash
+
+    pip install pyarrow
+
+.. note::
+    Currently there are only binary artifacts available for Linux and MacOS.
+    Otherwise this will only pull the Python sources and assume an existing
+    installation of the C++ part of Arrow.
+    To retrieve the binary artifacts, you'll need a recent ``pip`` version that
+    supports features like the ``manylinux1`` tag.
+
+Building from source
+--------------------
+
+First, clone the master git repository:
+
+.. code-block:: bash
+
+    git clone https://github.com/apache/arrow.git arrow
+
+System requirements
+~~~~~~~~~~~~~~~~~~~
+
+Building pyarrow requires:
+
+* A C++11 compiler
+
+  * Linux: gcc >= 4.8 or clang >= 3.5
+  * OS X: XCode 6.4 or higher preferred
+
+* `CMake <https://cmake.org/>`_
+
+Python requirements
+~~~~~~~~~~~~~~~~~~~
+
+You will need Python (CPython) 2.7, 3.4, or 3.5 installed. Earlier releases
+are not being targeted.
+
+.. note::
+    This library targets CPython only due to an emphasis on interoperability
+    with pandas and NumPy, which are only available for CPython.
+
+The build requires NumPy, Cython, and a few other Python dependencies:
+
+.. code-block:: bash
+
+    pip install cython
+    cd arrow/python
+    pip install -r requirements.txt
+
+Installing Arrow C++ library
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+First, you should choose an installation location for Arrow C++. In the future
+using the default system install location will work, but for now we are being
+explicit:
+
+.. code-block:: bash
+
+    export ARROW_HOME=$HOME/local
+
+Now, we build Arrow:
+
+.. code-block:: bash
+
+    cd arrow/cpp
+
+    mkdir dev-build
+    cd dev-build
+
+    cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME ..
+
+    make
+
+    # Use sudo here if $ARROW_HOME requires it
+    make install
+
+To get the optional Parquet support, you should also build and install
+`parquet-cpp <https://github.com/apache/parquet-cpp>`_.
+
+Install `pyarrow`
+~~~~~~~~~~~~~~~~~
+
+
+.. code-block:: bash
+
+    cd arrow/python
+
+    # --with-parquet enables the Apache Parquet support in PyArrow
+    # --build-type=release disables debugging information and turns on
+    # compiler optimizations for native code
+    python setup.py build_ext --with-parquet --build-type=release install
+    python setup.py install
+
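As an aside, a minimal smoke test for the install, assuming the optional ``--with-parquet`` build above succeeded (``pyarrow.parquet`` is the module the Parquet docs below describe):

.. code-block:: python

    import pyarrow           # core build
    import pyarrow.parquet   # assumption: only importable when built with --with-parquet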
+.. warning::
+    On XCode 6 and prior there are some known OS X `@rpath` issues. If you are
+    unable to import pyarrow, upgrading XCode may be the solution.
+
+.. note::
+    In development installations, you will also need to set a correct
+    ``LD_LIBRARY_PATH``. This is most probably done with
+    ``export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH``.
+
+
+.. code-block:: python
+
+    In [1]: import pyarrow
+
+    In [2]: pyarrow.from_pylist([1,2,3])
+    Out[2]:
+
+    [
+      1,
+      2,
+      3
+    ]
+
diff --git a/python/doc/pandas.rst b/python/doc/pandas.rst
new file mode 100644
index 00000000000..7c700748178
--- /dev/null
+++ b/python/doc/pandas.rst
@@ -0,0 +1,114 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Pandas Interface
+================
+
+To interface with Pandas, PyArrow provides various conversion routines to
+consume Pandas structures and convert back to them.
+
+DataFrames
+----------
+
+The equivalent to a Pandas DataFrame in Arrow is a :class:`pyarrow.table.Table`.
+Both consist of a set of named columns of equal length. While Pandas only
+supports flat columns, the Table also provides nested columns, thus it can
+represent more data than a DataFrame, so a full conversion back to Pandas is
+not always possible.
+
+Conversion from a Table to a DataFrame is done by calling
+:meth:`pyarrow.table.Table.to_pandas`. The inverse is achieved by using
+:meth:`pyarrow.from_pandas_dataframe`. This conversion routine provides the
+convenience parameter ``timestamps_to_ms``. Although Arrow supports timestamps
+of different resolutions, Pandas only supports nanosecond timestamps and most
+other systems (e.g. Parquet) only work on millisecond timestamps. This
+parameter lets you perform the conversion to milliseconds already during the
+Pandas-to-Arrow conversion.
+
+.. code-block:: python
+
+    import pyarrow as pa
+    import pandas as pd
+
+    df = pd.DataFrame({"a": [1, 2, 3]})
+    # Convert from Pandas to Arrow
+    table = pa.from_pandas_dataframe(df)
+    # Convert back to Pandas
+    df_new = table.to_pandas()
+
+
+Series
+------
+
+In Arrow, the most similar structure to a Pandas Series is an Array.
+It is a vector containing data of a single type, laid out in linear memory.
+You can convert a Pandas Series to an Arrow Array using
+:meth:`pyarrow.array.from_pandas_series`. As Arrow Arrays are always nullable,
+you can supply an optional ``mask`` parameter to mark all null entries.
+
+Type differences
+----------------
+
+With the current design of Pandas and Arrow, it is not possible to convert all
+column types unmodified. One of the main issues here is that Pandas has no
+support for nullable columns of arbitrary type. Also, ``datetime64`` is
+currently fixed to nanosecond resolution. On the other side, Arrow may still
+be missing support for some types.
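A quick illustration of the nullability gap, using plain Pandas (nothing here is Arrow-specific; it just shows why the conversions below behave as they do):

.. code-block:: python

    import pandas as pd

    # Pandas has no nullable integer dtype: introducing a null silently
    # upcasts the column to float64, which is why Arrow integer columns
    # with nulls convert to float64 in the tables below.
    df = pd.DataFrame({'ints': [1, None, 3]})
    print(df['ints'].dtype)  # float64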
+
+Pandas -> Arrow Conversion
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++------------------------+--------------------------+
+| Source Type (Pandas)   | Destination Type (Arrow) |
++========================+==========================+
+| ``bool``               | ``BOOL``                 |
++------------------------+--------------------------+
+| ``(u)int{8,16,32,64}`` | ``(U)INT{8,16,32,64}``   |
++------------------------+--------------------------+
+| ``float32``            | ``FLOAT``                |
++------------------------+--------------------------+
+| ``float64``            | ``DOUBLE``               |
++------------------------+--------------------------+
+| ``str`` / ``unicode``  | ``STRING``               |
++------------------------+--------------------------+
+| ``pd.Timestamp``       | ``TIMESTAMP(unit=ns)``   |
++------------------------+--------------------------+
+| ``pd.Categorical``     | *not supported*          |
++------------------------+--------------------------+
+
+Arrow -> Pandas Conversion
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
++-------------------------------------+--------------------------------------------------------+
+| Source Type (Arrow)                 | Destination Type (Pandas)                              |
++=====================================+========================================================+
+| ``BOOL``                            | ``bool``                                               |
++-------------------------------------+--------------------------------------------------------+
+| ``BOOL`` *with nulls*               | ``object`` (with values ``True``, ``False``, ``None``) |
++-------------------------------------+--------------------------------------------------------+
+| ``(U)INT{8,16,32,64}``              | ``(u)int{8,16,32,64}``                                 |
++-------------------------------------+--------------------------------------------------------+
+| ``(U)INT{8,16,32,64}`` *with nulls* | ``float64``                                            |
++-------------------------------------+--------------------------------------------------------+
+| ``FLOAT``                           | ``float32``                                            |
++-------------------------------------+--------------------------------------------------------+
+| ``DOUBLE``                          | ``float64``                                            |
++-------------------------------------+--------------------------------------------------------+
+| ``STRING``                          | ``str``                                                |
++-------------------------------------+--------------------------------------------------------+
+| ``TIMESTAMP(unit=*)``               | ``pd.Timestamp`` (``np.datetime64[ns]``)               |
++-------------------------------------+--------------------------------------------------------+
+
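To make the timestamp rows concrete, here is a minimal sketch of the ``timestamps_to_ms`` parameter described in the DataFrames section above; it assumes only the documented ``pyarrow.from_pandas_dataframe`` entry point:

.. code-block:: python

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'ts': np.array(['2007-07-13T01:23:34.123456789'],
                                      dtype='datetime64[ns]')})
    # Truncate to millisecond resolution while converting, e.g. before
    # handing the data to a millisecond-only system such as Parquet.
    table = pa.from_pandas_dataframe(df, timestamps_to_ms=True)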
diff --git a/python/doc/parquet.rst b/python/doc/parquet.rst
new file mode 100644
index 00000000000..674ed80f27c
--- /dev/null
+++ b/python/doc/parquet.rst
@@ -0,0 +1,66 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Reading/Writing Parquet files
+=============================
+
+If you have built ``pyarrow`` with Parquet support, i.e. ``parquet-cpp`` was
+found during the build, you can read files in the Parquet format to/from Arrow
+memory structures. The Parquet support code is located in the
+:mod:`pyarrow.parquet` module and your package needs to be built with the
+``--with-parquet`` flag for ``build_ext``.
+
+Reading Parquet
+---------------
+
+To read a Parquet file into Arrow memory, you can use the following code
+snippet. It will read the whole Parquet file into memory as an
+:class:`pyarrow.table.Table`.
+
+.. code-block:: python
+
+    import pyarrow
+    import pyarrow.parquet
+
+    A = pyarrow
+
+    table = A.parquet.read_table('<filename>')
+
+Writing Parquet
+---------------
+
+Given an instance of :class:`pyarrow.table.Table`, the simplest way to
+persist it to Parquet is by using the :meth:`pyarrow.parquet.write_table`
+method.
+
+.. code-block:: python
+
+    import pyarrow
+    import pyarrow.parquet
+
+    A = pyarrow
+
+    table = A.Table(..)
+    A.parquet.write_table(table, '<filename>')
+
+By default this will write the Table as a single RowGroup using ``DICTIONARY``
+encoding. To increase the potential for parallelism when a query engine
+processes a Parquet file, set ``chunk_size`` to a fraction of the total
+number of rows.
+
+If you also want to compress the columns, you can select a compression
+method using the ``compression`` argument. Typically, ``GZIP`` is the choice
+if you want to minimize size, and ``SNAPPY`` if you want to maximize
+performance.
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
index c71bc712bff..5459f26b80a 100644
--- a/python/pyarrow/table.pyx
+++ b/python/pyarrow/table.pyx
@@ -298,6 +298,8 @@ cdef class RecordBatch:
 
 cdef class Table:
     '''
+    A collection of top-level named, equal-length Arrow arrays.
+
     Do not call this class's constructor directly.
     '''
 
@@ -335,6 +337,19 @@ cdef class Table:
 
     @staticmethod
     def from_arrays(names, arrays, name=None):
+        """
+        Construct a Table from Arrow Arrays
+
+        Parameters
+        ----------
+
+        names: list of str
+            Names for the table columns
+        arrays: list of pyarrow.array.Array
+            Equal-length arrays that should form the table.
+ name: str + (optional) name for the Table + """ cdef: Array arr c_string c_name From 48f9780a8677546cb143a09b25b0b57c1946ba07 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 11 Nov 2016 14:20:36 -0500 Subject: [PATCH 198/210] ARROW-375: Fix unicode Python 3 issue in columns argument of parquet.read_table Author: Wes McKinney Closes #204 from wesm/ARROW-375 and squashes the following commits: 9e6f2a6 [Wes McKinney] BUG: convert unicode to utf8 bytes for column filtering --- python/pyarrow/parquet.pyx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 2152f894741..a6e3ac30684 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -93,15 +93,18 @@ cdef class ParquetReader: Integer index of the position of the column """ cdef: - const FileMetaData* metadata = self.reader.get().parquet_reader().metadata() + const FileMetaData* metadata = (self.reader.get() + .parquet_reader().metadata()) int i = 0 if self.column_idx_map is None: self.column_idx_map = {} for i in range(0, metadata.num_columns()): - self.column_idx_map[str(metadata.schema().Column(i).path().get().ToDotString())] = i + col_bytes = tobytes(metadata.schema().Column(i) + .path().get().ToDotString()) + self.column_idx_map[col_bytes] = i - return self.column_idx_map[column_name] + return self.column_idx_map[tobytes(column_name)] def read_column(self, int column_index): cdef: @@ -109,7 +112,8 @@ cdef class ParquetReader: shared_ptr[CArray] carray with nogil: - check_status(self.reader.get().ReadFlatColumn(column_index, &carray)) + check_status(self.reader.get() + .ReadFlatColumn(column_index, &carray)) array.init(carray) return array From 78288b5fca8ff527257e487d45c7e68f7dbd8cd2 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Wed, 16 Nov 2016 16:18:50 -0500 Subject: [PATCH 199/210] ARROW-371: Handle pandas-nullable types correctly Author: Uwe L. Korn Closes #205 from xhochy/ARROW-371 and squashes the following commits: 1f73e8b [Uwe L. 
Korn] ARROW-371: Handle pandas-nullable types correctly --- python/pyarrow/tests/test_convert_pandas.py | 22 +++++++++- python/src/pyarrow/adapters/pandas.cc | 46 ++++++++++----------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 55302996f45..b527ca7e808 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -165,7 +165,7 @@ def test_strings(self): expected = pd.DataFrame({'strings': values * repeats}) self._check_pandas_roundtrip(df, expected) - def test_timestamps_notimezone(self): + def test_timestamps_notimezone_no_nulls(self): df = pd.DataFrame({ 'datetime64': np.array([ '2007-07-13T01:23:34.123', @@ -184,6 +184,26 @@ def test_timestamps_notimezone(self): }) self._check_pandas_roundtrip(df, timestamps_to_ms=False) + def test_timestamps_notimezone_nulls(self): + df = pd.DataFrame({ + 'datetime64': np.array([ + '2007-07-13T01:23:34.123', + None, + '2010-08-13T05:46:57.437'], + dtype='datetime64[ms]') + }) + df.info() + self._check_pandas_roundtrip(df, timestamps_to_ms=True) + + df = pd.DataFrame({ + 'datetime64': np.array([ + '2007-07-13T01:23:34.123456789', + None, + '2010-08-13T05:46:57.437699912'], + dtype='datetime64[ns]') + }) + self._check_pandas_roundtrip(df, timestamps_to_ms=False) + # def test_category(self): # repeats = 1000 # values = [b'foo', None, u'bar', 'qux', np.nan] diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 6a3966b7488..1f5b7009e6a 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -489,20 +489,20 @@ struct arrow_traits { static constexpr int npy_type = NPY_BOOL; static constexpr bool supports_nulls = false; static constexpr bool is_boolean = true; - static constexpr bool is_integer = false; - static constexpr bool is_floating = false; + static constexpr bool is_pandas_numeric_not_nullable = false; + static constexpr bool is_pandas_numeric_nullable = false; }; -#define INT_DECL(TYPE) \ - template <> \ - struct arrow_traits { \ - static constexpr int npy_type = NPY_##TYPE; \ - static constexpr bool supports_nulls = false; \ - static constexpr double na_value = NAN; \ - static constexpr bool is_boolean = false; \ - static constexpr bool is_integer = true; \ - static constexpr bool is_floating = false; \ - typedef typename npy_traits::value_type T; \ +#define INT_DECL(TYPE) \ + template <> \ + struct arrow_traits { \ + static constexpr int npy_type = NPY_##TYPE; \ + static constexpr bool supports_nulls = false; \ + static constexpr double na_value = NAN; \ + static constexpr bool is_boolean = false; \ + static constexpr bool is_pandas_numeric_not_nullable = true; \ + static constexpr bool is_pandas_numeric_nullable = false; \ + typedef typename npy_traits::value_type T; \ }; INT_DECL(INT8); @@ -520,8 +520,8 @@ struct arrow_traits { static constexpr bool supports_nulls = true; static constexpr float na_value = NAN; static constexpr bool is_boolean = false; - static constexpr bool is_integer = false; - static constexpr bool is_floating = true; + static constexpr bool is_pandas_numeric_not_nullable = false; + static constexpr bool is_pandas_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -531,8 +531,8 @@ struct arrow_traits { static constexpr bool supports_nulls = true; static constexpr double na_value = NAN; static constexpr bool is_boolean = false; - static constexpr bool 
is_integer = false; - static constexpr bool is_floating = true; + static constexpr bool is_pandas_numeric_not_nullable = false; + static constexpr bool is_pandas_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -542,8 +542,8 @@ struct arrow_traits { static constexpr bool supports_nulls = true; static constexpr int64_t na_value = std::numeric_limits::min(); static constexpr bool is_boolean = false; - static constexpr bool is_integer = true; - static constexpr bool is_floating = false; + static constexpr bool is_pandas_numeric_not_nullable = false; + static constexpr bool is_pandas_numeric_nullable = true; typedef typename npy_traits::value_type T; }; @@ -552,8 +552,8 @@ struct arrow_traits { static constexpr int npy_type = NPY_OBJECT; static constexpr bool supports_nulls = true; static constexpr bool is_boolean = false; - static constexpr bool is_integer = false; - static constexpr bool is_floating = false; + static constexpr bool is_pandas_numeric_not_nullable = false; + static constexpr bool is_pandas_numeric_nullable = false; }; @@ -655,7 +655,7 @@ class ArrowDeserializer { template inline typename std::enable_if< - arrow_traits::is_floating, Status>::type + arrow_traits::is_pandas_numeric_nullable, Status>::type ConvertValues(const std::shared_ptr& arr) { typedef typename arrow_traits::T T; @@ -668,7 +668,7 @@ class ArrowDeserializer { T* out_values = reinterpret_cast(PyArray_DATA(out_)); for (int64_t i = 0; i < arr->length(); ++i) { - out_values[i] = arr->IsNull(i) ? NAN : in_values[i]; + out_values[i] = arr->IsNull(i) ? arrow_traits::na_value : in_values[i]; } } else { // Zero-Copy. We can pass the data pointer directly to NumPy. @@ -683,7 +683,7 @@ class ArrowDeserializer { // Integer specialization template inline typename std::enable_if< - arrow_traits::is_integer, Status>::type + arrow_traits::is_pandas_numeric_not_nullable, Status>::type ConvertValues(const std::shared_ptr& arr) { typedef typename arrow_traits::T T; From 84170962712b976fd6f68f10ba55e219155a57db Mon Sep 17 00:00:00 2001 From: Julien Le Dem Date: Fri, 18 Nov 2016 11:09:28 -0500 Subject: [PATCH 200/210] ARROW-367: converter json <=> Arrow file format for Integration tests Author: Julien Le Dem Closes #203 from julienledem/integration and squashes the following commits: b3cd326 [Julien Le Dem] add license fdbe03f [Julien Le Dem] ARROW-367: converter json <=> Arrow file format for Integration tests --- .../org/apache/arrow/tools/Integration.java | 262 ++++++++++++++++++ .../arrow/tools/ArrowFileTestFixtures.java | 122 ++++++++ .../apache/arrow/tools/TestFileRoundtrip.java | 101 +------ .../apache/arrow/tools/TestIntegration.java | 143 ++++++++++ .../vector/file/json/JsonFileReader.java | 37 +-- .../vector/file/json/JsonFileWriter.java | 3 +- 6 files changed, 554 insertions(+), 114 deletions(-) create mode 100644 java/tools/src/main/java/org/apache/arrow/tools/Integration.java create mode 100644 java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java create mode 100644 java/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java diff --git a/java/tools/src/main/java/org/apache/arrow/tools/Integration.java b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java new file mode 100644 index 00000000000..29f0ee29e3c --- /dev/null +++ b/java/tools/src/main/java/org/apache/arrow/tools/Integration.java @@ -0,0 +1,262 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.arrow.tools; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.file.ArrowBlock; +import org.apache.arrow.vector.file.ArrowFooter; +import org.apache.arrow.vector.file.ArrowReader; +import org.apache.arrow.vector.file.ArrowWriter; +import org.apache.arrow.vector.file.json.JsonFileReader; +import org.apache.arrow.vector.file.json.JsonFileWriter; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Objects; + +public class Integration { + private static final Logger LOGGER = LoggerFactory.getLogger(Integration.class); + + public static void main(String[] args) { + try { + new Integration().run(args); + } catch (ParseException e) { + fatalError("Invalid parameters", e); + } catch (IOException e) { + fatalError("Error accessing files", e); + } catch (RuntimeException e) { + fatalError("Incompatible files", e); + } + } + + private final Options options; + + enum Command { + ARROW_TO_JSON(true, false) { + @Override + public void execute(File arrowFile, File jsonFile) throws IOException { + try( + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(arrowFile); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), allocator);) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = footer.getSchema(); + LOGGER.debug("Input file size: " + arrowFile.length()); + LOGGER.debug("Found schema: " + schema); + try (JsonFileWriter writer = new JsonFileWriter(jsonFile);) { + writer.start(schema); + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch inRecordBatch = arrowReader.readRecordBatch(rbBlock); + VectorSchemaRoot root = new VectorSchemaRoot(schema, allocator);) { + VectorLoader vectorLoader = new VectorLoader(root); + vectorLoader.load(inRecordBatch); + writer.write(root); + } + } + } + LOGGER.debug("Output file size: " + jsonFile.length()); + } + } 
+ }, + JSON_TO_ARROW(false, true) { + @Override + public void execute(File arrowFile, File jsonFile) throws IOException { + try ( + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + JsonFileReader reader = new JsonFileReader(jsonFile, allocator); + ) { + Schema schema = reader.start(); + LOGGER.debug("Input file size: " + jsonFile.length()); + LOGGER.debug("Found schema: " + schema); + try ( + FileOutputStream fileOutputStream = new FileOutputStream(arrowFile); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ) { + + // initialize vectors + VectorSchemaRoot root; + while ((root = reader.read()) != null) { + VectorUnloader vectorUnloader = new VectorUnloader(root); + try (ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch();) { + arrowWriter.writeRecordBatch(recordBatch); + } + root.close(); + } + } + LOGGER.debug("Output file size: " + arrowFile.length()); + } + } + }, + VALIDATE(true, true) { + @Override + public void execute(File arrowFile, File jsonFile) throws IOException { + try ( + BufferAllocator allocator = new RootAllocator(Integer.MAX_VALUE); + JsonFileReader jsonReader = new JsonFileReader(jsonFile, allocator); + FileInputStream fileInputStream = new FileInputStream(arrowFile); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), allocator); + ) { + Schema jsonSchema = jsonReader.start(); + ArrowFooter footer = arrowReader.readFooter(); + Schema arrowSchema = footer.getSchema(); + LOGGER.debug("Arrow Input file size: " + arrowFile.length()); + LOGGER.debug("ARROW schema: " + arrowSchema); + LOGGER.debug("JSON Input file size: " + jsonFile.length()); + LOGGER.debug("JSON schema: " + jsonSchema); + compareSchemas(jsonSchema, arrowSchema); + + List recordBatches = footer.getRecordBatches(); + Iterator iterator = recordBatches.iterator(); + VectorSchemaRoot jsonRoot; + while ((jsonRoot = jsonReader.read()) != null && iterator.hasNext()) { + ArrowBlock rbBlock = iterator.next(); + try (ArrowRecordBatch inRecordBatch = arrowReader.readRecordBatch(rbBlock); + VectorSchemaRoot arrowRoot = new VectorSchemaRoot(arrowSchema, allocator);) { + VectorLoader vectorLoader = new VectorLoader(arrowRoot); + vectorLoader.load(inRecordBatch); + // TODO: compare + compare(arrowRoot, jsonRoot); + } + jsonRoot.close(); + } + boolean hasMoreJSON = jsonRoot != null; + boolean hasMoreArrow = iterator.hasNext(); + if (hasMoreJSON || hasMoreArrow) { + throw new IllegalArgumentException("Unexpected RecordBatches. 
J:" + hasMoreJSON + " A:" + hasMoreArrow); + } + } + } + }; + + public final boolean arrowExists; + public final boolean jsonExists; + + Command(boolean arrowExists, boolean jsonExists) { + this.arrowExists = arrowExists; + this.jsonExists = jsonExists; + } + + abstract public void execute(File arrowFile, File jsonFile) throws IOException; + + } + + Integration() { + this.options = new Options(); + this.options.addOption("a", "arrow", true, "arrow file"); + this.options.addOption("j", "json", true, "json file"); + this.options.addOption("c", "command", true, "command to execute: " + Arrays.toString(Command.values())); + } + + private File validateFile(String type, String fileName, boolean shouldExist) { + if (fileName == null) { + throw new IllegalArgumentException("missing " + type + " file parameter"); + } + File f = new File(fileName); + if (shouldExist && (!f.exists() || f.isDirectory())) { + throw new IllegalArgumentException(type + " file not found: " + f.getAbsolutePath()); + } + if (!shouldExist && f.exists()) { + throw new IllegalArgumentException(type + " file already exists: " + f.getAbsolutePath()); + } + return f; + } + + void run(String[] args) throws ParseException, IOException { + CommandLineParser parser = new PosixParser(); + CommandLine cmd = parser.parse(options, args, false); + + + Command command = toCommand(cmd.getOptionValue("command")); + File arrowFile = validateFile("arrow", cmd.getOptionValue("arrow"), command.arrowExists); + File jsonFile = validateFile("json", cmd.getOptionValue("json"), command.jsonExists); + command.execute(arrowFile, jsonFile); + } + + private Command toCommand(String commandName) { + try { + return Command.valueOf(commandName); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("Unknown command: " + commandName + " expected one of " + Arrays.toString(Command.values())); + } + } + + private static void fatalError(String message, Throwable e) { + System.err.println(message); + LOGGER.error(message, e); + System.exit(1); + } + + + private static void compare(VectorSchemaRoot arrowRoot, VectorSchemaRoot jsonRoot) { + compareSchemas(jsonRoot.getSchema(), arrowRoot.getSchema()); + if (arrowRoot.getRowCount() != jsonRoot.getRowCount()) { + throw new IllegalArgumentException("Different row count:\n" + arrowRoot.getRowCount() + "\n" + jsonRoot.getRowCount()); + } + List arrowVectors = arrowRoot.getFieldVectors(); + List jsonVectors = jsonRoot.getFieldVectors(); + if (arrowVectors.size() != jsonVectors.size()) { + throw new IllegalArgumentException("Different column count:\n" + arrowVectors.size() + "\n" + jsonVectors.size()); + } + for (int i = 0; i < arrowVectors.size(); i++) { + Field field = arrowRoot.getSchema().getFields().get(i); + FieldVector arrowVector = arrowVectors.get(i); + FieldVector jsonVector = jsonVectors.get(i); + int valueCount = arrowVector.getAccessor().getValueCount(); + if (valueCount != jsonVector.getAccessor().getValueCount()) { + throw new IllegalArgumentException("Different value count for field " + field + " : " + valueCount + " != " + jsonVector.getAccessor().getValueCount()); + } + for (int j = 0; j < valueCount; j++) { + Object arrow = arrowVector.getAccessor().getObject(j); + Object json = jsonVector.getAccessor().getObject(j); + if (!Objects.equal(arrow, json)) { + throw new IllegalArgumentException( + "Different values in column:\n" + field + " at index " + j + ": " + arrow + " != " + json); + } + } + } + } + + private static void compareSchemas(Schema jsonSchema, Schema arrowSchema) { + 
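// any difference between the JSON and Arrow schemas aborts validation before row data is compared +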
if (!arrowSchema.equals(jsonSchema)) { + throw new IllegalArgumentException("Different schemas:\n" + arrowSchema + "\n" + jsonSchema); + } + } +} diff --git a/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java b/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java new file mode 100644 index 00000000000..4cfc52fe086 --- /dev/null +++ b/java/tools/src/test/java/org/apache/arrow/tools/ArrowFileTestFixtures.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.arrow.tools; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.List; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.VectorLoader; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.VectorUnloader; +import org.apache.arrow.vector.complex.MapVector; +import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; +import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; +import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; +import org.apache.arrow.vector.complex.writer.BigIntWriter; +import org.apache.arrow.vector.complex.writer.IntWriter; +import org.apache.arrow.vector.file.ArrowBlock; +import org.apache.arrow.vector.file.ArrowFooter; +import org.apache.arrow.vector.file.ArrowReader; +import org.apache.arrow.vector.file.ArrowWriter; +import org.apache.arrow.vector.schema.ArrowRecordBatch; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Assert; + +public class ArrowFileTestFixtures { + static final int COUNT = 10; + + static void writeData(int count, MapVector parent) { + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + for (int i = 0; i < count; i++) { + intWriter.setPosition(i); + intWriter.writeInt(i); + bigIntWriter.setPosition(i); + bigIntWriter.writeBigInt(i); + } + writer.setValueCount(count); + } + + static void validateOutput(File testOutFile, BufferAllocator allocator) throws Exception { + // read + try ( + BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); + FileInputStream fileInputStream = new FileInputStream(testOutFile); + ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); + BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); + ) { + ArrowFooter footer = arrowReader.readFooter(); + Schema schema = 
footer.getSchema(); + + // initialize vectors + try (VectorSchemaRoot root = new VectorSchemaRoot(schema, readerAllocator)) { + VectorLoader vectorLoader = new VectorLoader(root); + + List recordBatches = footer.getRecordBatches(); + for (ArrowBlock rbBlock : recordBatches) { + try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { + vectorLoader.load(recordBatch); + } + validateContent(COUNT, root); + } + } + } + } + + static void validateContent(int count, VectorSchemaRoot root) { + Assert.assertEquals(count, root.getRowCount()); + for (int i = 0; i < count; i++) { + Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); + Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); + } + } + + static void write(FieldVector parent, File file) throws FileNotFoundException, IOException { + Schema schema = new Schema(parent.getField().getChildren()); + int valueCount = parent.getAccessor().getValueCount(); + List fields = parent.getChildrenFromFields(); + VectorUnloader vectorUnloader = new VectorUnloader(schema, valueCount, fields); + try ( + FileOutputStream fileOutputStream = new FileOutputStream(file); + ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); + ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); + ) { + arrowWriter.writeRecordBatch(recordBatch); + } + } + + + static void writeInput(File testInFile, BufferAllocator allocator) throws FileNotFoundException, IOException { + int count = ArrowFileTestFixtures.COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null)) { + writeData(count, parent); + write(parent.getChild("root"), testInFile); + } + } +} diff --git a/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java b/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java index 339725e5af1..ee39f5e92c7 100644 --- a/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java +++ b/java/tools/src/test/java/org/apache/arrow/tools/TestFileRoundtrip.java @@ -18,42 +18,21 @@ */ package org.apache.arrow.tools; +import static org.apache.arrow.tools.ArrowFileTestFixtures.validateOutput; +import static org.apache.arrow.tools.ArrowFileTestFixtures.writeInput; import static org.junit.Assert.assertEquals; import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.IOException; -import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; -import org.apache.arrow.vector.FieldVector; -import org.apache.arrow.vector.VectorLoader; -import org.apache.arrow.vector.VectorSchemaRoot; -import org.apache.arrow.vector.VectorUnloader; -import org.apache.arrow.vector.complex.MapVector; -import org.apache.arrow.vector.complex.impl.ComplexWriterImpl; -import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter; -import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter; -import org.apache.arrow.vector.complex.writer.BigIntWriter; -import org.apache.arrow.vector.complex.writer.IntWriter; -import org.apache.arrow.vector.file.ArrowBlock; -import org.apache.arrow.vector.file.ArrowFooter; -import org.apache.arrow.vector.file.ArrowReader; -import org.apache.arrow.vector.file.ArrowWriter; -import org.apache.arrow.vector.schema.ArrowRecordBatch; -import 
org.apache.arrow.vector.types.pojo.Schema; import org.junit.After; -import org.junit.Assert; import org.junit.Before; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; public class TestFileRoundtrip { - private static final int COUNT = 10; @Rule public TemporaryFolder testFolder = new TemporaryFolder(); @@ -70,90 +49,18 @@ public void tearDown() { allocator.close(); } - private void writeData(int count, MapVector parent) { - ComplexWriter writer = new ComplexWriterImpl("root", parent); - MapWriter rootWriter = writer.rootAsMap(); - IntWriter intWriter = rootWriter.integer("int"); - BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); - for (int i = 0; i < count; i++) { - intWriter.setPosition(i); - intWriter.writeInt(i); - bigIntWriter.setPosition(i); - bigIntWriter.writeBigInt(i); - } - writer.setValueCount(count); - } - @Test public void test() throws Exception { File testInFile = testFolder.newFile("testIn.arrow"); File testOutFile = testFolder.newFile("testOut.arrow"); - writeInput(testInFile); + writeInput(testInFile, allocator); String[] args = { "-i", testInFile.getAbsolutePath(), "-o", testOutFile.getAbsolutePath()}; int result = new FileRoundtrip(System.out, System.err).run(args); assertEquals(0, result); - validateOutput(testOutFile); - } - - private void validateOutput(File testOutFile) throws Exception { - // read - try ( - BufferAllocator readerAllocator = allocator.newChildAllocator("reader", 0, Integer.MAX_VALUE); - FileInputStream fileInputStream = new FileInputStream(testOutFile); - ArrowReader arrowReader = new ArrowReader(fileInputStream.getChannel(), readerAllocator); - BufferAllocator vectorAllocator = allocator.newChildAllocator("final vectors", 0, Integer.MAX_VALUE); - ) { - ArrowFooter footer = arrowReader.readFooter(); - Schema schema = footer.getSchema(); - - // initialize vectors - try (VectorSchemaRoot root = new VectorSchemaRoot(schema, readerAllocator)) { - VectorLoader vectorLoader = new VectorLoader(root); - - List recordBatches = footer.getRecordBatches(); - for (ArrowBlock rbBlock : recordBatches) { - try (ArrowRecordBatch recordBatch = arrowReader.readRecordBatch(rbBlock)) { - vectorLoader.load(recordBatch); - } - validateContent(COUNT, root); - } - } - } - } - - private void validateContent(int count, VectorSchemaRoot root) { - Assert.assertEquals(count, root.getRowCount()); - for (int i = 0; i < count; i++) { - Assert.assertEquals(i, root.getVector("int").getAccessor().getObject(i)); - Assert.assertEquals(Long.valueOf(i), root.getVector("bigInt").getAccessor().getObject(i)); - } - } - - public void writeInput(File testInFile) throws FileNotFoundException, IOException { - int count = COUNT; - try ( - BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); - MapVector parent = new MapVector("parent", vectorAllocator, null)) { - writeData(count, parent); - write(parent.getChild("root"), testInFile); - } - } - - private void write(FieldVector parent, File file) throws FileNotFoundException, IOException { - Schema schema = new Schema(parent.getField().getChildren()); - int valueCount = parent.getAccessor().getValueCount(); - List fields = parent.getChildrenFromFields(); - VectorUnloader vectorUnloader = new VectorUnloader(schema, valueCount, fields); - try ( - FileOutputStream fileOutputStream = new FileOutputStream(file); - ArrowWriter arrowWriter = new ArrowWriter(fileOutputStream.getChannel(), schema); - ArrowRecordBatch recordBatch = vectorUnloader.getRecordBatch(); 
- ) {
-      arrowWriter.writeRecordBatch(recordBatch);
-    }
+    validateOutput(testOutFile, allocator);
   }
 }
diff --git a/java/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java b/java/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java
new file mode 100644
index 00000000000..bb69ed1498e
--- /dev/null
+++ b/java/tools/src/test/java/org/apache/arrow/tools/TestIntegration.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.arrow.tools;
+
+import static org.apache.arrow.tools.ArrowFileTestFixtures.validateOutput;
+import static org.apache.arrow.tools.ArrowFileTestFixtures.write;
+import static org.apache.arrow.tools.ArrowFileTestFixtures.writeData;
+import static org.apache.arrow.tools.ArrowFileTestFixtures.writeInput;
+import static org.junit.Assert.fail;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.apache.arrow.tools.Integration.Command;
+import org.apache.arrow.vector.complex.MapVector;
+import org.apache.arrow.vector.complex.impl.ComplexWriterImpl;
+import org.apache.arrow.vector.complex.writer.BaseWriter.ComplexWriter;
+import org.apache.arrow.vector.complex.writer.BaseWriter.MapWriter;
+import org.apache.arrow.vector.complex.writer.BigIntWriter;
+import org.apache.arrow.vector.complex.writer.IntWriter;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class TestIntegration {
+
+  @Rule
+  public TemporaryFolder testFolder = new TemporaryFolder();
+
+  private BufferAllocator allocator;
+
+  @Before
+  public void init() {
+    allocator = new RootAllocator(Integer.MAX_VALUE);
+  }
+
+  @After
+  public void tearDown() {
+    allocator.close();
+  }
+
+  @Test
+  public void testValid() throws Exception {
+    File testInFile = testFolder.newFile("testIn.arrow");
+    File testJSONFile = testFolder.newFile("testOut.json");
+    testJSONFile.delete();
+    File testOutFile = testFolder.newFile("testOut.arrow");
+    testOutFile.delete();
+
+    // generate an arrow file
+    writeInput(testInFile, allocator);
+
+    Integration integration = new Integration();
+
+    // convert it to json
+    String[] args1 = { "-arrow", testInFile.getAbsolutePath(), "-json", testJSONFile.getAbsolutePath(), "-command", Command.ARROW_TO_JSON.name()};
+    integration.run(args1);
+
+    // convert back to arrow
+    String[] args2 = { "-arrow", testOutFile.getAbsolutePath(), "-json", testJSONFile.getAbsolutePath(), "-command", Command.JSON_TO_ARROW.name()};
+    integration.run(args2);
+
+    // check it is the same
+    validateOutput(testOutFile, allocator);
+
+    // validate arrow
against json + String[] args3 = { "-arrow", testInFile.getAbsolutePath(), "-json", testJSONFile.getAbsolutePath(), "-command", Command.VALIDATE.name()}; + integration.run(args3); + } + + @Test + public void testInvalid() throws Exception { + File testValidInFile = testFolder.newFile("testValidIn.arrow"); + File testInvalidInFile = testFolder.newFile("testInvalidIn.arrow"); + File testJSONFile = testFolder.newFile("testInvalidOut.json"); + testJSONFile.delete(); + + // generate an arrow file + writeInput(testValidInFile, allocator); + // generate a different arrow file + writeInput2(testInvalidInFile, allocator); + + Integration integration = new Integration(); + + // convert the "valid" file to json + String[] args1 = { "-arrow", testValidInFile.getAbsolutePath(), "-json", testJSONFile.getAbsolutePath(), "-command", Command.ARROW_TO_JSON.name()}; + integration.run(args1); + + // compare the "invalid" file to the "valid" json + String[] args3 = { "-arrow", testInvalidInFile.getAbsolutePath(), "-json", testJSONFile.getAbsolutePath(), "-command", Command.VALIDATE.name()}; + // this should fail + try { + integration.run(args3); + fail("should have failed"); + } catch (IllegalArgumentException e) { + Assert.assertTrue(e.getMessage(), e.getMessage().contains("Different values in column")); + Assert.assertTrue(e.getMessage(), e.getMessage().contains("999")); + } + + } + + static void writeInput2(File testInFile, BufferAllocator allocator) throws FileNotFoundException, IOException { + int count = ArrowFileTestFixtures.COUNT; + try ( + BufferAllocator vectorAllocator = allocator.newChildAllocator("original vectors", 0, Integer.MAX_VALUE); + MapVector parent = new MapVector("parent", vectorAllocator, null)) { + writeData(count, parent); + ComplexWriter writer = new ComplexWriterImpl("root", parent); + MapWriter rootWriter = writer.rootAsMap(); + IntWriter intWriter = rootWriter.integer("int"); + BigIntWriter bigIntWriter = rootWriter.bigInt("bigInt"); + intWriter.setPosition(5); + intWriter.writeInt(999); + bigIntWriter.setPosition(4); + bigIntWriter.writeBigInt(777L); + writer.setValueCount(count); + write(parent.getChild("root"), testInFile); + } + } + +} diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java index 859a3a0e80a..f07b5172507 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileReader.java @@ -56,7 +56,7 @@ import com.fasterxml.jackson.databind.MappingJsonFactory; import com.google.common.base.Objects; -public class JsonFileReader { +public class JsonFileReader implements AutoCloseable { private final File inputFile; private final JsonParser parser; private final BufferAllocator allocator; @@ -81,23 +81,29 @@ public Schema start() throws JsonParseException, IOException { } public VectorSchemaRoot read() throws IOException { - VectorSchemaRoot recordBatch = new VectorSchemaRoot(schema, allocator); - readToken(START_OBJECT); - { - int count = readNextField("count", Integer.class); - recordBatch.setRowCount(count); - nextFieldIs("columns"); - readToken(START_ARRAY); + JsonToken t = parser.nextToken(); + if (t == START_OBJECT) { + VectorSchemaRoot recordBatch = new VectorSchemaRoot(schema, allocator); { - for (Field field : schema.getFields()) { - FieldVector vector = recordBatch.getVector(field.getName()); - readVector(field, vector); + int count = 
readNextField("count", Integer.class); + recordBatch.setRowCount(count); + nextFieldIs("columns"); + readToken(START_ARRAY); + { + for (Field field : schema.getFields()) { + FieldVector vector = recordBatch.getVector(field.getName()); + readVector(field, vector); + } } + readToken(END_ARRAY); } - readToken(END_ARRAY); + readToken(END_OBJECT); + return recordBatch; + } else if (t == END_ARRAY) { + return null; + } else { + throw new IllegalArgumentException("Invalid token: " + t); } - readToken(END_OBJECT); - return recordBatch; } private void readVector(Field field, FieldVector vector) throws JsonParseException, IOException { @@ -194,9 +200,8 @@ private void setValueFromParser(ValueVector valueVector, int i) throws IOExcepti } } + @Override public void close() throws IOException { - readToken(END_ARRAY); - readToken(END_OBJECT); parser.close(); } diff --git a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java index 47c1a7dabef..812b3da32f8 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/file/json/JsonFileWriter.java @@ -38,7 +38,7 @@ import com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter; import com.fasterxml.jackson.databind.MappingJsonFactory; -public class JsonFileWriter { +public class JsonFileWriter implements AutoCloseable { public static final class JSONWriteConfig { private final boolean pretty; @@ -158,6 +158,7 @@ private void writeValueToGenerator(ValueVector valueVector, int i) throws IOExce } } + @Override public void close() throws IOException { generator.writeEndArray(); generator.writeEndObject(); From ed6ec3b76e1ac27fab85cd4bc74fbd61e8dfb27f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 18 Nov 2016 14:58:46 -0500 Subject: [PATCH 201/210] ARROW-373: [C++] JSON serialization format for testing C++ version of ARROW-372 Author: Wes McKinney Closes #202 from wesm/ARROW-373 and squashes the following commits: d13a05f [Wes McKinney] Compiler warning 72c24fe [Wes McKinney] Add a minimal literal JSON example a2cf47b [Wes McKinney] cpplint 3d9fcc2 [Wes McKinney] Complete round trip json file test with multiple record batches 2753449 [Wes McKinney] Complete draft json roundtrip implementation. tests not complete yet 3d6bbbd [Wes McKinney] Start high level writer scaffold 6bbd669 [Wes McKinney] Tweaks e2e86b5 [Wes McKinney] Test JSON array roundtrip for numeric types, strings, lists, structs 82f108b [Wes McKinney] Refactoring. Array test scaffold 0891378 [Wes McKinney] Declare loop variables 6566343 [Wes McKinney] Recursively construct children for list/struct 35c2f85 [Wes McKinney] Refactoring. Start drafting string/list reader f26402a [Wes McKinney] Install type_traits.h. cpplint 4fc7294 [Wes McKinney] Refactoring, type attribute consistency. Array reader compiles 2c93cce [Wes McKinney] WIP JSON array reader code path 932ba7a [Wes McKinney] Add ArrayVisitor methods, add enough metaprogramming to detect presence of c_type type member 15c1094 [Wes McKinney] Add type traits, refactoring, drafting json array writing. not working yet 209ba48 [Wes McKinney] More types refactoring. 
Strange linker error in pyarrow 379da3c [Wes McKinney] Implement union metadata JSON serialization 5fbea41 [Wes McKinney] Implement some more json types and add convenience factory functions 1c08233 [Wes McKinney] JSON schema roundtrip passing for many types 86c9559 [Wes McKinney] Add convenience factory functions for common types 3b9d14e [Wes McKinney] Add type-specific JSON metadata to schema writer 820b0f2 [Wes McKinney] Drafting JSON schema read/write 68ee7ab [Wes McKinney] Move forward declarations into type_fwd.h 1edf2a9 [Wes McKinney] Prototyping out visitor pattern for json serialization 24c1d5d [Wes McKinney] Some Types refactoring, add TypeVisitor abstract class. Add RapidJSON as external project --- cpp/CMakeLists.txt | 19 + cpp/src/arrow/CMakeLists.txt | 2 + cpp/src/arrow/array.cc | 15 + cpp/src/arrow/array.h | 12 + cpp/src/arrow/column-test.cc | 1 + cpp/src/arrow/io/hdfs.cc | 8 +- cpp/src/arrow/io/libhdfs_shim.cc | 26 +- cpp/src/arrow/ipc/CMakeLists.txt | 7 + cpp/src/arrow/ipc/adapter.cc | 2 +- cpp/src/arrow/ipc/ipc-json-test.cc | 353 ++++++++ cpp/src/arrow/ipc/json-internal.cc | 1113 +++++++++++++++++++++++++ cpp/src/arrow/ipc/json-internal.h | 111 +++ cpp/src/arrow/ipc/json.cc | 219 +++++ cpp/src/arrow/ipc/json.h | 92 ++ cpp/src/arrow/ipc/test-common.h | 14 +- cpp/src/arrow/schema-test.cc | 52 +- cpp/src/arrow/schema.cc | 15 + cpp/src/arrow/schema.h | 12 +- cpp/src/arrow/test-util.h | 51 +- cpp/src/arrow/type.cc | 122 ++- cpp/src/arrow/type.h | 338 ++++++-- cpp/src/arrow/type_fwd.h | 157 ++++ cpp/src/arrow/type_traits.h | 197 +++++ cpp/src/arrow/types/CMakeLists.txt | 1 - cpp/src/arrow/types/collection.h | 41 - cpp/src/arrow/types/datetime.h | 37 +- cpp/src/arrow/types/decimal.h | 14 +- cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/list.cc | 4 + cpp/src/arrow/types/list.h | 8 +- cpp/src/arrow/types/primitive-test.cc | 36 +- cpp/src/arrow/types/primitive.cc | 97 ++- cpp/src/arrow/types/primitive.h | 190 ++--- cpp/src/arrow/types/string-test.cc | 12 +- cpp/src/arrow/types/string.cc | 16 +- cpp/src/arrow/types/string.h | 24 +- cpp/src/arrow/types/struct-test.cc | 2 +- cpp/src/arrow/types/struct.cc | 4 + cpp/src/arrow/types/struct.h | 4 + cpp/src/arrow/types/test-common.h | 16 + cpp/src/arrow/types/union.cc | 23 +- cpp/src/arrow/types/union.h | 21 - cpp/src/arrow/util/logging.h | 4 +- format/Metadata.md | 5 + 44 files changed, 3049 insertions(+), 450 deletions(-) create mode 100644 cpp/src/arrow/ipc/ipc-json-test.cc create mode 100644 cpp/src/arrow/ipc/json-internal.cc create mode 100644 cpp/src/arrow/ipc/json-internal.h create mode 100644 cpp/src/arrow/ipc/json.cc create mode 100644 cpp/src/arrow/ipc/json.h create mode 100644 cpp/src/arrow/type_fwd.h create mode 100644 cpp/src/arrow/type_traits.h delete mode 100644 cpp/src/arrow/types/collection.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6f954830b63..0bff7528578 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -545,6 +545,25 @@ if(ARROW_BUILD_BENCHMARKS) endif() endif() +# RapidJSON, header only dependency +if("$ENV{RAPIDJSON_HOME}" STREQUAL "") + ExternalProject_Add(rapidjson_ep + PREFIX "${CMAKE_BINARY_DIR}" + URL "https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz" + URL_MD5 "badd12c511e081fec6c89c43a7027bce" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "") + + ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR) + set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include") +else() + set(RAPIDJSON_INCLUDE_DIR "$ENV{RAPIDJSON_HOME}/include") 
+endif() +message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}") +include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) + ## Google PerfTools ## ## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a9b2feca28c..81851bc5b3e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -24,6 +24,8 @@ install(FILES schema.h table.h type.h + type_fwd.h + type_traits.h test-util.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index e432a53781f..3262425e99b 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -25,6 +26,16 @@ namespace arrow { +Status GetEmptyBitmap( + MemoryPool* pool, int32_t length, std::shared_ptr* result) { + auto buffer = std::make_shared(pool); + RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length))); + memset(buffer->mutable_data(), 0, buffer->size()); + + *result = buffer; + return Status::OK(); +} + // ---------------------------------------------------------------------- // Base array class @@ -66,4 +77,8 @@ bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st return true; } +Status NullArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index ff37323f605..ff2b70e213b 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -29,6 +29,8 @@ namespace arrow { class Buffer; +class MemoryPool; +class MutableBuffer; class Status; // Immutable data array with some logical type and some length. Any memory is @@ -70,6 +72,8 @@ class ARROW_EXPORT Array { // returning Status::OK. This can be an expensive check. 
virtual Status Validate() const; + virtual Status Accept(ArrayVisitor* visitor) const = 0; + protected: std::shared_ptr type_; int32_t null_count_; @@ -86,6 +90,8 @@ class ARROW_EXPORT Array { // Degenerate null type Array class ARROW_EXPORT NullArray : public Array { public: + using TypeClass = NullType; + NullArray(const std::shared_ptr& type, int32_t length) : Array(type, length, length, nullptr) {} @@ -94,9 +100,15 @@ class ARROW_EXPORT NullArray : public Array { bool Equals(const std::shared_ptr& arr) const override; bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, const std::shared_ptr& arr) const override; + + Status Accept(ArrayVisitor* visitor) const override; }; typedef std::shared_ptr ArrayPtr; + +Status ARROW_EXPORT GetEmptyBitmap( + MemoryPool* pool, int32_t length, std::shared_ptr* result); + } // namespace arrow #endif diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 1edf313d49b..ac3636d1b6d 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/test-util.h" diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6490a7574ee..13491e780e2 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -289,13 +289,9 @@ class HdfsClient::HdfsClientImpl { // connect to HDFS with the builder object hdfsBuilder* builder = hdfsNewBuilder(); - if (!config->host.empty()) { - hdfsBuilderSetNameNode(builder, config->host.c_str()); - } + if (!config->host.empty()) { hdfsBuilderSetNameNode(builder, config->host.c_str()); } hdfsBuilderSetNameNodePort(builder, config->port); - if (!config->user.empty()) { - hdfsBuilderSetUserName(builder, config->user.c_str()); - } + if (!config->user.empty()) { hdfsBuilderSetUserName(builder, config->user.c_str()); } if (!config->kerb_ticket.empty()) { hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); } diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index 1fee595d071..36b8a4ec980 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -74,12 +74,9 @@ static HINSTANCE libjvm_handle = NULL; // NOTE(wesm): cpplint does not like use of short and other imprecise C types static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL; -static void (*ptr_hdfsBuilderSetNameNode)( - hdfsBuilder* bld, const char* nn) = NULL; -static void (*ptr_hdfsBuilderSetNameNodePort)( - hdfsBuilder* bld, tPort port) = NULL; -static void (*ptr_hdfsBuilderSetUserName)( - hdfsBuilder* bld, const char* userName) = NULL; +static void (*ptr_hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn) = NULL; +static void (*ptr_hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port) = NULL; +static void (*ptr_hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName) = NULL; static void (*ptr_hdfsBuilderSetKerbTicketCachePath)( hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL; static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL; @@ -173,9 +170,9 @@ void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) { ptr_hdfsBuilderSetUserName(bld, userName); } -void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld, - const char* kerbTicketCachePath) { - ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath); +void hdfsBuilderSetKerbTicketCachePath( + hdfsBuilder* bld, const char* kerbTicketCachePath) { + 
ptr_hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath); } hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) { @@ -364,7 +361,7 @@ static std::vector get_potential_libhdfs_paths() { std::vector libhdfs_potential_paths; std::string file_name; - // OS-specific file name +// OS-specific file name #ifdef __WIN32 file_name = "hdfs.dll"; #elif __APPLE__ @@ -374,10 +371,7 @@ static std::vector get_potential_libhdfs_paths() { #endif // Common paths - std::vector search_paths = { - fs::path(""), - fs::path(".") - }; + std::vector search_paths = {fs::path(""), fs::path(".")}; // Path from environment variable const char* hadoop_home = std::getenv("HADOOP_HOME"); @@ -387,9 +381,7 @@ static std::vector get_potential_libhdfs_paths() { } const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR"); - if (libhdfs_dir != nullptr) { - search_paths.push_back(fs::path(libhdfs_dir)); - } + if (libhdfs_dir != nullptr) { search_paths.push_back(fs::path(libhdfs_dir)); } // All paths with file name for (auto& path : search_paths) { diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index d2db339de7e..6955bcb6c23 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -34,6 +34,8 @@ set(ARROW_IPC_TEST_LINK_LIBS set(ARROW_IPC_SRCS adapter.cc file.cc + json.cc + json-internal.cc metadata.cc metadata-internal.cc ) @@ -79,6 +81,10 @@ ADD_ARROW_TEST(ipc-metadata-test) ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test ${ARROW_IPC_TEST_LINK_LIBS}) +ADD_ARROW_TEST(ipc-json-test) +ARROW_TEST_LINK_LIBRARIES(ipc-json-test + ${ARROW_IPC_TEST_LINK_LIBS}) + # make clean will delete the generated file set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) @@ -114,6 +120,7 @@ add_dependencies(arrow_objlib metadata_fbs) install(FILES adapter.h file.h + json.h metadata.h DESTINATION include/arrow/ipc) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 74786bf85ff..da718c08d54 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -106,7 +106,7 @@ Status VisitArray(const Array* arr, std::vector* field_nodes buffers->push_back(binary_arr->data()); } else if (arr->type_enum() == Type::LIST) { const auto list_arr = static_cast(arr); - buffers->push_back(list_arr->offset_buffer()); + buffers->push_back(list_arr->offsets()); RETURN_NOT_OK(VisitArray( list_arr->values().get(), field_nodes, buffers, max_recursion_depth - 1)); } else if (arr->type_enum() == Type::STRUCT) { diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc new file mode 100644 index 00000000000..a51371c6200 --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -0,0 +1,353 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
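+
+// Round-trip tests for the JSON integration-test format: schemas and arrays
+// are serialized with the helpers in arrow/ipc/json-internal.h, parsed back
+// with RapidJSON, and compared for equality. JsonWriter/JsonReader from
+// arrow/ipc/json.h cover the whole-file round trip, including a literal
+// minimal-format example at the end of this file.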
+ +#include +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/array.h" +#include "arrow/ipc/json-internal.h" +#include "arrow/ipc/json.h" +#include "arrow/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +void TestSchemaRoundTrip(const Schema& schema) { + rj::StringBuffer sb; + rj::Writer writer(sb); + + ASSERT_OK(WriteJsonSchema(schema, &writer)); + + rj::Document d; + d.Parse(sb.GetString()); + + std::shared_ptr out; + ASSERT_OK(ReadJsonSchema(d, &out)); + + ASSERT_TRUE(schema.Equals(out)); +} + +void TestArrayRoundTrip(const Array& array) { + static std::string name = "dummy"; + + rj::StringBuffer sb; + rj::Writer writer(sb); + + ASSERT_OK(WriteJsonArray(name, array, &writer)); + + std::string array_as_json = sb.GetString(); + + rj::Document d; + d.Parse(array_as_json); + + if (d.HasParseError()) { FAIL() << "JSON parsing failed"; } + + std::shared_ptr out; + ASSERT_OK(ReadJsonArray(default_memory_pool(), d, array.type(), &out)); + + ASSERT_TRUE(array.Equals(out)) << array_as_json; +} + +template +void CheckPrimitive(const std::shared_ptr& type, + const std::vector& is_valid, const std::vector& values) { + MemoryPool* pool = default_memory_pool(); + typename TypeTraits::BuilderType builder(pool, type); + + for (size_t i = 0; i < values.size(); ++i) { + if (is_valid[i]) { + ASSERT_OK(builder.Append(values[i])); + } else { + ASSERT_OK(builder.AppendNull()); + } + } + + std::shared_ptr array; + ASSERT_OK(builder.Finish(&array)); + TestArrayRoundTrip(*array.get()); +} + +template +void MakeArray(const std::shared_ptr& type, const std::vector& is_valid, + const std::vector& values, std::shared_ptr* out) { + std::shared_ptr values_buffer; + std::shared_ptr values_bitmap; + + ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer)); + ASSERT_OK(test::GetBitmapFromBoolVector(is_valid, &values_bitmap)); + + using ArrayType = typename TypeTraits::ArrayType; + + int32_t null_count = 0; + for (bool val : is_valid) { + if (!val) { ++null_count; } + } + + *out = std::make_shared(type, static_cast(values.size()), + values_buffer, null_count, values_bitmap); +} + +TEST(TestJsonSchemaWriter, FlatTypes) { + std::vector> fields = {field("f0", int8()), + field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false), + field("f4", uint8()), field("f5", uint16()), field("f6", uint32()), + field("f7", uint64()), field("f8", float32()), field("f9", float64()), + field("f10", utf8()), field("f11", binary()), field("f12", list(int32())), + field("f13", struct_({field("s1", int32()), field("s2", utf8())})), + field("f14", date()), field("f15", timestamp(TimeUnit::NANO)), + field("f16", time(TimeUnit::MICRO)), + field("f17", union_({field("u1", int8()), field("u2", time(TimeUnit::MILLI))}, + {0, 1}, UnionMode::DENSE))}; + + Schema schema(fields); + TestSchemaRoundTrip(schema); +} + +template +void PrimitiveTypesCheckOne() { + using c_type = typename T::c_type; + + std::vector is_valid = {true, false, true, true, true, false, true, true}; + std::vector values = {0, 1, 2, 3, 4, 5, 6, 7}; + CheckPrimitive(std::make_shared(), is_valid, values); +} + +TEST(TestJsonArrayWriter, PrimitiveTypes) { + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + 
PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + + std::vector is_valid = {true, false, true, true, true, false, true, true}; + std::vector values = {"foo", "bar", "", "baz", "qux", "foo", "a", "1"}; + + CheckPrimitive(utf8(), is_valid, values); + CheckPrimitive(binary(), is_valid, values); +} + +TEST(TestJsonArrayWriter, NestedTypes) { + auto value_type = int32(); + + std::vector values_is_valid = {true, false, true, true, false, true, true}; + std::vector values = {0, 1, 2, 3, 4, 5, 6}; + + std::shared_ptr values_array; + MakeArray(int32(), values_is_valid, values, &values_array); + + // List + std::vector list_is_valid = {true, false, true, true, true}; + std::vector offsets = {0, 0, 0, 1, 4, 7}; + + std::shared_ptr list_bitmap; + ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap)); + std::shared_ptr offsets_buffer = test::GetBufferFromVector(offsets); + + ListArray list_array(list(value_type), 5, offsets_buffer, values_array, 1, list_bitmap); + + TestArrayRoundTrip(list_array); + + // Struct + std::vector struct_is_valid = {true, false, true, true, true, false, true}; + std::shared_ptr struct_bitmap; + ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap)); + + auto struct_type = + struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())}); + + std::vector> fields = {values_array, values_array, values_array}; + StructArray struct_array( + struct_type, static_cast(struct_is_valid.size()), fields, 2, struct_bitmap); + TestArrayRoundTrip(struct_array); +} + +// Data generation for test case below +void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, + std::vector>* arrays) { + std::vector is_valid; + test::random_is_valid(num_rows, 0.25, &is_valid); + + std::vector v1_values; + std::vector v2_values; + + test::randint(num_rows, 0, 100, &v1_values); + test::randint(num_rows, 0, 100, &v2_values); + + std::shared_ptr v1; + MakeArray(schema->field(0)->type, is_valid, v1_values, &v1); + + std::shared_ptr v2; + MakeArray(schema->field(1)->type, is_valid, v2_values, &v2); + + static const int kBufferSize = 10; + static uint8_t buffer[kBufferSize]; + static uint32_t seed = 0; + StringBuilder string_builder(default_memory_pool(), utf8()); + for (int i = 0; i < num_rows; ++i) { + if (!is_valid[i]) { + string_builder.AppendNull(); + } else { + test::random_ascii(kBufferSize, seed++, buffer); + string_builder.Append(buffer, kBufferSize); + } + } + std::shared_ptr v3; + ASSERT_OK(string_builder.Finish(&v3)); + + arrays->emplace_back(v1); + arrays->emplace_back(v2); + arrays->emplace_back(v3); +} + +TEST(TestJsonFileReadWrite, BasicRoundTrip) { + auto v1_type = int8(); + auto v2_type = int32(); + auto v3_type = utf8(); + + std::shared_ptr schema( + new Schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)})); + + std::unique_ptr writer; + ASSERT_OK(JsonWriter::Open(schema, &writer)); + + const int nbatches = 3; + std::vector> batches; + for (int i = 0; i < nbatches; ++i) { + int32_t num_rows = 5 + i * 5; + std::vector> arrays; + + MakeBatchArrays(schema, num_rows, &arrays); + batches.emplace_back(std::make_shared(schema, num_rows, arrays)); + ASSERT_OK(writer->WriteRecordBatch(arrays, num_rows)); + } + + std::string result; + ASSERT_OK(writer->Finish(&result)); + + std::unique_ptr reader; + + auto buffer = std::make_shared( + reinterpret_cast(result.c_str()), 
static_cast(result.size())); + + ASSERT_OK(JsonReader::Open(buffer, &reader)); + ASSERT_TRUE(reader->schema()->Equals(*schema.get())); + + ASSERT_EQ(nbatches, reader->num_record_batches()); + + for (int i = 0; i < nbatches; ++i) { + std::shared_ptr batch; + ASSERT_OK(reader->GetRecordBatch(i, &batch)); + ASSERT_TRUE(batch->Equals(*batches[i].get())); + } +} + +TEST(TestJsonFileReadWrite, MinimalFormatExample) { + static const char* example = R"example( +{ + "schema": { + "fields": [ + { + "name": "foo", + "type": {"name": "int", "isSigned": true, "bitWidth": 32}, + "nullable": true, "children": [], + "typeLayout": [ + {"type": "VALIDITY", "typeBitWidth": 1}, + {"type": "DATA", "typeBitWidth": 32} + ] + }, + { + "name": "bar", + "type": {"name": "floatingpoint", "precision": "DOUBLE"}, + "nullable": true, "children": [], + "typeLayout": [ + {"type": "VALIDITY", "typeBitWidth": 1}, + {"type": "DATA", "typeBitWidth": 64} + ] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "DATA": [1, 2, 3, 4, 5], + "VALIDITY": [1, 0, 1, 1, 1] + }, + { + "name": "bar", + "count": 5, + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0], + "VALIDITY": [1, 0, 0, 1, 1] + } + ] + } + ] +} +)example"; + + auto buffer = std::make_shared( + reinterpret_cast(example), strlen(example)); + + std::unique_ptr reader; + ASSERT_OK(JsonReader::Open(buffer, &reader)); + + Schema ex_schema({field("foo", int32()), field("bar", float64())}); + + ASSERT_TRUE(reader->schema()->Equals(ex_schema)); + ASSERT_EQ(1, reader->num_record_batches()); + + std::shared_ptr batch; + ASSERT_OK(reader->GetRecordBatch(0, &batch)); + + std::vector foo_valid = {true, false, true, true, true}; + std::vector foo_values = {1, 2, 3, 4, 5}; + std::shared_ptr foo; + MakeArray(int32(), foo_valid, foo_values, &foo); + ASSERT_TRUE(batch->column(0)->Equals(foo)); + + std::vector bar_valid = {true, false, false, true, true}; + std::vector bar_values = {1, 2, 3, 4, 5}; + std::shared_ptr bar; + MakeArray(float64(), bar_valid, bar_values, &bar); + ASSERT_TRUE(batch->column(1)->Equals(bar)); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc new file mode 100644 index 00000000000..31fe35b44ce --- /dev/null +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -0,0 +1,1113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/ipc/json-internal.h" + +#include +#include +#include +#include +#include +#include + +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +#include "arrow/array.h" +#include "arrow/schema.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +using RjArray = rj::Value::ConstArray; +using RjObject = rj::Value::ConstObject; + +enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; + +static std::string GetBufferTypeName(BufferType type) { + switch (type) { + case BufferType::DATA: + return "DATA"; + case BufferType::OFFSET: + return "OFFSET"; + case BufferType::TYPE: + return "TYPE"; + case BufferType::VALIDITY: + return "VALIDITY"; + default: + break; + } + return "UNKNOWN"; +} + +static std::string GetFloatingPrecisionName(FloatingPointMeta::Precision precision) { + switch (precision) { + case FloatingPointMeta::HALF: + return "HALF"; + case FloatingPointMeta::SINGLE: + return "SINGLE"; + case FloatingPointMeta::DOUBLE: + return "DOUBLE"; + default: + break; + } + return "UNKNOWN"; +} + +static std::string GetTimeUnitName(TimeUnit unit) { + switch (unit) { + case TimeUnit::SECOND: + return "SECOND"; + case TimeUnit::MILLI: + return "MILLISECOND"; + case TimeUnit::MICRO: + return "MICROSECOND"; + case TimeUnit::NANO: + return "NANOSECOND"; + default: + break; + } + return "UNKNOWN"; +} + +class BufferLayout { + public: + BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} + + BufferType type() const { return type_; } + int bit_width() const { return bit_width_; } + + private: + BufferType type_; + int bit_width_; +}; + +static const BufferLayout kValidityBuffer(BufferType::VALIDITY, 1); +static const BufferLayout kOffsetBuffer(BufferType::OFFSET, 32); +static const BufferLayout kTypeBuffer(BufferType::TYPE, 32); +static const BufferLayout kBooleanBuffer(BufferType::DATA, 1); +static const BufferLayout kValues64(BufferType::DATA, 64); +static const BufferLayout kValues32(BufferType::DATA, 32); +static const BufferLayout kValues16(BufferType::DATA, 16); +static const BufferLayout kValues8(BufferType::DATA, 8); + +class JsonSchemaWriter : public TypeVisitor { + public: + explicit JsonSchemaWriter(const Schema& schema, RjWriter* writer) + : schema_(schema), writer_(writer) {} + + Status Write() { + writer_->StartObject(); + writer_->Key("fields"); + writer_->StartArray(); + for (const std::shared_ptr& field : schema_.fields()) { + RETURN_NOT_OK(VisitField(*field.get())); + } + writer_->EndArray(); + writer_->EndObject(); + return Status::OK(); + } + + Status VisitField(const Field& field) { + writer_->StartObject(); + + writer_->Key("name"); + writer_->String(field.name.c_str()); + + writer_->Key("nullable"); + writer_->Bool(field.nullable); + + // Visit the type + RETURN_NOT_OK(field.type->Accept(this)); + writer_->EndObject(); + + return Status::OK(); + } + + void SetNoChildren() { + writer_->Key("children"); + writer_->StartArray(); + writer_->EndArray(); + } + + template + typename std::enable_if::value || + std::is_base_of::value || + std::is_base_of::value, + void>::type + WriteTypeMetadata(const T& type) {} + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("bitWidth"); + 
writer_->Int(type.bit_width()); + writer_->Key("isSigned"); + writer_->Bool(type.is_signed()); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("precision"); + writer_->String(GetFloatingPrecisionName(type.precision())); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("unit"); + switch (type.unit) { + case IntervalType::Unit::YEAR_MONTH: + writer_->String("YEAR_MONTH"); + break; + case IntervalType::Unit::DAY_TIME: + writer_->String("DAY_TIME"); + break; + } + } + + template + typename std::enable_if::value || + std::is_base_of::value, + void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("unit"); + writer_->String(GetTimeUnitName(type.unit)); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("precision"); + writer_->Int(type.precision); + writer_->Key("scale"); + writer_->Int(type.scale); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("mode"); + switch (type.mode) { + case UnionMode::SPARSE: + writer_->String("SPARSE"); + break; + case UnionMode::DENSE: + writer_->String("DENSE"); + break; + } + + // Write type ids + writer_->Key("typeIds"); + writer_->StartArray(); + for (size_t i = 0; i < type.type_ids.size(); ++i) { + writer_->Uint(type.type_ids[i]); + } + writer_->EndArray(); + } + + // TODO(wesm): Other Type metadata + + template + void WriteName(const std::string& typeclass, const T& type) { + writer_->Key("type"); + writer_->StartObject(); + writer_->Key("name"); + writer_->String(typeclass); + WriteTypeMetadata(type); + writer_->EndObject(); + } + + template + Status WritePrimitive(const std::string& typeclass, const T& type, + const std::vector& buffer_layout) { + WriteName(typeclass, type); + SetNoChildren(); + WriteBufferLayout(buffer_layout); + return Status::OK(); + } + + template + Status WriteVarBytes(const std::string& typeclass, const T& type) { + WriteName(typeclass, type); + SetNoChildren(); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); + return Status::OK(); + } + + void WriteBufferLayout(const std::vector& buffer_layout) { + writer_->Key("typeLayout"); + writer_->StartArray(); + + for (const BufferLayout& buffer : buffer_layout) { + writer_->StartObject(); + writer_->Key("type"); + writer_->String(GetBufferTypeName(buffer.type())); + + writer_->Key("typeBitWidth"); + writer_->Int(buffer.bit_width()); + + writer_->EndObject(); + } + writer_->EndArray(); + } + + Status WriteChildren(const std::vector>& children) { + writer_->Key("children"); + writer_->StartArray(); + for (const std::shared_ptr& field : children) { + RETURN_NOT_OK(VisitField(*field.get())); + } + writer_->EndArray(); + return Status::OK(); + } + + Status Visit(const NullType& type) override { return WritePrimitive("null", type, {}); } + + Status Visit(const BooleanType& type) override { + return WritePrimitive("bool", type, {kValidityBuffer, kBooleanBuffer}); + } + + Status Visit(const Int8Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues8}); + } + + Status Visit(const Int16Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues16}); + } + + Status Visit(const Int32Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues32}); + } + + Status Visit(const Int64Type& type) override { + return WritePrimitive("int", type, 
{kValidityBuffer, kValues64}); + } + + Status Visit(const UInt8Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues8}); + } + + Status Visit(const UInt16Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues16}); + } + + Status Visit(const UInt32Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues32}); + } + + Status Visit(const UInt64Type& type) override { + return WritePrimitive("int", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const HalfFloatType& type) override { + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues16}); + } + + Status Visit(const FloatType& type) override { + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues32}); + } + + Status Visit(const DoubleType& type) override { + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const StringType& type) override { return WriteVarBytes("utf8", type); } + + Status Visit(const BinaryType& type) override { return WriteVarBytes("binary", type); } + + Status Visit(const DateType& type) override { + return WritePrimitive("date", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const TimeType& type) override { + return WritePrimitive("time", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const TimestampType& type) override { + return WritePrimitive("timestamp", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const IntervalType& type) override { + return WritePrimitive("interval", type, {kValidityBuffer, kValues64}); + } + + Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } + + Status Visit(const ListType& type) override { + WriteName("list", type); + RETURN_NOT_OK(WriteChildren(type.children())); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); + return Status::OK(); + } + + Status Visit(const StructType& type) override { + WriteName("struct", type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + return Status::OK(); + } + + Status Visit(const UnionType& type) override { + WriteName("union", type); + WriteChildren(type.children()); + + if (type.mode == UnionMode::SPARSE) { + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + } else { + WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); + } + return Status::OK(); + } + + private: + const Schema& schema_; + RjWriter* writer_; +}; + +class JsonArrayWriter : public ArrayVisitor { + public: + explicit JsonArrayWriter(const std::string& name, const Array& array, RjWriter* writer) + : name_(name), array_(array), writer_(writer) {} + + Status Write() { return VisitArray(name_, array_); } + + Status VisitArray(const std::string& name, const Array& arr) { + writer_->StartObject(); + writer_->Key("name"); + writer_->String(name); + + writer_->Key("count"); + writer_->Int(arr.length()); + + RETURN_NOT_OK(arr.Accept(this)); + + writer_->EndObject(); + return Status::OK(); + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& arr) { + const auto data = arr.raw_data(); + for (int i = 0; i < arr.length(); ++i) { + writer_->Int64(data[i]); + } + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& arr) { + const auto data = arr.raw_data(); + for (int i = 0; i < arr.length(); ++i) { + writer_->Uint64(data[i]); + } + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& 
arr) { + const auto data = arr.raw_data(); + for (int i = 0; i < arr.length(); ++i) { + writer_->Double(data[i]); + } + } + + // String (Utf8), Binary + template + typename std::enable_if::value, void>::type + WriteDataValues(const T& arr) { + for (int i = 0; i < arr.length(); ++i) { + int32_t length; + const char* buf = reinterpret_cast(arr.GetValue(i, &length)); + writer_->String(buf, length); + } + } + + template + typename std::enable_if::value, void>::type + WriteDataValues(const T& arr) { + for (int i = 0; i < arr.length(); ++i) { + writer_->Bool(arr.Value(i)); + } + } + + template + void WriteDataField(const T& arr) { + writer_->Key("DATA"); + writer_->StartArray(); + WriteDataValues(arr); + writer_->EndArray(); + } + + template + void WriteOffsetsField(const T* offsets, int32_t length) { + writer_->Key("OFFSETS"); + writer_->StartArray(); + for (int i = 0; i < length; ++i) { + writer_->Int64(offsets[i]); + } + writer_->EndArray(); + } + + void WriteValidityField(const Array& arr) { + writer_->Key("VALIDITY"); + writer_->StartArray(); + if (arr.null_count() > 0) { + for (int i = 0; i < arr.length(); ++i) { + writer_->Int(arr.IsNull(i) ? 0 : 1); + } + } else { + for (int i = 0; i < arr.length(); ++i) { + writer_->Int(1); + } + } + writer_->EndArray(); + } + + void SetNoChildren() { + writer_->Key("children"); + writer_->StartArray(); + writer_->EndArray(); + } + + template + Status WritePrimitive(const T& array) { + WriteValidityField(array); + WriteDataField(array); + SetNoChildren(); + return Status::OK(); + } + + template + Status WriteVarBytes(const T& array) { + WriteValidityField(array); + WriteOffsetsField(array.raw_offsets(), array.length() + 1); + WriteDataField(array); + SetNoChildren(); + return Status::OK(); + } + + Status WriteChildren(const std::vector>& fields, + const std::vector>& arrays) { + writer_->Key("children"); + writer_->StartArray(); + for (size_t i = 0; i < fields.size(); ++i) { + RETURN_NOT_OK(VisitArray(fields[i]->name, *arrays[i].get())); + } + writer_->EndArray(); + return Status::OK(); + } + + Status Visit(const NullArray& array) override { + SetNoChildren(); + return Status::OK(); + } + + Status Visit(const BooleanArray& array) override { return WritePrimitive(array); } + + Status Visit(const Int8Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int16Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int32Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int64Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt8Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt16Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt32Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt64Array& array) override { return WritePrimitive(array); } + + Status Visit(const HalfFloatArray& array) override { return WritePrimitive(array); } + + Status Visit(const FloatArray& array) override { return WritePrimitive(array); } + + Status Visit(const DoubleArray& array) override { return WritePrimitive(array); } + + Status Visit(const StringArray& array) override { return WriteVarBytes(array); } + + Status Visit(const BinaryArray& array) override { return WriteVarBytes(array); } + + Status Visit(const DateArray& array) override { return Status::NotImplemented("date"); } + + Status Visit(const TimeArray& array) override { return Status::NotImplemented("time"); } + + 
Status Visit(const TimestampArray& array) override { + return Status::NotImplemented("timestamp"); + } + + Status Visit(const IntervalArray& array) override { + return Status::NotImplemented("interval"); + } + + Status Visit(const DecimalArray& array) override { + return Status::NotImplemented("decimal"); + } + + Status Visit(const ListArray& array) override { + WriteValidityField(array); + WriteOffsetsField(array.raw_offsets(), array.length() + 1); + auto type = static_cast(array.type().get()); + return WriteChildren(type->children(), {array.values()}); + } + + Status Visit(const StructArray& array) override { + WriteValidityField(array); + auto type = static_cast(array.type().get()); + return WriteChildren(type->children(), array.fields()); + } + + Status Visit(const UnionArray& array) override { + return Status::NotImplemented("union"); + } + + private: + const std::string& name_; + const Array& array_; + RjWriter* writer_; +}; + +class JsonSchemaReader { + public: + explicit JsonSchemaReader(const rj::Value& json_schema) : json_schema_(json_schema) {} + + Status GetSchema(std::shared_ptr* schema) { + const auto& obj_schema = json_schema_.GetObject(); + + const auto& json_fields = obj_schema.FindMember("fields"); + RETURN_NOT_ARRAY("fields", json_fields, obj_schema); + + std::vector> fields; + RETURN_NOT_OK(GetFieldsFromArray(json_fields->value, &fields)); + + *schema = std::make_shared(fields); + return Status::OK(); + } + + Status GetFieldsFromArray( + const rj::Value& obj, std::vector>* fields) { + const auto& values = obj.GetArray(); + + fields->resize(values.Size()); + for (size_t i = 0; i < fields->size(); ++i) { + RETURN_NOT_OK(GetField(values[i], &(*fields)[i])); + } + return Status::OK(); + } + + Status GetField(const rj::Value& obj, std::shared_ptr* field) { + if (!obj.IsObject()) { return Status::Invalid("Field was not a JSON object"); } + const auto& json_field = obj.GetObject(); + + const auto& json_name = json_field.FindMember("name"); + RETURN_NOT_STRING("name", json_name, json_field); + + const auto& json_nullable = json_field.FindMember("nullable"); + RETURN_NOT_BOOL("nullable", json_nullable, json_field); + + const auto& json_type = json_field.FindMember("type"); + RETURN_NOT_OBJECT("type", json_type, json_field); + + const auto& json_children = json_field.FindMember("children"); + RETURN_NOT_ARRAY("children", json_children, json_field); + + std::vector> children; + RETURN_NOT_OK(GetFieldsFromArray(json_children->value, &children)); + + std::shared_ptr type; + RETURN_NOT_OK(GetType(json_type->value.GetObject(), children, &type)); + + *field = std::make_shared( + json_name->value.GetString(), type, json_nullable->value.GetBool()); + return Status::OK(); + } + + Status GetInteger( + const rj::Value::ConstObject& json_type, std::shared_ptr* type) { + const auto& json_bit_width = json_type.FindMember("bitWidth"); + RETURN_NOT_INT("bitWidth", json_bit_width, json_type); + + const auto& json_is_signed = json_type.FindMember("isSigned"); + RETURN_NOT_BOOL("isSigned", json_is_signed, json_type); + + bool is_signed = json_is_signed->value.GetBool(); + int bit_width = json_bit_width->value.GetInt(); + + switch (bit_width) { + case 8: + *type = is_signed ? int8() : uint8(); + break; + case 16: + *type = is_signed ? int16() : uint16(); + break; + case 32: + *type = is_signed ? int32() : uint32(); + break; + case 64: + *type = is_signed ? 
int64() : uint64(); + break; + default: + std::stringstream ss; + ss << "Invalid bit width: " << bit_width; + return Status::Invalid(ss.str()); + } + return Status::OK(); + } + + Status GetFloatingPoint(const RjObject& json_type, std::shared_ptr* type) { + const auto& json_precision = json_type.FindMember("precision"); + RETURN_NOT_STRING("precision", json_precision, json_type); + + std::string precision = json_precision->value.GetString(); + + if (precision == "DOUBLE") { + *type = float64(); + } else if (precision == "SINGLE") { + *type = float32(); + } else if (precision == "HALF") { + *type = float16(); + } else { + std::stringstream ss; + ss << "Invalid precision: " << precision; + return Status::Invalid(ss.str()); + } + return Status::OK(); + } + + template + Status GetTimeLike(const RjObject& json_type, std::shared_ptr* type) { + const auto& json_unit = json_type.FindMember("unit"); + RETURN_NOT_STRING("unit", json_unit, json_type); + + std::string unit_str = json_unit->value.GetString(); + + TimeUnit unit; + + if (unit_str == "SECOND") { + unit = TimeUnit::SECOND; + } else if (unit_str == "MILLISECOND") { + unit = TimeUnit::MILLI; + } else if (unit_str == "MICROSECOND") { + unit = TimeUnit::MICRO; + } else if (unit_str == "NANOSECOND") { + unit = TimeUnit::NANO; + } else { + std::stringstream ss; + ss << "Invalid time unit: " << unit_str; + return Status::Invalid(ss.str()); + } + + *type = std::make_shared(unit); + + return Status::OK(); + } + + Status GetUnion(const RjObject& json_type, + const std::vector>& children, + std::shared_ptr* type) { + const auto& json_mode = json_type.FindMember("mode"); + RETURN_NOT_STRING("mode", json_mode, json_type); + + std::string mode_str = json_mode->value.GetString(); + UnionMode mode; + + if (mode_str == "SPARSE") { + mode = UnionMode::SPARSE; + } else if (mode_str == "DENSE") { + mode = UnionMode::DENSE; + } else { + std::stringstream ss; + ss << "Invalid union mode: " << mode_str; + return Status::Invalid(ss.str()); + } + + const auto& json_type_ids = json_type.FindMember("typeIds"); + RETURN_NOT_ARRAY("typeIds", json_type_ids, json_type); + + std::vector type_ids; + const auto& id_array = json_type_ids->value.GetArray(); + for (const rj::Value& val : id_array) { + DCHECK(val.IsUint()); + type_ids.push_back(val.GetUint()); + } + + *type = union_(children, type_ids, mode); + + return Status::OK(); + } + + Status GetType(const RjObject& json_type, + const std::vector>& children, + std::shared_ptr* type) { + const auto& json_type_name = json_type.FindMember("name"); + RETURN_NOT_STRING("name", json_type_name, json_type); + + std::string type_name = json_type_name->value.GetString(); + + if (type_name == "int") { + return GetInteger(json_type, type); + } else if (type_name == "floatingpoint") { + return GetFloatingPoint(json_type, type); + } else if (type_name == "bool") { + *type = boolean(); + } else if (type_name == "utf8") { + *type = utf8(); + } else if (type_name == "binary") { + *type = binary(); + } else if (type_name == "null") { + *type = null(); + } else if (type_name == "date") { + *type = date(); + } else if (type_name == "time") { + return GetTimeLike(json_type, type); + } else if (type_name == "timestamp") { + return GetTimeLike(json_type, type); + } else if (type_name == "list") { + *type = list(children[0]); + } else if (type_name == "struct") { + *type = struct_(children); + } else { + return GetUnion(json_type, children, type); + } + return Status::OK(); + } + + private: + const rj::Value& json_schema_; +}; + +class 
JsonArrayReader { + public: + explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {} + + Status GetValidityBuffer(const std::vector& is_valid, int32_t* null_count, + std::shared_ptr* validity_buffer) { + int length = static_cast(is_valid.size()); + + std::shared_ptr out_buffer; + RETURN_NOT_OK(GetEmptyBitmap(pool_, length, &out_buffer)); + uint8_t* bitmap = out_buffer->mutable_data(); + + *null_count = 0; + for (int i = 0; i < length; ++i) { + if (!is_valid[i]) { + ++(*null_count); + continue; + } + BitUtil::SetBit(bitmap, i); + } + + *validity_buffer = out_buffer; + return Status::OK(); + } + + template + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + ReadArray(const RjObject& json_array, int32_t length, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + typename TypeTraits::BuilderType builder(pool_, type); + + const auto& json_data = json_array.FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, json_array); + + const auto& json_data_arr = json_data->value.GetArray(); + + DCHECK_EQ(static_cast(json_data_arr.Size()), length); + for (int i = 0; i < length; ++i) { + if (!is_valid[i]) { + builder.AppendNull(); + continue; + } + + const rj::Value& val = json_data_arr[i]; + if (IsSignedInt::value) { + DCHECK(val.IsInt()); + builder.Append(val.GetInt64()); + } else if (IsUnsignedInt::value) { + DCHECK(val.IsUint()); + builder.Append(val.GetUint64()); + } else if (IsFloatingPoint::value) { + DCHECK(val.IsFloat()); + builder.Append(val.GetFloat()); + } else if (std::is_base_of::value) { + DCHECK(val.IsBool()); + builder.Append(val.GetBool()); + } else { + // We are in the wrong function + return Status::Invalid(type->ToString()); + } + } + + return builder.Finish(array); + } + + template + typename std::enable_if::value, Status>::type ReadArray( + const RjObject& json_array, int32_t length, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + typename TypeTraits::BuilderType builder(pool_, type); + + const auto& json_data = json_array.FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, json_array); + + const auto& json_data_arr = json_data->value.GetArray(); + + DCHECK_EQ(static_cast(json_data_arr.Size()), length); + for (int i = 0; i < length; ++i) { + if (!is_valid[i]) { + builder.AppendNull(); + continue; + } + + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()); + builder.Append(val.GetString()); + } + + return builder.Finish(array); + } + + template + typename std::enable_if::value, Status>::type ReadArray( + const RjObject& json_array, int32_t length, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + const auto& json_offsets = json_array.FindMember("OFFSETS"); + RETURN_NOT_ARRAY("OFFSETS", json_offsets, json_array); + const auto& json_offsets_arr = json_offsets->value.GetArray(); + + int32_t null_count = 0; + std::shared_ptr validity_buffer; + RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer)); + + auto offsets_buffer = std::make_shared(pool_); + RETURN_NOT_OK(offsets_buffer->Resize((length + 1) * sizeof(int32_t))); + int32_t* offsets = reinterpret_cast(offsets_buffer->mutable_data()); + + for (int i = 0; i < length + 1; ++i) { + const rj::Value& val = json_offsets_arr[i]; + DCHECK(val.IsInt()); + offsets[i] = val.GetInt(); + } + + std::vector> children; + RETURN_NOT_OK(GetChildren(json_array, type, &children)); + DCHECK_EQ(children.size(), 1); + + *array = std::make_shared( + type, length, 
offsets_buffer, children[0], null_count, validity_buffer); + + return Status::OK(); + } + + template + typename std::enable_if::value, Status>::type ReadArray( + const RjObject& json_array, int32_t length, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + int32_t null_count = 0; + std::shared_ptr validity_buffer; + RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer)); + + std::vector> fields; + RETURN_NOT_OK(GetChildren(json_array, type, &fields)); + + *array = + std::make_shared(type, length, fields, null_count, validity_buffer); + + return Status::OK(); + } + + template + typename std::enable_if::value, Status>::type ReadArray( + const RjObject& json_array, int32_t length, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + *array = std::make_shared(type, length); + return Status::OK(); + } + + Status GetChildren(const RjObject& json_array, const std::shared_ptr& type, + std::vector>* array) { + const auto& json_children = json_array.FindMember("children"); + RETURN_NOT_ARRAY("children", json_children, json_array); + const auto& json_children_arr = json_children->value.GetArray(); + + if (type->num_children() != static_cast(json_children_arr.Size())) { + std::stringstream ss; + ss << "Expected " << type->num_children() << " children, but got " + << json_children_arr.Size(); + return Status::Invalid(ss.str()); + } + + for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { + const rj::Value& json_child = json_children_arr[i]; + DCHECK(json_child.IsObject()); + + std::shared_ptr child_field = type->child(i); + + auto it = json_child.FindMember("name"); + RETURN_NOT_STRING("name", it, json_child); + + DCHECK_EQ(it->value.GetString(), child_field->name); + std::shared_ptr child; + RETURN_NOT_OK(GetArray(json_children_arr[i], child_field->type, &child)); + array->emplace_back(child); + } + + return Status::OK(); + } + + Status GetArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) { + if (!obj.IsObject()) { + return Status::Invalid("Array element was not a JSON object"); + } + const auto& json_array = obj.GetObject(); + + const auto& json_length = json_array.FindMember("count"); + RETURN_NOT_INT("count", json_length, json_array); + int32_t length = json_length->value.GetInt(); + + const auto& json_valid_iter = json_array.FindMember("VALIDITY"); + RETURN_NOT_ARRAY("VALIDITY", json_valid_iter, json_array); + + const auto& json_validity = json_valid_iter->value.GetArray(); + + DCHECK_EQ(static_cast(json_validity.Size()), length); + + std::vector is_valid; + for (const rj::Value& val : json_validity) { + DCHECK(val.IsInt()); + is_valid.push_back(static_cast(val.GetInt())); + } + +#define TYPE_CASE(TYPE) \ + case TYPE::type_id: \ + return ReadArray(json_array, length, is_valid, type, array); + +#define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ + case Type::TYPE_ENUM: { \ + std::stringstream ss; \ + ss << type->ToString(); \ + return Status::NotImplemented(ss.str()); \ + } + + switch (type->type) { + TYPE_CASE(NullType); + TYPE_CASE(BooleanType); + TYPE_CASE(UInt8Type); + TYPE_CASE(Int8Type); + TYPE_CASE(UInt16Type); + TYPE_CASE(Int16Type); + TYPE_CASE(UInt32Type); + TYPE_CASE(Int32Type); + TYPE_CASE(UInt64Type); + TYPE_CASE(Int64Type); + TYPE_CASE(HalfFloatType); + TYPE_CASE(FloatType); + TYPE_CASE(DoubleType); + TYPE_CASE(StringType); + TYPE_CASE(BinaryType); + NOT_IMPLEMENTED_CASE(DATE); + NOT_IMPLEMENTED_CASE(TIMESTAMP); + NOT_IMPLEMENTED_CASE(TIME); + 
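+      // Temporal and union types can be described by the schema reader above,
+      // but no JSON array reader exists for them yet, so they fall through to
+      // NotImplemented here.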
NOT_IMPLEMENTED_CASE(INTERVAL); + TYPE_CASE(ListType); + TYPE_CASE(StructType); + NOT_IMPLEMENTED_CASE(UNION); + default: + std::stringstream ss; + ss << type->ToString(); + return Status::NotImplemented(ss.str()); + } + +#undef TYPE_CASE +#undef NOT_IMPLEMENTED_CASE + + return Status::OK(); + } + + private: + MemoryPool* pool_; +}; + +Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { + JsonSchemaWriter converter(schema, json_writer); + return converter.Write(); +} + +Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr* schema) { + JsonSchemaReader converter(json_schema); + return converter.GetSchema(schema); +} + +Status WriteJsonArray( + const std::string& name, const Array& array, RjWriter* json_writer) { + JsonArrayWriter converter(name, array, json_writer); + return converter.Write(); +} + +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, + const std::shared_ptr& type, std::shared_ptr* array) { + JsonArrayReader converter(pool); + return converter.GetArray(json_array, type, array); +} + +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, + std::shared_ptr* array) { + if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); } + + const auto& json_obj = json_array.GetObject(); + + const auto& json_name = json_obj.FindMember("name"); + RETURN_NOT_STRING("name", json_name, json_obj); + + std::string name = json_name->value.GetString(); + + std::shared_ptr result = nullptr; + for (const std::shared_ptr& field : schema.fields()) { + if (field->name == name) { + result = field; + break; + } + } + + if (result == nullptr) { + std::stringstream ss; + ss << "Field named " << name << " not found in schema"; + return Status::KeyError(ss.str()); + } + + return ReadJsonArray(pool, json_array, result->type, array); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h new file mode 100644 index 00000000000..0c167a4ec53 --- /dev/null +++ b/cpp/src/arrow/ipc/json-internal.h @@ -0,0 +1,111 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
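+//
+// Usage sketch for the RETURN_NOT_* validation macros defined below
+// (illustrative only; "count" is one of the member names used in
+// json-internal.cc):
+//
+//   const auto& it = json_obj.FindMember("count");
+//   RETURN_NOT_INT("count", it, json_obj);  // early-returns on absent/non-int
+//   int32_t count = it->value.GetInt();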
+ +#ifndef ARROW_IPC_JSON_INTERNAL_H +#define ARROW_IPC_JSON_INTERNAL_H + +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +#include +#include +#include + +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace rj = rapidjson; +using RjWriter = rj::Writer; + +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == PARENT.MemberEnd()) { \ + std::stringstream ss; \ + ss << "field " << TOK << " not found"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an array" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ + } + +namespace arrow { +namespace ipc { + +// TODO(wesm): Only exporting these because arrow_ipc does not have a static +// library at the moment. Better to not export +Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer); +Status ARROW_EXPORT WriteJsonArray( + const std::string& name, const Array& array, RjWriter* json_writer); + +Status ARROW_EXPORT ReadJsonSchema( + const rj::Value& json_obj, std::shared_ptr* schema); +Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, + const std::shared_ptr& type, std::shared_ptr* array); + +Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, + const Schema& schema, std::shared_ptr* array); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_INTERNAL_H diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc new file mode 100644 index 00000000000..2281611f8b8 --- /dev/null +++ b/cpp/src/arrow/ipc/json.cc @@ -0,0 +1,219 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/ipc/json.h" + +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/ipc/json-internal.h" +#include "arrow/schema.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/util/buffer.h" +#include "arrow/util/logging.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +// ---------------------------------------------------------------------- +// Writer implementation + +class JsonWriter::JsonWriterImpl { + public: + explicit JsonWriterImpl(const std::shared_ptr& schema) : schema_(schema) { + writer_.reset(new RjWriter(string_buffer_)); + } + + Status Start() { + writer_->StartObject(); + + writer_->Key("schema"); + RETURN_NOT_OK(WriteJsonSchema(*schema_.get(), writer_.get())); + + // Record batches + writer_->Key("batches"); + writer_->StartArray(); + return Status::OK(); + } + + Status Finish(std::string* result) { + writer_->EndArray(); // Record batches + writer_->EndObject(); + + *result = string_buffer_.GetString(); + return Status::OK(); + } + + Status WriteRecordBatch( + const std::vector>& columns, int32_t num_rows) { + DCHECK_EQ(static_cast(columns.size()), schema_->num_fields()); + + writer_->StartObject(); + writer_->Key("count"); + writer_->Int(num_rows); + + writer_->Key("columns"); + writer_->StartArray(); + + for (int i = 0; i < schema_->num_fields(); ++i) { + const std::shared_ptr& column = columns[i]; + + DCHECK_EQ(num_rows, column->length()) + << "Array length did not match record batch length"; + + RETURN_NOT_OK( + WriteJsonArray(schema_->field(i)->name, *column.get(), writer_.get())); + } + + writer_->EndArray(); + writer_->EndObject(); + return Status::OK(); + } + + private: + std::shared_ptr schema_; + + rj::StringBuffer string_buffer_; + std::unique_ptr writer_; +}; + +JsonWriter::JsonWriter(const std::shared_ptr& schema) { + impl_.reset(new JsonWriterImpl(schema)); +} + +JsonWriter::~JsonWriter() {} + +Status JsonWriter::Open( + const std::shared_ptr& schema, std::unique_ptr* writer) { + *writer = std::unique_ptr(new JsonWriter(schema)); + return (*writer)->impl_->Start(); +} + +Status JsonWriter::Finish(std::string* result) { + return impl_->Finish(result); +} + +Status JsonWriter::WriteRecordBatch( + const std::vector>& columns, int32_t num_rows) { + return impl_->WriteRecordBatch(columns, num_rows); +} + +// ---------------------------------------------------------------------- +// Reader implementation + +class JsonReader::JsonReaderImpl { + public: + JsonReaderImpl(MemoryPool* pool, const std::shared_ptr& data) + : pool_(pool), data_(data) {} + + Status ParseAndReadSchema() { + doc_.Parse(reinterpret_cast(data_->data()), + static_cast(data_->size())); + if (doc_.HasParseError()) { return Status::IOError("JSON parsing failed"); } + + auto it = doc_.FindMember("schema"); + RETURN_NOT_OBJECT("schema", it, doc_); + RETURN_NOT_OK(ReadJsonSchema(it->value, &schema_)); + + it = doc_.FindMember("batches"); + RETURN_NOT_ARRAY("batches", it, doc_); + record_batches_ = &it->value; + + return Status::OK(); + } + + Status 
GetRecordBatch(int i, std::shared_ptr* batch) const { + DCHECK_GE(i, 0) << "i out of bounds"; + DCHECK_LT(i, static_cast(record_batches_->GetArray().Size())) + << "i out of bounds"; + + const auto& batch_val = record_batches_->GetArray()[i]; + DCHECK(batch_val.IsObject()); + + const auto& batch_obj = batch_val.GetObject(); + + auto it = batch_obj.FindMember("count"); + RETURN_NOT_INT("count", it, batch_obj); + int32_t num_rows = static_cast(it->value.GetInt()); + + it = batch_obj.FindMember("columns"); + RETURN_NOT_ARRAY("columns", it, batch_obj); + const auto& json_columns = it->value.GetArray(); + + std::vector> columns(json_columns.Size()); + for (size_t i = 0; i < columns.size(); ++i) { + const std::shared_ptr& type = schema_->field(i)->type; + RETURN_NOT_OK(ReadJsonArray(pool_, json_columns[i], type, &columns[i])); + } + + *batch = std::make_shared(schema_, num_rows, columns); + return Status::OK(); + } + + std::shared_ptr schema() const { return schema_; } + + int num_record_batches() const { + return static_cast(record_batches_->GetArray().Size()); + } + + private: + MemoryPool* pool_; + std::shared_ptr data_; + rj::Document doc_; + + const rj::Value* record_batches_; + + std::shared_ptr schema_; +}; + +JsonReader::JsonReader(MemoryPool* pool, const std::shared_ptr& data) { + impl_.reset(new JsonReaderImpl(pool, data)); +} + +JsonReader::~JsonReader() {} + +Status JsonReader::Open( + const std::shared_ptr& data, std::unique_ptr* reader) { + return Open(default_memory_pool(), data, reader); +} + +Status JsonReader::Open(MemoryPool* pool, const std::shared_ptr& data, + std::unique_ptr* reader) { + *reader = std::unique_ptr(new JsonReader(pool, data)); + return (*reader)->impl_->ParseAndReadSchema(); +} + +std::shared_ptr JsonReader::schema() const { + return impl_->schema(); +} + +int JsonReader::num_record_batches() const { + return impl_->num_record_batches(); +} + +Status JsonReader::GetRecordBatch(int i, std::shared_ptr* batch) const { + return impl_->GetRecordBatch(i, batch); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h new file mode 100644 index 00000000000..7395be43b96 --- /dev/null +++ b/cpp/src/arrow/ipc/json.h @@ -0,0 +1,92 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
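+//
+// Round-trip usage sketch for the classes declared below (illustrative only;
+// assumes a schema, columns, and a Buffer wrapping the produced string were
+// built elsewhere):
+//
+//   std::unique_ptr<JsonWriter> writer;
+//   RETURN_NOT_OK(JsonWriter::Open(schema, &writer));
+//   RETURN_NOT_OK(writer->WriteRecordBatch(columns, num_rows));
+//   std::string json;
+//   RETURN_NOT_OK(writer->Finish(&json));
+//
+//   std::unique_ptr<JsonReader> reader;
+//   RETURN_NOT_OK(JsonReader::Open(json_buffer, &reader));
+//   std::shared_ptr<RecordBatch> batch;
+//   RETURN_NOT_OK(reader->GetRecordBatch(0, &batch));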
+ +// Implement Arrow JSON serialization format + +#ifndef ARROW_IPC_JSON_H +#define ARROW_IPC_JSON_H + +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace io { + +class OutputStream; +class ReadableFileInterface; + +} // namespace io + +namespace ipc { + +class ARROW_EXPORT JsonWriter { + public: + ~JsonWriter(); + + static Status Open( + const std::shared_ptr& schema, std::unique_ptr* out); + + // TODO(wesm): Write dictionaries + + Status WriteRecordBatch( + const std::vector>& columns, int32_t num_rows); + + Status Finish(std::string* result); + + private: + explicit JsonWriter(const std::shared_ptr& schema); + + // Hide RapidJSON details from public API + class JsonWriterImpl; + std::unique_ptr impl_; +}; + +// TODO(wesm): Read from a file stream rather than an in-memory buffer +class ARROW_EXPORT JsonReader { + public: + ~JsonReader(); + + static Status Open(MemoryPool* pool, const std::shared_ptr& data, + std::unique_ptr* reader); + + // Use the default memory pool + static Status Open( + const std::shared_ptr& data, std::unique_ptr* reader); + + std::shared_ptr schema() const; + + int num_record_batches() const; + + // Read a record batch from the file + Status GetRecordBatch(int i, std::shared_ptr* batch) const; + + private: + JsonReader(MemoryPool* pool, const std::shared_ptr& data); + + // Hide RapidJSON details from public API + class JsonReaderImpl; + std::unique_ptr impl_; +}; + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_JSON_H diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 784e238e977..9abc20d876d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -27,6 +27,7 @@ #include "arrow/array.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/type.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/types/string.h" @@ -39,15 +40,14 @@ namespace arrow { namespace ipc { const auto kInt32 = std::make_shared(); -const auto kListInt32 = std::make_shared(kInt32); -const auto kListListInt32 = std::make_shared(kListInt32); +const auto kListInt32 = list(kInt32); +const auto kListListInt32 = list(kListInt32); Status MakeRandomInt32Array( int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { std::shared_ptr data; test::MakeRandomInt32PoolBuffer(length, pool, &data); - const auto kInt32 = std::make_shared(); - Int32Builder builder(pool, kInt32); + Int32Builder builder(pool, int32()); if (include_nulls) { std::shared_ptr valid_bytes; test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); @@ -134,8 +134,8 @@ Status MakeRandomBinaryArray( Status MakeStringTypesRecordBatch(std::shared_ptr* out) { const int32_t length = 500; - auto string_type = std::make_shared(); - auto binary_type = std::make_shared(); + auto string_type = utf8(); + auto binary_type = binary(); auto f0 = std::make_shared("f0", string_type); auto f1 = std::make_shared("f1", binary_type); std::shared_ptr schema(new Schema({f0, f1})); @@ -233,7 +233,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); for (int i = 0; i < 63; ++i) { - type = std::static_pointer_cast(std::make_shared(type)); + type = std::static_pointer_cast(list(type)); RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); } diff --git a/cpp/src/arrow/schema-test.cc 
b/cpp/src/arrow/schema-test.cc index 8cc80be120a..4826199f73d 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -29,23 +29,21 @@ using std::vector; namespace arrow { -const auto INT32 = std::make_shared(); - TEST(TestField, Basics) { - Field f0("f0", INT32); - Field f0_nn("f0", INT32, false); + Field f0("f0", int32()); + Field f0_nn("f0", int32(), false); ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), INT32->ToString()); + ASSERT_EQ(f0.type->ToString(), int32()->ToString()); ASSERT_TRUE(f0.nullable); ASSERT_FALSE(f0_nn.nullable); } TEST(TestField, Equals) { - Field f0("f0", INT32); - Field f0_nn("f0", INT32, false); - Field f0_other("f0", INT32); + Field f0("f0", int32()); + Field f0_nn("f0", int32(), false); + Field f0_other("f0", int32()); ASSERT_EQ(f0, f0_other); ASSERT_NE(f0, f0_nn); @@ -57,11 +55,11 @@ class TestSchema : public ::testing::Test { }; TEST_F(TestSchema, Basics) { - auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(), false); - auto f1_optional = std::make_shared("f1", std::make_shared()); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f1_optional = field("f1", uint8()); - auto f2 = std::make_shared("f2", std::make_shared()); + auto f2 = field("f2", utf8()); vector> fields = {f0, f1, f2}; auto schema = std::make_shared(fields); @@ -83,11 +81,10 @@ TEST_F(TestSchema, Basics) { } TEST_F(TestSchema, ToString) { - auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(), false); - auto f2 = std::make_shared("f2", std::make_shared()); - auto f3 = std::make_shared( - "f3", std::make_shared(std::make_shared())); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", list(int16())); vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); @@ -101,4 +98,25 @@ f3: list)"; ASSERT_EQ(expected, result); } +TEST_F(TestSchema, GetFieldByName) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", list(int16())); + + vector> fields = {f0, f1, f2, f3}; + auto schema = std::make_shared(fields); + + std::shared_ptr result; + + result = schema->GetFieldByName("f1"); + ASSERT_TRUE(f1->Equals(result)); + + result = schema->GetFieldByName("f3"); + ASSERT_TRUE(f3->Equals(result)); + + result = schema->GetFieldByName("not-found"); + ASSERT_TRUE(result == nullptr); +} + } // namespace arrow diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc index ff3ea1990e5..cd8256e658e 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -42,6 +42,21 @@ bool Schema::Equals(const std::shared_ptr& other) const { return Equals(*other.get()); } +std::shared_ptr Schema::GetFieldByName(const std::string& name) { + if (fields_.size() > 0 && name_to_index_.size() == 0) { + for (size_t i = 0; i < fields_.size(); ++i) { + name_to_index_[fields_[i]->name] = i; + } + } + + auto it = name_to_index_.find(name); + if (it == name_to_index_.end()) { + return nullptr; + } else { + return fields_[it->second]; + } +} + std::string Schema::ToString() const { std::stringstream buffer; diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h index 4301968e015..0e1ab5c368e 100644 --- a/cpp/src/arrow/schema.h +++ b/cpp/src/arrow/schema.h @@ -20,14 +20,14 @@ #include #include +#include #include +#include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { -struct 
Field; - class ARROW_EXPORT Schema { public: explicit Schema(const std::vector>& fields); @@ -37,7 +37,12 @@ class ARROW_EXPORT Schema { bool Equals(const std::shared_ptr& other) const; // Return the ith schema element. Does not boundscheck - const std::shared_ptr& field(int i) const { return fields_[i]; } + std::shared_ptr field(int i) const { return fields_[i]; } + + // Returns nullptr if name not found + std::shared_ptr GetFieldByName(const std::string& name); + + const std::vector>& fields() const { return fields_; } // Render a string representation of the schema suitable for debugging std::string ToString() const; @@ -46,6 +51,7 @@ class ARROW_EXPORT Schema { private: std::vector> fields_; + std::unordered_map name_to_index_; }; } // namespace arrow diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index ac56f5ed087..ab4b980b3be 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -27,6 +27,7 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/table.h" @@ -102,20 +103,57 @@ void random_real(int n, uint32_t seed, T min_value, T max_value, std::vector* } template -std::shared_ptr to_buffer(const std::vector& values) { +std::shared_ptr GetBufferFromVector(const std::vector& values) { return std::make_shared( reinterpret_cast(values.data()), values.size() * sizeof(T)); } +template +inline Status CopyBufferFromVector( + const std::vector& values, std::shared_ptr* result) { + int64_t nbytes = static_cast(values.size()) * sizeof(T); + + auto buffer = std::make_shared(default_memory_pool()); + RETURN_NOT_OK(buffer->Resize(nbytes)); + memcpy(buffer->mutable_data(), values.data(), nbytes); + + *result = buffer; + return Status::OK(); +} + +static inline Status GetBitmapFromBoolVector( + const std::vector& is_valid, std::shared_ptr* result) { + int length = static_cast(is_valid.size()); + + std::shared_ptr buffer; + RETURN_NOT_OK(GetEmptyBitmap(default_memory_pool(), length, &buffer)); + + uint8_t* bitmap = buffer->mutable_data(); + for (int i = 0; i < length; ++i) { + if (is_valid[i]) { BitUtil::SetBit(bitmap, i); } + } + + *result = buffer; + return Status::OK(); +} + // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. 
-void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { +static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { null_bytes[i] = rng.NextDoubleFraction() > pct_null; } } +static inline void random_is_valid( + int64_t n, double pct_null, std::vector* is_valid) { + Random rng(random_seed()); + for (int i = 0; i < n; ++i) { + is_valid->push_back(rng.NextDoubleFraction() > pct_null); + } +} + static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { std::mt19937 gen(seed); std::uniform_int_distribution d(0, 255); @@ -125,6 +163,15 @@ static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { } } +static inline void random_ascii(int n, uint32_t seed, uint8_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(65, 122); + + for (int i = 0; i < n; ++i) { + out[i] = d(gen) & 0xFF; + } +} + template void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { DCHECK(out); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4fd50b7c193..589bdadb77c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -20,6 +20,8 @@ #include #include +#include "arrow/util/status.h" + namespace arrow { std::string Field::ToString() const { @@ -44,9 +46,24 @@ bool DataType::Equals(const DataType* other) const { return equals; } +std::string BooleanType::ToString() const { + return name(); +} + +FloatingPointMeta::Precision HalfFloatType::precision() const { + return FloatingPointMeta::HALF; +} + +FloatingPointMeta::Precision FloatType::precision() const { + return FloatingPointMeta::SINGLE; +} + +FloatingPointMeta::Precision DoubleType::precision() const { + return FloatingPointMeta::DOUBLE; +} + std::string StringType::ToString() const { - std::string result(name()); - return result; + return std::string("string"); } std::string ListType::ToString() const { @@ -56,7 +73,7 @@ std::string ListType::ToString() const { } std::string BinaryType::ToString() const { - return std::string(name()); + return std::string("binary"); } std::string StructType::ToString() const { @@ -71,4 +88,103 @@ std::string StructType::ToString() const { return s.str(); } +std::string UnionType::ToString() const { + std::stringstream s; + + if (mode == UnionMode::SPARSE) { + s << "union[sparse]<"; + } else { + s << "union[dense]<"; + } + + for (size_t i = 0; i < children_.size(); ++i) { + if (i) { s << ", "; } + s << children_[i]->ToString(); + } + s << ">"; + return s.str(); +} + +int NullType::bit_width() const { + return 0; +} + +std::string NullType::ToString() const { + return name(); +} + +// Visitors and template instantiation + +#define ACCEPT_VISITOR(TYPE) \ + Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); } + +ACCEPT_VISITOR(NullType); +ACCEPT_VISITOR(BooleanType); +ACCEPT_VISITOR(BinaryType); +ACCEPT_VISITOR(StringType); +ACCEPT_VISITOR(ListType); +ACCEPT_VISITOR(StructType); +ACCEPT_VISITOR(DecimalType); +ACCEPT_VISITOR(UnionType); +ACCEPT_VISITOR(DateType); +ACCEPT_VISITOR(TimeType); +ACCEPT_VISITOR(TimestampType); +ACCEPT_VISITOR(IntervalType); + +#define TYPE_FACTORY(NAME, KLASS) \ + std::shared_ptr NAME() { \ + static std::shared_ptr result = std::make_shared(); \ + return result; \ + } + +TYPE_FACTORY(null, NullType); +TYPE_FACTORY(boolean, BooleanType); +TYPE_FACTORY(int8, Int8Type); +TYPE_FACTORY(uint8, UInt8Type); +TYPE_FACTORY(int16, Int16Type); +TYPE_FACTORY(uint16, UInt16Type); +TYPE_FACTORY(int32, 
Int32Type);
+TYPE_FACTORY(uint32, UInt32Type);
+TYPE_FACTORY(int64, Int64Type);
+TYPE_FACTORY(uint64, UInt64Type);
+TYPE_FACTORY(float16, HalfFloatType);
+TYPE_FACTORY(float32, FloatType);
+TYPE_FACTORY(float64, DoubleType);
+TYPE_FACTORY(utf8, StringType);
+TYPE_FACTORY(binary, BinaryType);
+TYPE_FACTORY(date, DateType);
+
+std::shared_ptr<DataType> timestamp(TimeUnit unit) {
+  return std::make_shared<TimestampType>(unit);
+}
+
+std::shared_ptr<DataType> time(TimeUnit unit) {
+  return std::make_shared<TimeType>(unit);
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) {
+  return std::make_shared<ListType>(value_type);
+}
+
+std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
+  return std::make_shared<ListType>(value_field);
+}
+
+std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fields) {
+  return std::make_shared<StructType>(fields);
+}
+
+std::shared_ptr<DataType> ARROW_EXPORT union_(
+    const std::vector<std::shared_ptr<Field>>& child_fields,
+    const std::vector<uint8_t>& type_ids, UnionMode mode) {
+  return std::make_shared<UnionType>(child_fields, type_ids, mode);
+}
+
+std::shared_ptr<Field> field(
+    const std::string& name, const TypePtr& type, bool nullable, int64_t dictionary) {
+  return std::make_shared<Field>(name, type, nullable, dictionary);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index ea8516fc347..5b4d7bc42bd 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -23,7 +23,9 @@
 #include <string>
 #include <vector>
 
+#include "arrow/type_fwd.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/status.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -50,17 +52,20 @@ struct Type {
     UINT64 = 8,
     INT64 = 9,
 
+    // 2-byte floating point value
+    HALF_FLOAT = 10,
+
     // 4-byte floating point value
-    FLOAT = 10,
+    FLOAT = 11,
 
     // 8-byte floating point value
-    DOUBLE = 11,
+    DOUBLE = 12,
 
     // UTF8 variable-length string as List<Char>
     STRING = 13,
 
     // Variable-length bytes (no guarantee of UTF8-ness)
-    BINARY = 15,
+    BINARY = 14,
 
     // By default, int32 days since the UNIX epoch
     DATE = 16,
 
     // Exact timestamp encoded with int64 since UNIX epoch
     // Default unit millisecond
     TIMESTAMP = 17,
 
-    // Timestamp as double seconds since the UNIX epoch
-    TIMESTAMP_DOUBLE = 18,
-
     // Exact time encoded with int64, default unit millisecond
-    TIME = 19,
+    TIME = 18,
+
+    // YEAR_MONTH or DAY_TIME interval in SQL style
+    INTERVAL = 19,
 
     // Precision- and scale-based decimal type. Storage type depends on the
     // parameters.
DECIMAL = 20, - // Decimal value encoded as a text string - DECIMAL_TEXT = 21, - // A list of some logical data type LIST = 30, @@ -89,19 +91,16 @@ struct Type { STRUCT = 31, // Unions of logical types - DENSE_UNION = 32, - SPARSE_UNION = 33, + UNION = 32, - // Union - JSON_SCALAR = 50, + // Timestamp as double seconds since the UNIX epoch + TIMESTAMP_DOUBLE = 33, - // User-defined type - USER = 60 + // Decimal value encoded as a text string + DECIMAL_TEXT = 34, }; }; -struct Field; - struct ARROW_EXPORT DataType { Type::type type; @@ -123,15 +122,32 @@ struct ARROW_EXPORT DataType { const std::shared_ptr& child(int i) const { return children_[i]; } + const std::vector>& children() const { return children_; } + int num_children() const { return children_.size(); } - virtual int value_size() const { return -1; } + virtual Status Accept(TypeVisitor* visitor) const = 0; virtual std::string ToString() const = 0; }; typedef std::shared_ptr TypePtr; +struct ARROW_EXPORT FixedWidthMeta { + virtual int bit_width() const = 0; +}; + +struct ARROW_EXPORT IntegerMeta { + virtual bool is_signed() const = 0; +}; + +struct ARROW_EXPORT FloatingPointMeta { + enum Precision { HALF, SINGLE, DOUBLE }; + virtual Precision precision() const = 0; +}; + +struct NoExtraMeta {}; + // A field is a piece of metadata that includes (for now) a name and a data // type struct ARROW_EXPORT Field { @@ -139,7 +155,7 @@ struct ARROW_EXPORT Field { std::string name; // The field's data type - TypePtr type; + std::shared_ptr type; // Fields can be nullable bool nullable; @@ -148,8 +164,8 @@ struct ARROW_EXPORT Field { // 0 means it's not dictionary encoded int64_t dictionary; - Field(const std::string& name, const TypePtr& type, bool nullable = true, - int64_t dictionary = 0) + Field(const std::string& name, const std::shared_ptr& type, + bool nullable = true, int64_t dictionary = 0) : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -168,78 +184,112 @@ struct ARROW_EXPORT Field { }; typedef std::shared_ptr FieldPtr; -template -struct ARROW_EXPORT PrimitiveType : public DataType { - PrimitiveType() : DataType(Derived::type_enum) {} +struct PrimitiveCType : public DataType { + using DataType::DataType; +}; + +template +struct ARROW_EXPORT CTypeImpl : public PrimitiveCType, public FixedWidthMeta { + using c_type = C_TYPE; + static constexpr Type::type type_id = TYPE_ID; + + CTypeImpl() : PrimitiveCType(TYPE_ID) {} + int bit_width() const override { return sizeof(C_TYPE) * 8; } + + Status Accept(TypeVisitor* visitor) const override { + return visitor->Visit(*static_cast(this)); + } + + std::string ToString() const override { return std::string(DERIVED::name()); } +}; + +struct ARROW_EXPORT NullType : public DataType, public FixedWidthMeta { + static constexpr Type::type type_id = Type::NA; + + NullType() : DataType(Type::NA) {} + + int bit_width() const override; + Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + + static std::string name() { return "null"; } +}; + +template +struct IntegerTypeImpl : public CTypeImpl, public IntegerMeta { + bool is_signed() const override { return std::is_signed::value; } }; -template -inline std::string PrimitiveType::ToString() const { - std::string result(static_cast(this)->name()); - return result; -} +struct ARROW_EXPORT BooleanType : public DataType, FixedWidthMeta { + static constexpr Type::type type_id = Type::BOOL; -#define 
PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr Type::type type_enum = Type::ENUM; \ - \ - TYPENAME() : PrimitiveType() {} \ - \ - virtual int value_size() const { return SIZE; } \ - \ - static const char* name() { return NAME; } + BooleanType() : DataType(Type::BOOL) {} -struct ARROW_EXPORT NullType : public PrimitiveType { - PRIMITIVE_DECL(NullType, void, NA, 0, "null"); + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; + + int bit_width() const override { return 1; } + static std::string name() { return "bool"; } }; -struct ARROW_EXPORT BooleanType : public PrimitiveType { - PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); +struct ARROW_EXPORT UInt8Type : public IntegerTypeImpl { + static std::string name() { return "uint8"; } }; -struct ARROW_EXPORT UInt8Type : public PrimitiveType { - PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8"); +struct ARROW_EXPORT Int8Type : public IntegerTypeImpl { + static std::string name() { return "int8"; } }; -struct ARROW_EXPORT Int8Type : public PrimitiveType { - PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8"); +struct ARROW_EXPORT UInt16Type + : public IntegerTypeImpl { + static std::string name() { return "uint16"; } }; -struct ARROW_EXPORT UInt16Type : public PrimitiveType { - PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16"); +struct ARROW_EXPORT Int16Type : public IntegerTypeImpl { + static std::string name() { return "int16"; } }; -struct ARROW_EXPORT Int16Type : public PrimitiveType { - PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16"); +struct ARROW_EXPORT UInt32Type + : public IntegerTypeImpl { + static std::string name() { return "uint32"; } }; -struct ARROW_EXPORT UInt32Type : public PrimitiveType { - PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32"); +struct ARROW_EXPORT Int32Type : public IntegerTypeImpl { + static std::string name() { return "int32"; } }; -struct ARROW_EXPORT Int32Type : public PrimitiveType { - PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32"); +struct ARROW_EXPORT UInt64Type + : public IntegerTypeImpl { + static std::string name() { return "uint64"; } }; -struct ARROW_EXPORT UInt64Type : public PrimitiveType { - PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64"); +struct ARROW_EXPORT Int64Type : public IntegerTypeImpl { + static std::string name() { return "int64"; } }; -struct ARROW_EXPORT Int64Type : public PrimitiveType { - PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64"); +struct ARROW_EXPORT HalfFloatType + : public CTypeImpl, + public FloatingPointMeta { + Precision precision() const override; + static std::string name() { return "halffloat"; } }; -struct ARROW_EXPORT FloatType : public PrimitiveType { - PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float"); +struct ARROW_EXPORT FloatType : public CTypeImpl, + public FloatingPointMeta { + Precision precision() const override; + static std::string name() { return "float"; } }; -struct ARROW_EXPORT DoubleType : public PrimitiveType { - PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); +struct ARROW_EXPORT DoubleType : public CTypeImpl, + public FloatingPointMeta { + Precision precision() const override; + static std::string name() { return "double"; } }; -struct ARROW_EXPORT ListType : public DataType { +struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta { + static constexpr Type::type type_id = Type::LIST; + // List can contain any other logical value type explicit ListType(const std::shared_ptr& 
value_type)
+      : ListType(std::make_shared<Field>("item", value_type)) {}
@@ -252,16 +302,21 @@ struct ARROW_EXPORT ListType : public DataType {
 
   const std::shared_ptr<DataType>& value_type() const { return children_[0]->type; }
 
-  static char const* name() { return "list"; }
-
+  Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+
+  static std::string name() { return "list"; }
 };
 
 // BinaryType represents lists of 1-byte values.
-struct ARROW_EXPORT BinaryType : public DataType {
+struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta {
+  static constexpr Type::type type_id = Type::BINARY;
+
   BinaryType() : BinaryType(Type::BINARY) {}
-  static char const* name() { return "binary"; }
+
+  Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+  static std::string name() { return "binary"; }
 
  protected:
   // Allow subclasses to change the logical type.
@@ -270,25 +325,160 @@
 
 // UTF encoded strings
 struct ARROW_EXPORT StringType : public BinaryType {
-  StringType() : BinaryType(Type::STRING) {}
+  static constexpr Type::type type_id = Type::STRING;
 
-  static char const* name() { return "string"; }
+  StringType() : BinaryType(Type::STRING) {}
 
+  Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+  static std::string name() { return "utf8"; }
 };
 
-struct ARROW_EXPORT StructType : public DataType {
+struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta {
+  static constexpr Type::type type_id = Type::STRUCT;
+
   explicit StructType(const std::vector<std::shared_ptr<Field>>& fields)
       : DataType(Type::STRUCT) {
     children_ = fields;
   }
 
+  Status Accept(TypeVisitor* visitor) const override;
   std::string ToString() const override;
+  static std::string name() { return "struct"; }
+};
+
+struct ARROW_EXPORT DecimalType : public DataType {
+  static constexpr Type::type type_id = Type::DECIMAL;
+
+  explicit DecimalType(int precision_, int scale_)
+      : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {}
+  int precision;
+  int scale;
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override;
+  static std::string name() { return "decimal"; }
+};
+
+enum class UnionMode : char { SPARSE, DENSE };
+
+struct ARROW_EXPORT UnionType : public DataType {
+  static constexpr Type::type type_id = Type::UNION;
+
+  UnionType(const std::vector<std::shared_ptr<Field>>& child_fields,
+      const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE)
+      : DataType(Type::UNION), mode(mode), type_ids(type_ids) {
+    children_ = child_fields;
+  }
+
+  std::string ToString() const override;
+  static std::string name() { return "union"; }
+  Status Accept(TypeVisitor* visitor) const override;
+
+  UnionMode mode;
+  std::vector<uint8_t> type_ids;
+};
+
+struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta {
+  static constexpr Type::type type_id = Type::DATE;
+
+  DateType() : DataType(Type::DATE) {}
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override { return name(); }
+  static std::string name() { return "date"; }
+};
+
+enum class TimeUnit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 };
+
+struct ARROW_EXPORT TimeType : public DataType {
+  static constexpr Type::type type_id = Type::TIME;
+  using Unit = TimeUnit;
+
+  TimeUnit unit;
+
+  explicit TimeType(TimeUnit unit = TimeUnit::MILLI) : DataType(Type::TIME), unit(unit) {}
+  TimeType(const TimeType& other) : TimeType(other.unit) {}
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override { return name(); }
+  static std::string name() { return "time"; }
+};
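+
+// Illustrative note (not in the original patch): TimeType above and
+// TimestampType below carry their resolution as instance state, so two
+// timestamp types share Type::TIMESTAMP while differing in unit, e.g.
+//
+//   TimestampType millis;                  // defaults to TimeUnit::MILLI
+//   TimestampType nanos(TimeUnit::NANO);
+//   // millis.type == nanos.type, but millis.unit != nanos.unit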
+
+struct ARROW_EXPORT TimestampType : public DataType, public FixedWidthMeta {
+  using Unit = TimeUnit;
+
+  typedef int64_t c_type;
+  static constexpr Type::type type_id = Type::TIMESTAMP;
+
+  int bit_width() const override { return sizeof(int64_t) * 8; }
+
+  TimeUnit unit;
+
+  explicit TimestampType(TimeUnit unit = TimeUnit::MILLI)
+      : DataType(Type::TIMESTAMP), unit(unit) {}
+
+  TimestampType(const TimestampType& other) : TimestampType(other.unit) {}
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override { return name(); }
+  static std::string name() { return "timestamp"; }
+};
+
+struct ARROW_EXPORT IntervalType : public DataType, public FixedWidthMeta {
+  enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 };
+
+  typedef int64_t c_type;
+  static constexpr Type::type type_id = Type::INTERVAL;
+
+  int bit_width() const override { return sizeof(int64_t) * 8; }
+
+  Unit unit;
+
+  explicit IntervalType(Unit unit = Unit::YEAR_MONTH)
+      : DataType(Type::INTERVAL), unit(unit) {}
+
+  IntervalType(const IntervalType& other) : IntervalType(other.unit) {}
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override { return name(); }
+  static std::string name() { return "interval"; }
+};
 
-// These will be defined elsewhere
-template <typename T>
-struct TypeTraits {};
+// Factory functions
+
+std::shared_ptr<DataType> ARROW_EXPORT null();
+std::shared_ptr<DataType> ARROW_EXPORT boolean();
+std::shared_ptr<DataType> ARROW_EXPORT int8();
+std::shared_ptr<DataType> ARROW_EXPORT int16();
+std::shared_ptr<DataType> ARROW_EXPORT int32();
+std::shared_ptr<DataType> ARROW_EXPORT int64();
+std::shared_ptr<DataType> ARROW_EXPORT uint8();
+std::shared_ptr<DataType> ARROW_EXPORT uint16();
+std::shared_ptr<DataType> ARROW_EXPORT uint32();
+std::shared_ptr<DataType> ARROW_EXPORT uint64();
+std::shared_ptr<DataType> ARROW_EXPORT float16();
+std::shared_ptr<DataType> ARROW_EXPORT float32();
+std::shared_ptr<DataType> ARROW_EXPORT float64();
+std::shared_ptr<DataType> ARROW_EXPORT utf8();
+std::shared_ptr<DataType> ARROW_EXPORT binary();
+
+std::shared_ptr<DataType> ARROW_EXPORT list(const std::shared_ptr<Field>& value_type);
+std::shared_ptr<DataType> ARROW_EXPORT list(const std::shared_ptr<DataType>& value_type);
+
+std::shared_ptr<DataType> ARROW_EXPORT date();
+std::shared_ptr<DataType> ARROW_EXPORT timestamp(TimeUnit unit);
+std::shared_ptr<DataType> ARROW_EXPORT time(TimeUnit unit);
+
+std::shared_ptr<DataType> ARROW_EXPORT struct_(
+    const std::vector<std::shared_ptr<Field>>& fields);
+
+std::shared_ptr<DataType> ARROW_EXPORT union_(
+    const std::vector<std::shared_ptr<Field>>& child_fields,
+    const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE);
+
+std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name,
+    const std::shared_ptr<DataType>& type, bool nullable = true, int64_t dictionary = 0);
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
new file mode 100644
index 00000000000..6d660f4fdee
--- /dev/null
+++ b/cpp/src/arrow/type_fwd.h
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_TYPE_FWD_H +#define ARROW_TYPE_FWD_H + +namespace arrow { + +class Status; + +struct DataType; +class Array; +class ArrayBuilder; +struct Field; + +class Buffer; +class MemoryPool; +class RecordBatch; +class Schema; + +struct NullType; +class NullArray; + +struct BooleanType; +class BooleanArray; +class BooleanBuilder; + +struct BinaryType; +class BinaryArray; +class BinaryBuilder; + +struct StringType; +class StringArray; +class StringBuilder; + +struct ListType; +class ListArray; +class ListBuilder; + +struct StructType; +class StructArray; +class StructBuilder; + +struct DecimalType; +class DecimalArray; + +struct UnionType; +class UnionArray; + +template +class NumericArray; + +template +class NumericBuilder; + +#define _NUMERIC_TYPE_DECL(KLASS) \ + struct KLASS##Type; \ + using KLASS##Array = NumericArray; \ + using KLASS##Builder = NumericBuilder; + +_NUMERIC_TYPE_DECL(Int8); +_NUMERIC_TYPE_DECL(Int16); +_NUMERIC_TYPE_DECL(Int32); +_NUMERIC_TYPE_DECL(Int64); +_NUMERIC_TYPE_DECL(UInt8); +_NUMERIC_TYPE_DECL(UInt16); +_NUMERIC_TYPE_DECL(UInt32); +_NUMERIC_TYPE_DECL(UInt64); +_NUMERIC_TYPE_DECL(HalfFloat); +_NUMERIC_TYPE_DECL(Float); +_NUMERIC_TYPE_DECL(Double); + +#undef _NUMERIC_TYPE_DECL + +struct DateType; +class DateArray; + +struct TimeType; +class TimeArray; + +struct TimestampType; +using TimestampArray = NumericArray; + +struct IntervalType; +using IntervalArray = NumericArray; + +class TypeVisitor { + public: + virtual Status Visit(const NullType& type) = 0; + virtual Status Visit(const BooleanType& type) = 0; + virtual Status Visit(const Int8Type& type) = 0; + virtual Status Visit(const Int16Type& type) = 0; + virtual Status Visit(const Int32Type& type) = 0; + virtual Status Visit(const Int64Type& type) = 0; + virtual Status Visit(const UInt8Type& type) = 0; + virtual Status Visit(const UInt16Type& type) = 0; + virtual Status Visit(const UInt32Type& type) = 0; + virtual Status Visit(const UInt64Type& type) = 0; + virtual Status Visit(const HalfFloatType& type) = 0; + virtual Status Visit(const FloatType& type) = 0; + virtual Status Visit(const DoubleType& type) = 0; + virtual Status Visit(const StringType& type) = 0; + virtual Status Visit(const BinaryType& type) = 0; + virtual Status Visit(const DateType& type) = 0; + virtual Status Visit(const TimeType& type) = 0; + virtual Status Visit(const TimestampType& type) = 0; + virtual Status Visit(const IntervalType& type) = 0; + virtual Status Visit(const DecimalType& type) = 0; + virtual Status Visit(const ListType& type) = 0; + virtual Status Visit(const StructType& type) = 0; + virtual Status Visit(const UnionType& type) = 0; +}; + +class ArrayVisitor { + public: + virtual Status Visit(const NullArray& array) = 0; + virtual Status Visit(const BooleanArray& array) = 0; + virtual Status Visit(const Int8Array& array) = 0; + virtual Status Visit(const Int16Array& array) = 0; + virtual Status Visit(const Int32Array& array) = 0; + virtual Status Visit(const Int64Array& array) = 0; + virtual Status Visit(const UInt8Array& array) = 0; + virtual Status Visit(const UInt16Array& array) = 0; + virtual 
Status Visit(const UInt32Array& array) = 0; + virtual Status Visit(const UInt64Array& array) = 0; + virtual Status Visit(const HalfFloatArray& array) = 0; + virtual Status Visit(const FloatArray& array) = 0; + virtual Status Visit(const DoubleArray& array) = 0; + virtual Status Visit(const StringArray& array) = 0; + virtual Status Visit(const BinaryArray& array) = 0; + virtual Status Visit(const DateArray& array) = 0; + virtual Status Visit(const TimeArray& array) = 0; + virtual Status Visit(const TimestampArray& array) = 0; + virtual Status Visit(const IntervalArray& array) = 0; + virtual Status Visit(const DecimalArray& array) = 0; + virtual Status Visit(const ListArray& array) = 0; + virtual Status Visit(const StructArray& array) = 0; + virtual Status Visit(const UnionArray& array) = 0; +}; + +} // namespace arrow + +#endif // ARROW_TYPE_FWD_H diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h new file mode 100644 index 00000000000..bbb807488e3 --- /dev/null +++ b/cpp/src/arrow/type_traits.h @@ -0,0 +1,197 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
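+//
+// Usage sketch for the traits below (illustrative only): templated code such
+// as the JSON array reader selects builder and array classes at compile time
+// via TypeTraits, e.g.
+//
+//   template <typename T>
+//   Status MakeBuilderFor(MemoryPool* pool, const std::shared_ptr<DataType>& type) {
+//     typename TypeTraits<T>::BuilderType builder(pool, type);  // e.g. Int32Builder
+//     return Status::OK();
+//   }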
+ +#ifndef ARROW_TYPE_TRAITS_H +#define ARROW_TYPE_TRAITS_H + +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/bit-util.h" + +namespace arrow { + +template +struct TypeTraits {}; + +template <> +struct TypeTraits { + using ArrayType = UInt8Array; + using BuilderType = UInt8Builder; + static inline int bytes_required(int elements) { return elements; } +}; + +template <> +struct TypeTraits { + using ArrayType = Int8Array; + using BuilderType = Int8Builder; + static inline int bytes_required(int elements) { return elements; } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt16Array; + using BuilderType = UInt16Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int16Array; + using BuilderType = Int16Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt32Array; + using BuilderType = UInt32Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int32Array; + using BuilderType = Int32Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt64Array; + using BuilderType = UInt64Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int64Array; + using BuilderType = Int64Builder; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = TimestampArray; + // using BuilderType = TimestampBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = HalfFloatArray; + using BuilderType = HalfFloatBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = FloatArray; + using BuilderType = FloatBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(float); } +}; + +template <> +struct TypeTraits { + using ArrayType = DoubleArray; + using BuilderType = DoubleBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(double); } +}; + +template <> +struct TypeTraits { + using ArrayType = BooleanArray; + using BuilderType = BooleanBuilder; + + static inline int bytes_required(int elements) { + return BitUtil::BytesForBits(elements); + } +}; + +template <> +struct TypeTraits { + using ArrayType = StringArray; + using BuilderType = StringBuilder; +}; + +template <> +struct TypeTraits { + using ArrayType = BinaryArray; + using BuilderType = BinaryBuilder; +}; + +// Not all type classes have a c_type +template +struct as_void { + using type = void; +}; + +// The partial specialization will match if T has the ATTR_NAME member +#define GET_ATTR(ATTR_NAME, DEFAULT) \ + template \ + struct GetAttr_##ATTR_NAME { \ + using type = DEFAULT; \ + }; \ + \ + template \ + struct GetAttr_##ATTR_NAME::type> { \ + using type = typename T::ATTR_NAME; \ + }; + +GET_ATTR(c_type, void); +GET_ATTR(TypeClass, void); + +#undef GET_ATTR + +#define PRIMITIVE_TRAITS(T) \ + using TypeClass = typename std::conditional::value, T, \ + typename 
GetAttr_TypeClass::type>::type; \ + using c_type = typename GetAttr_c_type::type; + +template +struct IsUnsignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_unsigned::value; +}; + +template +struct IsSignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_signed::value; +}; + +template +struct IsFloatingPoint { + PRIMITIVE_TRAITS(T); + static constexpr bool value = std::is_floating_point::value; +}; + +} // namespace arrow + +#endif // ARROW_TYPE_TRAITS_H diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 9f781698982..6d59acfdf2e 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -21,7 +21,6 @@ # Headers: top level install(FILES - collection.h construct.h datetime.h decimal.h diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h deleted file mode 100644 index 1712030203f..00000000000 --- a/cpp/src/arrow/types/collection.h +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_TYPES_COLLECTION_H -#define ARROW_TYPES_COLLECTION_H - -#include -#include - -#include "arrow/type.h" - -namespace arrow { - -template -struct CollectionType : public DataType { - std::vector child_types_; - - CollectionType() : DataType(T) {} - - const TypePtr& child(int i) const { return child_types_[i]; } - - int num_children() const { return child_types_.size(); } -}; - -} // namespace arrow - -#endif // ARROW_TYPES_COLLECTION_H diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 241a126d100..a8f86392312 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -22,41 +22,6 @@ #include "arrow/type.h" -namespace arrow { - -struct DateType : public DataType { - enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; - - Unit unit; - - explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {} - - DateType(const DateType& other) : DateType(other.unit) {} - - static char const* name() { return "date"; } -}; - -struct ARROW_EXPORT TimestampType : public DataType { - enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; - - typedef int64_t c_type; - static constexpr Type::type type_enum = Type::TIMESTAMP; - - int value_size() const override { return sizeof(int64_t); } - - Unit unit; - - explicit TimestampType(Unit unit = Unit::MILLI) - : DataType(Type::TIMESTAMP), unit(unit) {} - - TimestampType(const TimestampType& other) : TimestampType(other.unit) {} - virtual ~TimestampType() {} - - std::string ToString() const override { return "timestamp"; } - - static char const* name() { return "timestamp"; } -}; - -} // namespace arrow +namespace arrow {} // namespace arrow #endif // ARROW_TYPES_DATETIME_H diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 6c497c597d9..b3ea3a56d80 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -23,18 +23,6 @@ #include "arrow/type.h" #include "arrow/util/visibility.h" -namespace arrow { - -struct ARROW_EXPORT DecimalType : public DataType { - explicit DecimalType(int precision_, int scale_) - : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} - int precision; - int scale; - static char const* name() { return "decimal"; } - - std::string ToString() const override; -}; - -} // namespace arrow +namespace arrow {} // namespace arrow #endif // ARROW_TYPES_DECIMAL_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 12c539495a2..cb9a8c12d8a 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -141,7 +141,7 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_EQ(0, result_->offsets()[0]); + ASSERT_EQ(0, result_->raw_offsets()[0]); ASSERT_EQ(0, result_->offset(1)); ASSERT_EQ(0, result_->offset(2)); diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 4b1e8214727..d86563253bd 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -155,4 +155,8 @@ void ListBuilder::Reset() { null_bitmap_ = nullptr; } +Status ListArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 9440ffed4bf..bd93e8fdcfa 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -39,6 +39,8 @@ class MemoryPool; class ARROW_EXPORT ListArray : public Array { public: + using TypeClass = ListType; + 
ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) @@ -56,13 +58,13 @@ class ARROW_EXPORT ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. const std::shared_ptr& values() const { return values_; } - const std::shared_ptr offset_buffer() const { + std::shared_ptr offsets() const { return std::static_pointer_cast(offset_buffer_); } const std::shared_ptr& value_type() const { return values_->type(); } - const int32_t* offsets() const { return offsets_; } + const int32_t* raw_offsets() const { return offsets_; } int32_t offset(int i) const { return offsets_[i]; } @@ -76,6 +78,8 @@ class ARROW_EXPORT ListArray : public Array { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const ArrayPtr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + protected: std::shared_ptr offset_buffer_; const int32_t* offsets_; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index e47f6dc74fb..bdc8ec00be0 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/types/construct.h" #include "arrow/types/primitive.h" #include "arrow/types/test-common.h" @@ -41,15 +42,15 @@ namespace arrow { class Array; -#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ - TEST(TypesTest, TestPrimitive_##ENUM) { \ - KLASS tp; \ - \ - ASSERT_EQ(tp.type, Type::ENUM); \ - ASSERT_EQ(tp.name(), string(NAME)); \ - \ - KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, Type::ENUM); \ +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, TestPrimitive_##ENUM) { \ + KLASS tp; \ + \ + ASSERT_EQ(tp.type, Type::ENUM); \ + ASSERT_EQ(tp.ToString(), string(NAME)); \ + \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -243,7 +244,8 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types Primitives; + PInt32, PInt64, PFloat, PDouble> + Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); @@ -311,20 +313,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); } -template -Status MakeArray(const vector& valid_bytes, const vector& draws, int size, - Builder* builder, ArrayPtr* out) { - // Append the first 1000 - for (int i = 0; i < size; ++i) { - if (valid_bytes[i] > 0) { - RETURN_NOT_OK(builder->Append(draws[i])); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return builder->Finish(out); -} - TYPED_TEST(TestPrimitiveBuilder, Equality) { DECL_T(); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index d2288bafa71..14667ee5b6e 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -19,6 +19,7 @@ #include +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" @@ -48,13 +49,14 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; - int value_size = type_->value_size(); - DCHECK_GT(value_size, 0); + auto size_meta = dynamic_cast(type_.get()); + int value_byte_size = size_meta->bit_width() / 8; + 
DCHECK_GT(value_byte_size, 0); for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; } - this_data += value_size; - other_data += value_size; + if (!IsNull(i) && memcmp(this_data, other_data, value_byte_size)) { return false; } + this_data += value_byte_size; + other_data += value_byte_size; } return true; } else { @@ -70,6 +72,11 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return EqualsExact(*static_cast(arr.get())); } +template +Status NumericArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + template class NumericArray; template class NumericArray; template class NumericArray; @@ -79,9 +86,9 @@ template class NumericArray; template class NumericArray; template class NumericArray; template class NumericArray; +template class NumericArray; template class NumericArray; template class NumericArray; -template class NumericArray; template Status PrimitiveBuilder::Init(int32_t capacity) { @@ -145,8 +152,65 @@ Status PrimitiveBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } -template <> -Status PrimitiveBuilder::Append( +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + +Status BooleanBuilder::Init(int32_t capacity) { + RETURN_NOT_OK(ArrayBuilder::Init(capacity)); + data_ = std::make_shared(pool_); + + int64_t nbytes = BitUtil::BytesForBits(capacity); + RETURN_NOT_OK(data_->Resize(nbytes)); + // TODO(emkornfield) valgrind complains without this + memset(data_->mutable_data(), 0, nbytes); + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return Status::OK(); +} + +Status BooleanBuilder::Resize(int32_t capacity) { + // XXX: Set floor size for now + if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + + if (capacity_ == 0) { + RETURN_NOT_OK(Init(capacity)); + } else { + RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); + const int64_t old_bytes = data_->size(); + const int64_t new_bytes = BitUtil::BytesForBits(capacity); + + RETURN_NOT_OK(data_->Resize(new_bytes)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } + return Status::OK(); +} + +Status BooleanBuilder::Finish(std::shared_ptr* out) { + const int64_t bytes_required = BitUtil::BytesForBits(length_); + + if (bytes_required > 0 && bytes_required < data_->size()) { + // Trim buffers + RETURN_NOT_OK(data_->Resize(bytes_required)); + } + *out = std::make_shared(type_, length_, data_, null_count_, null_bitmap_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status BooleanBuilder::Append( const uint8_t* values, int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); @@ -168,19 +232,6 @@ Status PrimitiveBuilder::Append( return Status::OK(); } -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; 
-template class PrimitiveBuilder; -template class PrimitiveBuilder; - BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) : PrimitiveArray( @@ -235,4 +286,8 @@ bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, return true; } +Status BooleanArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index c71df584ffe..a5a3704e2d2 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/types/datetime.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -54,9 +55,10 @@ class ARROW_EXPORT PrimitiveArray : public Array { const uint8_t* raw_data_; }; -template +template class ARROW_EXPORT NumericArray : public PrimitiveArray { public: + using TypeClass = TYPE; using value_type = typename TypeClass::c_type; NumericArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) @@ -88,29 +90,15 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { return reinterpret_cast(raw_data_); } + Status Accept(ArrayVisitor* visitor) const override; + value_type Value(int i) const { return raw_data()[i]; } }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \ - using NAME = NumericArray; \ - extern template class ARROW_EXPORT NumericArray; - -NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type); -NUMERIC_ARRAY_DECL(Int8Array, Int8Type); -NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type); -NUMERIC_ARRAY_DECL(Int16Array, Int16Type); -NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type); -NUMERIC_ARRAY_DECL(Int32Array, Int32Type); -NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type); -NUMERIC_ARRAY_DECL(Int64Array, Int64Type); -NUMERIC_ARRAY_DECL(TimestampArray, TimestampType); -NUMERIC_ARRAY_DECL(FloatArray, FloatType); -NUMERIC_ARRAY_DECL(DoubleArray, DoubleType); - template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: - typedef typename Type::c_type value_type; + using value_type = typename Type::c_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), data_(nullptr) {} @@ -183,101 +171,27 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::raw_data_; }; -template <> -struct TypeTraits { - typedef UInt8Array ArrayType; - - static inline int bytes_required(int elements) { return elements; } -}; - -template <> -struct TypeTraits { - typedef Int8Array ArrayType; - - static inline int bytes_required(int elements) { return elements; } -}; - -template <> -struct TypeTraits { - typedef UInt16Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } -}; - -template <> -struct TypeTraits { - typedef Int16Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } -}; - -template <> -struct TypeTraits { - typedef UInt32Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } -}; - -template <> -struct TypeTraits { - typedef Int32Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } -}; - -template <> -struct TypeTraits { - typedef UInt64Array ArrayType; - - static inline int bytes_required(int elements) 
{ return elements * sizeof(uint64_t); } -}; - -template <> -struct TypeTraits { - typedef Int64Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } -}; - -template <> -struct TypeTraits { - typedef TimestampArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } -}; -template <> - -struct TypeTraits { - typedef FloatArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(float); } -}; - -template <> -struct TypeTraits { - typedef DoubleArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(double); } -}; - // Builders -typedef NumericBuilder UInt8Builder; -typedef NumericBuilder UInt16Builder; -typedef NumericBuilder UInt32Builder; -typedef NumericBuilder UInt64Builder; +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; -typedef NumericBuilder Int8Builder; -typedef NumericBuilder Int16Builder; -typedef NumericBuilder Int32Builder; -typedef NumericBuilder Int64Builder; -typedef NumericBuilder TimestampBuilder; +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; -typedef NumericBuilder FloatBuilder; -typedef NumericBuilder DoubleBuilder; +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: + using TypeClass = BooleanType; + BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); BooleanArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, @@ -288,28 +202,36 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const ArrayPtr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } }; -template <> -struct TypeTraits { - typedef BooleanArray ArrayType; - - static inline int bytes_required(int elements) { - return BitUtil::BytesForBits(elements); - } -}; - -class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) - : PrimitiveBuilder(pool, type) {} + : ArrayBuilder(pool, type), data_(nullptr) {} virtual ~BooleanBuilder() {} - using PrimitiveBuilder::Append; + using ArrayBuilder::Advance; + + // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int32_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + std::shared_ptr data() const { return data_; } // Scalar append Status Append(bool val) { @@ -324,9 +246,39 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { return Status::OK(); } - Status Append(uint8_t val) { return Append(static_cast(val)); } + // Vector append + // + // If passed, valid_bytes is of equal length to 
values, and any zero byte + // will be considered as a null for that slot + Status Append( + const uint8_t* values, int32_t length, const uint8_t* valid_bytes = nullptr); + + Status Finish(std::shared_ptr* out) override; + Status Init(int32_t capacity) override; + + // Increase the capacity of the builder to accommodate at least the indicated + // number of elements + Status Resize(int32_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; }; +// Only instantiate these templates once +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; + } // namespace arrow #endif // ARROW_TYPES_PRIMITIVE_H diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index af87a14a8b3..3c4b12b7bc7 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -47,7 +47,7 @@ TEST(TypesTest, BinaryType) { TEST(TypesTest, TestStringType) { StringType str; ASSERT_EQ(str.type, Type::STRING); - ASSERT_EQ(str.name(), std::string("string")); + ASSERT_EQ(str.ToString(), std::string("string")); } // ---------------------------------------------------------------------- @@ -66,8 +66,8 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - value_buf_ = test::to_buffer(chars_); - offsets_buf_ = test::to_buffer(offsets_); + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); @@ -131,7 +131,7 @@ TEST_F(TestStringContainer, TestGetString) { TEST_F(TestStringContainer, TestEmptyStringComparison) { offsets_ = {0, 0, 0, 0, 0, 0}; - offsets_buf_ = test::to_buffer(offsets_); + offsets_buf_ = test::GetBufferFromVector(offsets_); length_ = offsets_.size() - 1; auto strings_a = std::make_shared( @@ -227,8 +227,8 @@ class TestBinaryContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - value_buf_ = test::to_buffer(chars_); - offsets_buf_ = test::to_buffer(offsets_); + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index f6d26df3167..db963dfa0de 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -94,6 +94,10 @@ bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_ return true; } +Status BinaryArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) @@ -104,6 +108,10 @@ Status StringArray::Validate() const { return BinaryArray::Validate(); } +Status StringArray::Accept(ArrayVisitor* visitor) const { + return 
visitor->Visit(*this); +} + // This used to be a static member variable of BinaryBuilder, but it can cause // valgrind to report a (spurious?) memory leak when needed in other shared // libraries. The problem came up while adding explicit visibility to libarrow @@ -122,8 +130,8 @@ Status BinaryBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offset_buffer(), - values->data(), list->null_count(), list->null_bitmap()); + *out = std::make_shared(list->length(), list->offsets(), values->data(), + list->null_count(), list->null_bitmap()); return Status::OK(); } @@ -134,8 +142,8 @@ Status StringBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offset_buffer(), - values->data(), list->null_count(), list->null_bitmap()); + *out = std::make_shared(list->length(), list->offsets(), values->data(), + list->null_count(), list->null_bitmap()); return Status::OK(); } diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index aaba49c6023..c8752439f16 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -37,6 +37,8 @@ class MemoryPool; class ARROW_EXPORT BinaryArray : public Array { public: + using TypeClass = BinaryType; + BinaryArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -60,6 +62,8 @@ class ARROW_EXPORT BinaryArray : public Array { std::shared_ptr data() const { return data_buffer_; } std::shared_ptr offsets() const { return offset_buffer_; } + const int32_t* raw_offsets() const { return offsets_; } + int32_t offset(int i) const { return offsets_[i]; } // Neither of these functions will perform boundschecking @@ -73,6 +77,8 @@ class ARROW_EXPORT BinaryArray : public Array { Status Validate() const override; + Status Accept(ArrayVisitor* visitor) const override; + private: std::shared_ptr offset_buffer_; const int32_t* offsets_; @@ -83,6 +89,8 @@ class ARROW_EXPORT BinaryArray : public Array { class ARROW_EXPORT StringArray : public BinaryArray { public: + using TypeClass = StringType; + StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -96,6 +104,8 @@ class ARROW_EXPORT StringArray : public BinaryArray { } Status Validate() const override; + + Status Accept(ArrayVisitor* visitor) const override; }; // BinaryBuilder : public ListBuilder @@ -109,6 +119,12 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return byte_builder_->Append(value, length); } + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } + Status Finish(std::shared_ptr* out) override; protected: @@ -121,13 +137,9 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool, const TypePtr& type) : BinaryBuilder(pool, type) {} - Status Finish(std::shared_ptr* out) override; - - Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } + using BinaryBuilder::Append; - Status Append(const char* value, int32_t length) { - return 
BinaryBuilder::Append(reinterpret_cast(value), length); - } + Status Finish(std::shared_ptr* out) override; Status Append(const std::vector& values, uint8_t* null_bytes); }; diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 8e82c389a94..197d7d4ad1f 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -80,7 +80,7 @@ void ValidateBasicStructArray(const StructArray* result, ASSERT_EQ(4, list_char_arr->length()); ASSERT_EQ(10, list_char_arr->values()->length()); for (size_t i = 0; i < list_offsets.size(); ++i) { - ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]); + ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]); } for (size_t i = 0; i < list_values.size(); ++i) { ASSERT_EQ(list_values[i], char_arr->Value(i)); diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 369c29d15ef..0e0db23544b 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -87,6 +87,10 @@ Status StructArray::Validate() const { return Status::OK(); } +Status StructArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + Status StructBuilder::Finish(std::shared_ptr* out) { std::vector> fields(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 65b8daf214a..035af051325 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -31,6 +31,8 @@ namespace arrow { class ARROW_EXPORT StructArray : public Array { public: + using TypeClass = StructType; + StructArray(const TypePtr& type, int32_t length, std::vector& field_arrays, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) : Array(type, length, null_count, null_bitmap) { @@ -55,6 +57,8 @@ class ARROW_EXPORT StructArray : public Array { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const std::shared_ptr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + protected: // The child arrays corresponding to each field of the struct data type. 
std::vector field_arrays_; diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 1957636b141..6e6ab85ad4e 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -24,6 +24,8 @@ #include "gtest/gtest.h" +#include "arrow/array.h" +#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/memory-pool.h" @@ -49,6 +51,20 @@ class TestBuilder : public ::testing::Test { unique_ptr builder_nn_; }; +template +Status MakeArray(const std::vector& valid_bytes, const std::vector& values, + int size, Builder* builder, ArrayPtr* out) { + // Append the first 1000 + for (int i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + RETURN_NOT_OK(builder->Append(values[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return builder->Finish(out); +} + } // namespace arrow #endif // ARROW_TYPES_TEST_COMMON_H diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc index c891b4a5357..cc2934b2e4a 100644 --- a/cpp/src/arrow/types/union.cc +++ b/cpp/src/arrow/types/union.cc @@ -24,25 +24,4 @@ #include "arrow/type.h" -namespace arrow { - -static inline std::string format_union(const std::vector& child_types) { - std::stringstream s; - s << "union<"; - for (size_t i = 0; i < child_types.size(); ++i) { - if (i) { s << ", "; } - s << child_types[i]->ToString(); - } - s << ">"; - return s.str(); -} - -std::string DenseUnionType::ToString() const { - return format_union(child_types_); -} - -std::string SparseUnionType::ToString() const { - return format_union(child_types_); -} - -} // namespace arrow +namespace arrow {} // namespace arrow diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index d2ee9bde04d..44f39cc6994 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -24,32 +24,11 @@ #include "arrow/array.h" #include "arrow/type.h" -#include "arrow/types/collection.h" namespace arrow { class Buffer; -struct DenseUnionType : public CollectionType { - typedef CollectionType Base; - - explicit DenseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } - - virtual std::string ToString() const; -}; - -struct SparseUnionType : public CollectionType { - typedef CollectionType Base; - - explicit SparseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } - - virtual std::string ToString() const; -}; - class UnionArray : public Array { protected: // The data are types encoded as int16 diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 06ee8411e28..b22f07dd634 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,9 +118,9 @@ class CerrLog { class FatalLog : public CerrLog { public: explicit FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} // NOLINT + : CerrLog(ARROW_FATAL){} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } diff --git a/format/Metadata.md b/format/Metadata.md index 653a4c73e83..a4878f34707 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -98,6 +98,11 @@ Union: "typeIds" : [ /* integer */ ] } ``` + +The `typeIds` field in the Union are the codes used to denote each type, which +may be different from the index of the child array. This is so that the union +type ids do not have to be enumerated from 0. 
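+
+A reader therefore maps each value in the union's type buffer through
+`typeIds` to find the matching child, rather than using the value directly
+as a child index. A minimal lookup sketch (illustrative C++, with assumed
+variable names; not part of the specification):
+
+```
+// typeIds = [5, 10]: code 5 selects child 0, code 10 selects child 1.
+int ChildIndex(const std::vector<int>& type_ids, int code) {
+  for (int i = 0; i < static_cast<int>(type_ids.size()); ++i) {
+    if (type_ids[i] == code) { return i; }
+  }
+  return -1;  // unknown type id
+}
+```
+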
+ Int: ``` { From 58bd7bedc63d66d5898297bab25b54dfb67665db Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 18 Nov 2016 12:48:11 -0800 Subject: [PATCH 202/210] implement dense unions --- cpp/src/arrow/builder.h | 9 +++++ cpp/src/arrow/ipc/adapter.cc | 24 +++++++++++++ cpp/src/arrow/ipc/ipc-metadata-test.cc | 17 ++++++++++ cpp/src/arrow/ipc/metadata-internal.cc | 32 ++++++++++++++++-- cpp/src/arrow/type.cc | 10 ++++++ cpp/src/arrow/type.h | 5 +++ cpp/src/arrow/types/union.cc | 47 +++++++++++++++++++++++++- cpp/src/arrow/types/union.h | 38 ++++++++++++++++----- 8 files changed, 170 insertions(+), 12 deletions(-) diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index cef17e5aaba..3babf4ba78e 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -130,6 +130,15 @@ class ARROW_EXPORT ArrayBuilder { DISALLOW_COPY_AND_ASSIGN(ArrayBuilder); }; +class ARROW_EXPORT NullArrayBuilder : public ArrayBuilder { + public: + explicit NullArrayBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type) {} + virtual ~NullArrayBuilder() {}; + Status Finish(std::shared_ptr* out) override { + return Status::OK(); + } +}; + } // namespace arrow #endif // ARROW_BUILDER_H_ diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index da718c08d54..4628ace3abf 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -37,6 +37,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/types/union.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" @@ -115,6 +116,13 @@ Status VisitArray(const Array* arr, std::vector* field_nodes RETURN_NOT_OK( VisitArray(field.get(), field_nodes, buffers, max_recursion_depth - 1)); } + } else if (arr->type_enum() == Type::UNION) { + const auto union_arr = static_cast(arr); + buffers->push_back(union_arr->types()); + buffers->push_back(union_arr->offset_buf()); + for (auto& child_arr : union_arr->children()) { + RETURN_NOT_OK(VisitArray(child_arr.get(), field_nodes, buffers, max_recursion_depth - 1)); + } } else { return Status::NotImplemented("Unrecognized type"); } @@ -363,6 +371,22 @@ class RecordBatchReader::RecordBatchReaderImpl { out->reset(new StructArray( type, field_meta.length, fields, field_meta.null_count, null_bitmap)); return Status::OK(); + } else if (type->type == Type::UNION) { + std::shared_ptr types; + RETURN_NOT_OK(GetBuffer(buffer_index_++, &types)); + std::shared_ptr offset_buf; + RETURN_NOT_OK(GetBuffer(buffer_index_++, &offset_buf)); + auto union_type = std::dynamic_pointer_cast(type); + const int num_children = union_type->num_children(); + std::vector results; + for (int child_idx = 0; child_idx < num_children; ++child_idx) { + std::shared_ptr result; + RETURN_NOT_OK(NextArray(union_type->child(child_idx).get(), max_recursion_depth - 1, &result)); + results.push_back(result); + } + out->reset(new UnionArray( + type, field_meta.length, results, types, offset_buf, field_meta.null_count, null_bitmap)); + return Status::OK(); } return Status::NotImplemented("Non-primitive types not complete yet"); diff --git a/cpp/src/arrow/ipc/ipc-metadata-test.cc b/cpp/src/arrow/ipc/ipc-metadata-test.cc index 1dc39692332..8d101cf586a 100644 --- a/cpp/src/arrow/ipc/ipc-metadata-test.cc +++ b/cpp/src/arrow/ipc/ipc-metadata-test.cc @@ -26,6 +26,7 @@ #include "arrow/schema.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/types/union.h" #include 
"arrow/util/status.h" namespace arrow { @@ -97,6 +98,22 @@ TEST_F(TestSchemaMessage, NestedFields) { CheckRoundtrip(&schema); } +TEST_F(TestSchemaMessage, UnionType) { + auto f0 = std::make_shared("f0", TypePtr(new Int32Type())); + auto f1 = std::make_shared("f1", TypePtr(new Int64Type())); + std::vector type_ids = {}; // TODO(pcm): Implement typeIds + auto ud = TypePtr(new UnionType(std::vector>({f0, f1}), + type_ids, UnionMode::DENSE)); + auto fd = std::make_shared("f", ud); + Schema schema_dense({fd}); + CheckRoundtrip(&schema_dense); + auto us = TypePtr(new UnionType(std::vector>({f0, f1}), + type_ids, UnionMode::SPARSE)); + auto fs = std::make_shared("f", us); + Schema schema_sparse({fs}); + CheckRoundtrip(&schema_sparse); +} + class TestFileFooter : public ::testing::Test { public: void SetUp() {} diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc index 7102012c29a..19a810ac1c6 100644 --- a/cpp/src/arrow/ipc/metadata-internal.cc +++ b/cpp/src/arrow/ipc/metadata-internal.cc @@ -28,6 +28,7 @@ #include "arrow/ipc/Message_generated.h" #include "arrow/schema.h" #include "arrow/type.h" +#include "arrow/types/union.h" #include "arrow/util/buffer.h" #include "arrow/util/status.h" @@ -119,8 +120,20 @@ static Status TypeFromFlatbuffer(flatbuf::Type type, const void* type_data, case flatbuf::Type_Struct_: *out = std::make_shared(children); return Status::OK(); - case flatbuf::Type_Union: - return Status::NotImplemented("Type is not implemented"); + case flatbuf::Type_Union: { + std::vector type_ids = {}; // TODO(pcm): Implement typeIds + auto union_data = static_cast(type_data); + UnionMode mode; + if (union_data->mode() == flatbuf::UnionMode_Sparse) { + mode = UnionMode::SPARSE; + } else if (union_data->mode() == flatbuf::UnionMode_Dense) { + mode = UnionMode::DENSE; + } else { + return Status::Invalid("Unrecognized UnionMode"); + } + *out = std::make_shared(children, type_ids, mode); + } + return Status::OK(); default: return Status::Invalid("Unrecognized type"); } @@ -158,6 +171,18 @@ static Status StructToFlatbuffer(FBB& fbb, const std::shared_ptr& type return Status::OK(); } +static Status UnionToFlatbuffer(FBB& fbb, const std::shared_ptr& type, + std::vector* out_children, Offset* offset) { + auto union_type = std::dynamic_pointer_cast(type); + FieldOffset field; + for (int i = 0; i < union_type->num_children(); ++i) { + RETURN_NOT_OK(FieldToFlatbuffer(fbb, union_type->child(i), &field)); + out_children->push_back(field); + } + *offset = flatbuf::CreateUnion(fbb).Union(); + return Status::OK(); +} + #define INT_TO_FB_CASE(BIT_WIDTH, IS_SIGNED) \ *out_type = flatbuf::Type_Int; \ *offset = IntToFlatbuffer(fbb, BIT_WIDTH, IS_SIGNED); \ @@ -208,6 +233,9 @@ static Status TypeToFlatbuffer(FBB& fbb, const std::shared_ptr& type, case Type::STRUCT: *out_type = flatbuf::Type_Struct_; return StructToFlatbuffer(fbb, type, children, offset); + case Type::UNION: + *out_type = flatbuf::Type_Union; + return UnionToFlatbuffer(fbb, type, children, offset); default: *out_type = flatbuf::Type_NONE; // Make clang-tidy happy std::stringstream ss; diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 589bdadb77c..e82486d0ed2 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -105,6 +105,16 @@ std::string UnionType::ToString() const { return s.str(); } +bool UnionType::Equals(const DataType* other) const { + if (!DataType::Equals(other)) { + return false; + } + const UnionType *union_type = dynamic_cast(other); + return union_type && type_id == 
union_type->type_id
+      && std::equal(type_ids.begin(), type_ids.end(),
+             union_type->type_ids.begin());
+}
+
 int NullType::bit_width() const {
   return 0;
 }
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 5b4d7bc42bd..e2f47f11b07 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -375,6 +375,11 @@ struct ARROW_EXPORT UnionType : public DataType {
   static std::string name() { return "union"; }
 
   Status Accept(TypeVisitor* visitor) const override;
+  bool Equals(const DataType* other) const override;
+  bool Equals(const std::shared_ptr<DataType>& other) const {
+    return Equals(other.get());
+  }
+
   UnionMode mode;
   std::vector type_ids;
 };
diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc
index cc2934b2e4a..2a5be998694 100644
--- a/cpp/src/arrow/types/union.cc
+++ b/cpp/src/arrow/types/union.cc
@@ -23,5 +23,50 @@
 #include
 
 #include "arrow/type.h"
+#include "arrow/util/status.h"
 
-namespace arrow {}  // namespace arrow
+namespace arrow {
+
+bool UnionArray::Equals(const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (!arr) { return false; }
+  if (this->type_enum() != arr->type_enum()) { return false; }
+  if (null_count_ != arr->null_count()) { return false; }
+  return RangeEquals(0, length_, 0, arr);
+}
+
+bool UnionArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+    const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (Type::UNION != arr->type_enum()) { return false; }
+  const auto other = static_cast<const UnionArray*>(arr.get());
+
+  // TODO(pcm): Handle sparse case here
+
+  int32_t i = start_idx;
+  int32_t o_i = other_start_idx;
+  for (size_t c = 0; c < other->children().size(); ++c) {
+    for (int32_t e = 0; e < other->children()[c]->length(); ++e) {
+      if (!children()[c]->RangeEquals(e, e + 1, e, other->children()[c])) {  // FIXME(pcm): fix this
+        return false;
+      }
+      i += 1;
+      o_i += 1;
+      if (i >= end_idx) {
+        return true;
+      }
+    }
+  }
+  return false;  // to make the compiler happy
+}
+
+Status UnionArray::Validate() const {
+  // TODO(pcm): what to do here?
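+  // Note: as written, validation is a stub; the type codes and offsets
+  // buffers are not yet cross-checked against length_ or the children.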
+  return Status::OK();
+}
+
+Status UnionArray::Accept(ArrayVisitor* visitor) const {
+  return visitor->Visit(*this);
+}
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h
index 44f39cc6994..2e56ecf2e99 100644
--- a/cpp/src/arrow/types/union.h
+++ b/cpp/src/arrow/types/union.h
@@ -24,25 +24,45 @@
 #include "arrow/array.h"
 #include "arrow/type.h"
+#include "arrow/types/primitive.h"
 
 namespace arrow {
 
 class Buffer;
 
-class UnionArray : public Array {
+class ARROW_EXPORT UnionArray : public Array {
+ public:
+  UnionArray(const TypePtr& type, int32_t length, std::vector<ArrayPtr>& children,
+      std::shared_ptr<Buffer> types, std::shared_ptr<Buffer> offset_buf,
+      int32_t null_count = 0, std::shared_ptr<Buffer> null_bitmap = nullptr)
+      : Array(type, length, null_count, null_bitmap), types_(types) {
+    type_ = type;
+    children_ = children;
+    offset_buf_ = offset_buf;
+  }
+
+  const std::shared_ptr<Buffer>& types() const { return types_; }
+
+  const std::vector<ArrayPtr>& children() const { return children_; }
+
+  const std::shared_ptr<Buffer>& offset_buf() const { return offset_buf_; }
+
+  Status Validate() const override;
+
+  Status Accept(ArrayVisitor* visitor) const override;
+
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const override;
+
+  ArrayPtr child(int32_t index) const { return children_[index]; }
 
  protected:
   // The data are types encoded as int16
-  Buffer* types_;
+  std::shared_ptr<Buffer> types_;
 
   std::vector<std::shared_ptr<Array>> children_;
+  std::shared_ptr<Buffer> offset_buf_;
 };
 
-class DenseUnionArray : public UnionArray {
- protected:
-  Buffer* offset_buf_;
-};
-
-class SparseUnionArray : public UnionArray {};
-
 }  // namespace arrow
 
 #endif  // ARROW_TYPES_UNION_H

From c88bd70c13cf16c07b840623cb466aa98d535be0 Mon Sep 17 00:00:00 2001
From: Robert Nishihara
Date: Sat, 19 Nov 2016 22:21:30 -0800
Subject: [PATCH 203/210] Build arrow_io and arrow_ipc as static libraries.
--- cpp/src/arrow/io/CMakeLists.txt | 4 +++- cpp/src/arrow/ipc/CMakeLists.txt | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/io/CMakeLists.txt b/cpp/src/arrow/io/CMakeLists.txt index 47bb0893863..2c562bc028d 100644 --- a/cpp/src/arrow/io/CMakeLists.txt +++ b/cpp/src/arrow/io/CMakeLists.txt @@ -80,9 +80,11 @@ if(ARROW_HDFS) ${ARROW_IO_TEST_LINK_LIBS}) endif() -add_library(arrow_io SHARED +add_library(arrow_io STATIC ${ARROW_IO_SRCS} ) +set_property(TARGET arrow_io PROPERTY POSITION_INDEPENDENT_CODE 1) + target_link_libraries(arrow_io LINK_PUBLIC ${ARROW_IO_LINK_LIBS} LINK_PRIVATE ${ARROW_IO_PRIVATE_LINK_LIBS}) diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 6955bcb6c23..d0816c4a42b 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -41,9 +41,10 @@ set(ARROW_IPC_SRCS ) # TODO(wesm): SHARED and STATIC targets -add_library(arrow_ipc SHARED +add_library(arrow_ipc STATIC ${ARROW_IPC_SRCS} ) +set_property(TARGET arrow_ipc PROPERTY POSITION_INDEPENDENT_CODE 1) if(FLATBUFFERS_VENDORED) add_dependencies(arrow_ipc flatbuffers_ep) endif() From bdae7a25d9e89ecbe3eeaa0fb81041e83ae06a3e Mon Sep 17 00:00:00 2001 From: Alexey Tumanov Date: Sat, 4 Mar 2017 21:23:38 -0800 Subject: [PATCH 204/210] builder: speed up bitsetting for large length --- cpp/src/arrow/builder.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 151b257a3d8..ee4b1b9c0ae 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -116,10 +116,24 @@ void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int32_t leng void ArrayBuilder::UnsafeSetNotNull(int32_t length) { const int32_t new_length = length + length_; - // TODO(emkornfield) Optimize for large values of length? 
-  for (int32_t i = length_; i < new_length; ++i) {
+
+  // Fill up the bytes until we have a byte alignment
+  int32_t pad_to_byte = std::min<int32_t>(8 - (length_ % 8), length);
+  if (pad_to_byte == 8) { pad_to_byte = 0; }
+  for (int32_t i = length_; i < length_ + pad_to_byte; ++i) {
+    BitUtil::SetBit(null_bitmap_data_, i);
+  }
+
+  // Fast bitsetting
+  int32_t fast_length = (length - pad_to_byte) / 8;
+  memset(null_bitmap_data_ + ((length_ + pad_to_byte) / 8), 255,
+      static_cast<size_t>(fast_length));
+
+  // Trailing bits
+  for (int32_t i = length_ + pad_to_byte + (fast_length * 8); i < new_length; ++i) {
     BitUtil::SetBit(null_bitmap_data_, i);
   }
+
   length_ = new_length;
 }

From 90902e9921858e9fec63c0ccf4ef4051efe95e81 Mon Sep 17 00:00:00 2001
From: Philipp Moritz
Date: Tue, 7 Mar 2017 19:11:04 -0800
Subject: [PATCH 205/210] upgrade flatbuffers

---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 0bff7528578..83e4e6dec2b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -28,7 +28,7 @@ set(THIRDPARTY_DIR "${CMAKE_SOURCE_DIR}/thirdparty")
 
 set(GTEST_VERSION "1.7.0")
 set(GBENCHMARK_VERSION "1.0.0")
-set(FLATBUFFERS_VERSION "1.3.0")
+set(FLATBUFFERS_VERSION "1.6.0")
 
 find_package(ClangTools)
 if ("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1" OR CLANG_TIDY_FOUND)

From 1924a78ef4aa307b4fcfadac82df5cdd79bda7f0 Mon Sep 17 00:00:00 2001
From: Alexey Tumanov
Date: Thu, 9 Mar 2017 02:12:58 -0800
Subject: [PATCH 206/210] parallelize memcopy in arrow with openmp

---
 cpp/src/arrow/types/primitive.cc | 12 ++++--
 cpp/src/arrow/util/memory-util.h | 66 ++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 cpp/src/arrow/util/memory-util.h

diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 14667ee5b6e..8f3ad5b69b7 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -23,6 +23,7 @@
 #include "arrow/util/bit-util.h"
 #include "arrow/util/buffer.h"
 #include "arrow/util/logging.h"
+#include "arrow/util/memory-util.h"
 
 namespace arrow {
 
@@ -98,7 +99,7 @@ Status PrimitiveBuilder<T>::Init(int32_t capacity) {
   int64_t nbytes = TypeTraits<T>::bytes_required(capacity);
   RETURN_NOT_OK(data_->Resize(nbytes));
   // TODO(emkornfield) valgrind complains without this
-  memset(data_->mutable_data(), 0, nbytes);
+  // memset(data_->mutable_data(), 0, nbytes);
 
   raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
   return Status::OK();
@@ -128,7 +129,12 @@ Status PrimitiveBuilder<T>::Append(
   RETURN_NOT_OK(Reserve(length));
 
   if (length > 0) {
-    memcpy(raw_data_ + length_, values, TypeTraits<T>::bytes_required(length));
+    size_t numbytes = TypeTraits<T>::bytes_required(length);
+    if (numbytes >= 1<<20) {
+      memcopy_frame_aligned((uint8_t *)(raw_data_ + length_), (uint8_t *)values, numbytes, true);
+    } else {
+      memcpy(raw_data_ + length_, values, numbytes);
+    }
   }
 
   // length_ is updated by these
@@ -172,7 +178,7 @@ Status BooleanBuilder::Init(int32_t capacity) {
   int64_t nbytes = BitUtil::BytesForBits(capacity);
   RETURN_NOT_OK(data_->Resize(nbytes));
   // TODO(emkornfield) valgrind complains without this
-  memset(data_->mutable_data(), 0, nbytes);
+  //memset(data_->mutable_data(), 0, nbytes);
 
   raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
   return Status::OK();
diff --git a/cpp/src/arrow/util/memory-util.h b/cpp/src/arrow/util/memory-util.h
new file mode 100644
index 00000000000..9a4f370cb54
--- /dev/null
+++ b/cpp/src/arrow/util/memory-util.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software
Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_UTIL_MEM_UTIL_H +#define ARROW_UTIL_MEM_UTIL_H +#if 0 +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +namespace arrow { + +static inline int memcopy_frame_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool runparallel) { + struct timeval tv1, tv2; + double elapsed = 0; + // assume src and dst are ready to go (allocated, populated, etc) + //printf("src=%p\tdst=%p\n", src, dst); + int rv = 0; + int pagesz = getpagesize(); + char *srcbp = (char *)(((uint64_t)src + 4095) & ~(0x0fff)); + char *srcep = (char *)(((uint64_t)src + nbytes) & ~(0x0fff)); + uint64_t prefix = (uint64_t)srcbp - (uint64_t)src; + uint64_t suffix = ((uint64_t)src + nbytes) % 4096; + uint64_t numpages = (nbytes-prefix)/pagesz; + char *dstep = (char *)((uint64_t)dst + prefix + numpages*pagesz); + + //gettimeofday(&tv1, NULL); + memcpy(dst, src, prefix); + #pragma omp parallel for num_threads(8) if (runparallel) + for (uint64_t i = 0; i < numpages; i++) + { + memcpy((char *)(dst) + prefix + i*pagesz, ((char *)srcbp) + i*pagesz, pagesz); + } + memcpy(dstep, srcep, suffix); + //gettimeofday(&tv2, NULL); + //elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0; + //printf("copied %ld bytes in time = %8.4f MBps=%8.4f\n", nbytes, elapsed, nbytes/((1<<20)*elapsed)); + return rv; // 0 is good; bad o.w. 
+} + + +} // namespace arrow + +#endif // ARROW_UTIL_MEM_UTIL_H From d501ad2f302618ab4cd571deefdb7be478f91917 Mon Sep 17 00:00:00 2001 From: Alexey Tumanov Date: Thu, 9 Mar 2017 02:12:58 -0800 Subject: [PATCH 207/210] parallelize memcopy in arrow with openmp --- cpp/src/arrow/types/primitive.cc | 12 ++++-- cpp/src/arrow/util/memory-util.h | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 cpp/src/arrow/util/memory-util.h diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 14667ee5b6e..8f3ad5b69b7 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -23,6 +23,7 @@ #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" +#include "arrow/util/memory-util.h" namespace arrow { @@ -98,7 +99,7 @@ Status PrimitiveBuilder::Init(int32_t capacity) { int64_t nbytes = TypeTraits::bytes_required(capacity); RETURN_NOT_OK(data_->Resize(nbytes)); // TODO(emkornfield) valgrind complains without this - memset(data_->mutable_data(), 0, nbytes); + // memset(data_->mutable_data(), 0, nbytes); raw_data_ = reinterpret_cast(data_->mutable_data()); return Status::OK(); @@ -128,7 +129,12 @@ Status PrimitiveBuilder::Append( RETURN_NOT_OK(Reserve(length)); if (length > 0) { - memcpy(raw_data_ + length_, values, TypeTraits::bytes_required(length)); + size_t numbytes = TypeTraits::bytes_required(length); + if (numbytes >= 1<<20) { + memcopy_frame_aligned((uint8_t *)(raw_data_ + length_), (uint8_t *)values, numbytes, true); + } else { + memcpy(raw_data_ + length_, values, numbytes); + } } // length_ is update by these @@ -172,7 +178,7 @@ Status BooleanBuilder::Init(int32_t capacity) { int64_t nbytes = BitUtil::BytesForBits(capacity); RETURN_NOT_OK(data_->Resize(nbytes)); // TODO(emkornfield) valgrind complains without this - memset(data_->mutable_data(), 0, nbytes); + //memset(data_->mutable_data(), 0, nbytes); raw_data_ = reinterpret_cast(data_->mutable_data()); return Status::OK(); diff --git a/cpp/src/arrow/util/memory-util.h b/cpp/src/arrow/util/memory-util.h new file mode 100644 index 00000000000..9a4f370cb54 --- /dev/null +++ b/cpp/src/arrow/util/memory-util.h @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_UTIL_MEM_UTIL_H +#define ARROW_UTIL_MEM_UTIL_H +#if 0 +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include + +namespace arrow { + +static inline int memcopy_frame_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool runparallel) { + struct timeval tv1, tv2; + double elapsed = 0; + // assume src and dst are ready to go (allocated, populated, etc) + //printf("src=%p\tdst=%p\n", src, dst); + int rv = 0; + int pagesz = getpagesize(); + char *srcbp = (char *)(((uint64_t)src + 4095) & ~(0x0fff)); + char *srcep = (char *)(((uint64_t)src + nbytes) & ~(0x0fff)); + uint64_t prefix = (uint64_t)srcbp - (uint64_t)src; + uint64_t suffix = ((uint64_t)src + nbytes) % 4096; + uint64_t numpages = (nbytes-prefix)/pagesz; + char *dstep = (char *)((uint64_t)dst + prefix + numpages*pagesz); + + //gettimeofday(&tv1, NULL); + memcpy(dst, src, prefix); + #pragma omp parallel for num_threads(8) if (runparallel) + for (uint64_t i = 0; i < numpages; i++) + { + memcpy((char *)(dst) + prefix + i*pagesz, ((char *)srcbp) + i*pagesz, pagesz); + } + memcpy(dstep, srcep, suffix); + //gettimeofday(&tv2, NULL); + //elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0; + //printf("copied %ld bytes in time = %8.4f MBps=%8.4f\n", nbytes, elapsed, nbytes/((1<<20)*elapsed)); + return rv; // 0 is good; bad o.w. +} + + +} // namespace arrow + +#endif // ARROW_UTIL_MEM_UTIL_H From a2aeb01abe50720f28d124f74284327efd4e3c31 Mon Sep 17 00:00:00 2001 From: Alexey Tumanov Date: Mon, 13 Mar 2017 00:53:27 -0700 Subject: [PATCH 208/210] fully switch arrow memcpy parallelization to c++ threads --- cpp/src/arrow/types/primitive.cc | 2 +- cpp/src/arrow/util/memory-util.h | 77 ++++++++++++++++++++------------ 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 8f3ad5b69b7..f4a60d57a4f 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -131,7 +131,7 @@ Status PrimitiveBuilder::Append( if (length > 0) { size_t numbytes = TypeTraits::bytes_required(length); if (numbytes >= 1<<20) { - memcopy_frame_aligned((uint8_t *)(raw_data_ + length_), (uint8_t *)values, numbytes, true); + memcopy_aligned((uint8_t *)(raw_data_ + length_), (uint8_t *)values, numbytes, false); } else { memcpy(raw_data_ + length_, values, numbytes); } diff --git a/cpp/src/arrow/util/memory-util.h b/cpp/src/arrow/util/memory-util.h index 9a4f370cb54..b8073e9b207 100644 --- a/cpp/src/arrow/util/memory-util.h +++ b/cpp/src/arrow/util/memory-util.h @@ -17,12 +17,11 @@ #ifndef ARROW_UTIL_MEM_UTIL_H #define ARROW_UTIL_MEM_UTIL_H -#if 0 -#include -#include -#include +//#include +//#include +//#include #include -#endif +#include #include #include @@ -32,34 +31,56 @@ namespace arrow { -static inline int memcopy_frame_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool runparallel) { +int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool timeit) +{ +#ifndef NUMTHREADS +#define NUMTHREADS 8 +#endif + int rv = 0; struct timeval tv1, tv2; double elapsed = 0; - // assume src and dst are ready to go (allocated, populated, etc) - //printf("src=%p\tdst=%p\n", src, dst); - int rv = 0; - int pagesz = getpagesize(); - char *srcbp = (char *)(((uint64_t)src + 4095) & ~(0x0fff)); - char *srcep = (char *)(((uint64_t)src + nbytes) & ~(0x0fff)); - uint64_t prefix = (uint64_t)srcbp - (uint64_t)src; - uint64_t suffix = ((uint64_t)src + nbytes) % 
4096; - uint64_t numpages = (nbytes-prefix)/pagesz; - char *dstep = (char *)((uint64_t)dst + prefix + numpages*pagesz); + const uint64_t numthreads = NUMTHREADS; + const uint64_t blocksz = getpagesize(); + const char *srcbp = (char *)(((uint64_t)src + blocksz-1) & ~(blocksz-1)); + char *srcep = (char *)(((uint64_t)src + nbytes) & ~(blocksz-1)); + const uint64_t chunksz = ((uint64_t)srcep - (uint64_t)srcbp) / numthreads;//B + const uint64_t numblocks = (((uint64_t)srcep - (uint64_t)srcbp)) / blocksz; + // Now we divide these blocks between available threads. Remainder is pushed + // to the suffix-handling thread. + // uint64_t remainder = numblocks % numthreads; + // Update the end pointer + srcep = srcep - (numblocks % numthreads)*blocksz; + const uint64_t prefix = (uint64_t)srcbp - (uint64_t)src; // Bytes + const uint64_t suffix = (uint64_t)(src+nbytes) - (uint64_t)srcep; // Bytes + char *dstep = (char *)((uint64_t)dst + prefix + numthreads*chunksz); + // Now data == | prefix | k*numthreads*blocksz | suffix | + // chunksz = k*blocksz => data == | prefix | numthreads*chunksz | suffix | + // Each thread gets a "chunk" of k blocks, except prefix and suffix threads. - //gettimeofday(&tv1, NULL); - memcpy(dst, src, prefix); - #pragma omp parallel for num_threads(8) if (runparallel) - for (uint64_t i = 0; i < numpages; i++) - { - memcpy((char *)(dst) + prefix + i*pagesz, ((char *)srcbp) + i*pagesz, pagesz); + std::vector threads; + // Start the prefix thread. + if (timeit) { + gettimeofday(&tv1, NULL); } - memcpy(dstep, srcep, suffix); - //gettimeofday(&tv2, NULL); - //elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0; - //printf("copied %ld bytes in time = %8.4f MBps=%8.4f\n", nbytes, elapsed, nbytes/((1<<20)*elapsed)); - return rv; // 0 is good; bad o.w. -} + threads.push_back(std::thread(memcpy, dst, src, prefix)); + for (int i = 1; i <= numthreads; i++) { + threads.push_back(std::thread( + memcpy, dst+prefix+(i-1)*chunksz, srcbp + (i-1)*chunksz, chunksz)); + } + threads.push_back(std::thread(memcpy, dstep, srcep, suffix)); + // Join the memcpy threads. 
+ for (auto &t: threads) { + t.join(); + } + if (timeit) { + gettimeofday(&tv2, NULL); + elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0; + printf("copied %llu bytes in time = %8.4f MBps=%8.4f\n", + nbytes, elapsed, nbytes/((1<<20)*elapsed)); + } + return rv; +} } // namespace arrow From 59fe77a9748cef7ad9626f83e974a4ea05a9e2b2 Mon Sep 17 00:00:00 2001 From: Alexey Tumanov Date: Mon, 13 Mar 2017 02:17:14 -0700 Subject: [PATCH 209/210] parallelize arrow memset: add to memory util --- cpp/src/arrow/types/primitive.cc | 14 ++++++-- cpp/src/arrow/util/memory-util.h | 57 +++++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index f4a60d57a4f..5d50f62b372 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -118,7 +118,12 @@ Status PrimitiveBuilder::Resize(int32_t capacity) { const int64_t new_bytes = TypeTraits::bytes_required(capacity); RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = reinterpret_cast(data_->mutable_data()); - memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + if ((new_bytes - old_bytes) >= (uint64_t)(1 << 15)) { + memset_aligned(data_->mutable_data() + old_bytes, 0, + new_bytes - old_bytes, false); + } else { + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } } return Status::OK(); } @@ -197,7 +202,12 @@ Status BooleanBuilder::Resize(int32_t capacity) { RETURN_NOT_OK(data_->Resize(new_bytes)); raw_data_ = reinterpret_cast(data_->mutable_data()); - memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + if ((new_bytes - old_bytes) >= (1 << 15)) { + memset_aligned(data_->mutable_data() + old_bytes, 0, + new_bytes - old_bytes, false); + } else { + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } } return Status::OK(); } diff --git a/cpp/src/arrow/util/memory-util.h b/cpp/src/arrow/util/memory-util.h index b8073e9b207..b256f8ec619 100644 --- a/cpp/src/arrow/util/memory-util.h +++ b/cpp/src/arrow/util/memory-util.h @@ -17,9 +17,7 @@ #ifndef ARROW_UTIL_MEM_UTIL_H #define ARROW_UTIL_MEM_UTIL_H -//#include -//#include -//#include +#include #include #include @@ -31,7 +29,7 @@ namespace arrow { -int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool timeit) +static inline int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool timeit) { #ifndef NUMTHREADS #define NUMTHREADS 8 @@ -43,13 +41,15 @@ int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool time const uint64_t blocksz = getpagesize(); const char *srcbp = (char *)(((uint64_t)src + blocksz-1) & ~(blocksz-1)); char *srcep = (char *)(((uint64_t)src + nbytes) & ~(blocksz-1)); - const uint64_t chunksz = ((uint64_t)srcep - (uint64_t)srcbp) / numthreads;//B + const uint64_t numblocks = (((uint64_t)srcep - (uint64_t)srcbp)) / blocksz; // Now we divide these blocks between available threads. Remainder is pushed // to the suffix-handling thread. 
   // uint64_t remainder = numblocks % numthreads;
   // Update the end pointer.
   srcep = srcep - (numblocks % numthreads)*blocksz;
+  const uint64_t chunksz = ((uint64_t)srcep - (uint64_t)srcbp) / numthreads;  // Bytes
+  //assert(srcep >= srcbp);
   const uint64_t prefix = (uint64_t)srcbp - (uint64_t)src;           // Bytes
   const uint64_t suffix = (uint64_t)(src+nbytes) - (uint64_t)srcep;  // Bytes
   char *dstep = (char *)((uint64_t)dst + prefix + numthreads*chunksz);
@@ -82,6 +82,53 @@ int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool time
   return rv;
 }

+static inline int memset_aligned(uint8_t *dst, int val, uint64_t nbytes, bool timeit)
+{
+#ifndef NUMTHREADS
+#define NUMTHREADS 8
+#endif
+  int rv = 0;
+  struct timeval tv1, tv2;
+  double elapsed = 0;
+  const uint64_t numthreads = NUMTHREADS;
+  const uint64_t blocksz = 64;  // cache block aligned
+  const char *dstbp = (char *)(((uint64_t)dst + blocksz-1) & ~(blocksz-1));
+  char *dstep = (char *)(((uint64_t)dst + nbytes) & ~(blocksz-1));
+  const uint64_t chunksz = ((uint64_t)dstep - (uint64_t)dstbp) / numthreads;  // Bytes
+  //assert(dstep >= dstbp);
+  const uint64_t numblocks = ((uint64_t)dstep - (uint64_t)dstbp) / blocksz;
+  // Now we divide these blocks between available threads. The remainder is
+  // pushed to the suffix-handling thread.
+  // uint64_t remainder = numblocks % numthreads;
+  // Update the end pointer.
+  dstep = dstep - (numblocks % numthreads)*blocksz;
+  const uint64_t prefix = (uint64_t)dstbp - (uint64_t)dst;           // Bytes
+  const uint64_t suffix = (uint64_t)(dst+nbytes) - (uint64_t)dstep;  // Bytes
+  std::vector<std::thread> threads;
+  // Start the prefix thread.
+  if (timeit) {
+    gettimeofday(&tv1, NULL);
+  }
+  threads.push_back(std::thread(memset, dst, val, prefix));
+  for (int i = 1; i <= numthreads; i++) {
+    threads.push_back(std::thread(
+        memset, dst+prefix+(i-1)*chunksz, val, chunksz));
+  }
+  threads.push_back(std::thread(memset, dstep, val, suffix));
+
+  // Join the memset threads.
+  for (auto &t: threads) {
+    t.join();
+  }
+  if (timeit) {
+    gettimeofday(&tv2, NULL);
+    elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0;
+    printf("copied %llu bytes in time = %8.4f MBps=%8.4f\n",
+           nbytes, elapsed, nbytes/((1<<20)*elapsed));
+  }
+  return rv;
+}
+
 } // namespace arrow

 #endif // ARROW_UTIL_MEM_UTIL_H
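The callers in primitive.cc illustrate the crossover rule this patch adopts: the parallel path only pays off once the region is large enough (32 KB here) to amortize thread start-up. Below is a sketch of that dispatch written against the memset_aligned signature the patch introduces; zero_grown_region is an illustrative name, not an Arrow API.

    #include <cstdint>
    #include <cstring>
    #include "arrow/util/memory-util.h"  // memset_aligned(), as added above

    // Illustrative helper mirroring PrimitiveBuilder::Resize: zero only the
    // newly grown tail of a reallocated buffer, in parallel when it is large.
    inline void zero_grown_region(uint8_t *buf, uint64_t old_bytes,
                                  uint64_t new_bytes) {
      const uint64_t grown = new_bytes - old_bytes;
      if (grown >= (uint64_t)(1 << 15)) {  // 32 KB crossover, as in the patch
        arrow::memset_aligned(buf + old_bytes, 0, grown, /*timeit=*/false);
      } else {
        memset(buf + old_bytes, 0, grown);  // small regions: plain memset wins
      }
    }

The threshold is empirical: below it, spawning and joining NUMTHREADS + 2 threads (chunks plus the prefix and suffix workers) costs more than a single-threaded memset.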
From a61194ab524dff9c8b9b93c1fdb0176f83173f89 Mon Sep 17 00:00:00 2001
From: Alexey Tumanov
Date: Tue, 14 Mar 2017 02:39:05 -0700
Subject: [PATCH 210/210] arrow: adding reusable threadpool for parallel
 memcpy+memset

---
 cpp/src/arrow/types/primitive.cc | 17 +++----
 cpp/src/arrow/util/memory-util.h | 82 +++++++++++++++++++-------------
 2 files changed, 59 insertions(+), 40 deletions(-)

diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc
index 5d50f62b372..2aa43ab0e8b 100644
--- a/cpp/src/arrow/types/primitive.cc
+++ b/cpp/src/arrow/types/primitive.cc
@@ -118,9 +118,9 @@ Status PrimitiveBuilder::Resize(int32_t capacity) {
     const int64_t new_bytes = TypeTraits::bytes_required(capacity);
     RETURN_NOT_OK(data_->Resize(new_bytes));
     raw_data_ = reinterpret_cast(data_->mutable_data());
-    if ((new_bytes - old_bytes) >= (uint64_t)(1 << 15)) {
-      memset_aligned(data_->mutable_data() + old_bytes, 0,
-                     new_bytes - old_bytes, false);
+    if ((new_bytes - old_bytes) >= 32*KB) {
+      memset_page_aligned(
+          data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
     } else {
       memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
     }
@@ -135,8 +135,9 @@ Status PrimitiveBuilder::Append(
   if (length > 0) {
     size_t numbytes = TypeTraits::bytes_required(length);
-    if (numbytes >= 1<<20) {
-      memcopy_aligned((uint8_t *)(raw_data_ + length_), (uint8_t *)values, numbytes, false);
+    if (numbytes >= MB) {
+      memcopy_block_aligned((uint8_t *)(raw_data_ + length_),
+                            (uint8_t *)values, numbytes);
     } else {
       memcpy(raw_data_ + length_, values, numbytes);
     }
@@ -202,9 +203,9 @@ Status BooleanBuilder::Resize(int32_t capacity) {
     RETURN_NOT_OK(data_->Resize(new_bytes));
     raw_data_ = reinterpret_cast(data_->mutable_data());
-    if ((new_bytes - old_bytes) >= (1 << 15)) {
-      memset_aligned(data_->mutable_data() + old_bytes, 0,
-                     new_bytes - old_bytes, false);
+    if ((new_bytes - old_bytes) >= 32*KB) {
+      memset_page_aligned(
+          data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
     } else {
       memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes);
     }
diff --git a/cpp/src/arrow/util/memory-util.h b/cpp/src/arrow/util/memory-util.h
index b256f8ec619..6c00f91e685 100644
--- a/cpp/src/arrow/util/memory-util.h
+++ b/cpp/src/arrow/util/memory-util.h
@@ -27,18 +27,21 @@
 #include
 #include

+#define NUMTHREADS 8
+#define MB (1<<20)
+#define KB (1<<10)
+
 namespace arrow {

-static inline int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nbytes, bool timeit)
-{
-#ifndef NUMTHREADS
-#define NUMTHREADS 8
-#endif
+static std::vector<std::thread> threadpool(NUMTHREADS);
+
+static inline int memcopy_aligned(uint8_t *dst, const uint8_t *src,
+                                  uint64_t nbytes, uint64_t blocksz,
+                                  bool timeit) {
   int rv = 0;
   struct timeval tv1, tv2;
   double elapsed = 0;
   const uint64_t numthreads = NUMTHREADS;
-  const uint64_t blocksz = getpagesize();
   const char *srcbp = (char *)(((uint64_t)src + blocksz-1) & ~(blocksz-1));
   char *srcep = (char *)(((uint64_t)src + nbytes) & ~(blocksz-1));
@@ -57,21 +60,21 @@ static inline int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nby
   // chunksz = k*blocksz => data == | prefix | numthreads*chunksz | suffix |
gets a "chunk" of k blocks, except prefix and suffix threads. - std::vector threads; - // Start the prefix thread. if (timeit) { gettimeofday(&tv1, NULL); } - threads.push_back(std::thread(memcpy, dst, src, prefix)); - for (int i = 1; i <= numthreads; i++) { - threads.push_back(std::thread( - memcpy, dst+prefix+(i-1)*chunksz, srcbp + (i-1)*chunksz, chunksz)); + // Start memcpy threads and then copy the prefix and suffix while threads run. + for (int i = 0; i < numthreads; i++) { + threadpool[i] = std::thread( + memcpy, dst+prefix+i*chunksz, srcbp + i*chunksz, chunksz); } - threads.push_back(std::thread(memcpy, dstep, srcep, suffix)); +// threads.push_back(std::thread(memcpy, dstep, srcep, suffix)); +// threadpool[NUMTHREADS-1] = std::thread(memcpy, dstep, srcep, suffix); + memcpy(dst, src, prefix); + memcpy(dstep, srcep, suffix); - // Join the memcpy threads. - for (auto &t: threads) { - t.join(); + for (auto &t: threadpool) { + t.join(); // Join all the memcpy threads. } if (timeit) { gettimeofday(&tv2, NULL); @@ -82,16 +85,12 @@ static inline int memcopy_aligned(uint8_t *dst, const uint8_t *src, uint64_t nby return rv; } -static inline int memset_aligned(uint8_t *dst, int val, uint64_t nbytes, bool timeit) -{ -#ifndef NUMTHREADS -#define NUMTHREADS 8 -#endif +static inline int memset_aligned(uint8_t *dst, int val, uint64_t nbytes, + uint64_t blocksz, bool timeit) { int rv = 0; struct timeval tv1, tv2; double elapsed = 0; const uint64_t numthreads = NUMTHREADS; - const uint64_t blocksz = 64; // cache block aligned const char *dstbp = (char *)(((uint64_t)dst + blocksz-1) & ~(blocksz-1)); char *dstep = (char *)(((uint64_t)dst + nbytes) & ~(blocksz-1)); const uint64_t chunksz = ((uint64_t)dstep - (uint64_t)dstbp) / numthreads;//B @@ -105,30 +104,49 @@ static inline int memset_aligned(uint8_t *dst, int val, uint64_t nbytes, bool ti const uint64_t prefix = (uint64_t)dstbp - (uint64_t)dst; // Bytes const uint64_t suffix = (uint64_t)(dst+nbytes) - (uint64_t)dstep; // Bytes std::vector threads; - // Start the prefix thread. if (timeit) { gettimeofday(&tv1, NULL); } - threads.push_back(std::thread(memset, dst, val, prefix)); - for (int i = 1; i <= numthreads; i++) { - threads.push_back(std::thread( - memset, dst+prefix+(i-1)*chunksz, val, chunksz)); +// threads.push_back(std::thread(memset, dst, val, prefix)); + // Start all threads first. Handle leftovers while threads are running. + for (int i = 0; i < numthreads; i++) { +// threads.push_back(std::thread( + threadpool[i] = std::thread(memset, dst+prefix+i*chunksz, val, chunksz); } - threads.push_back(std::thread(memset, dstep, val, suffix)); +// threads.push_back(std::thread(memset, dstep, val, suffix)); + memset(dst, val, prefix); + memset(dstep, val, suffix); // Join the memcpy threads. 
-  for (auto &t: threads) {
+  for (auto &t : threadpool) {
     t.join();
   }
   if (timeit) {
     gettimeofday(&tv2, NULL);
-    elapsed = ((tv2.tv_sec - tv1.tv_sec)*1000000 + (tv2.tv_usec - tv1.tv_usec))/1000000.0;
-    printf("copied %llu bytes in time = %8.4f MBps=%8.4f\n",
-           nbytes, elapsed, nbytes/((1<<20)*elapsed));
+    elapsed =
+        ((tv2.tv_sec - tv1.tv_sec) * 1000000 + (tv2.tv_usec - tv1.tv_usec))
+        / 1000000.0;
+    printf("copied %llu bytes in time = %8.4f MBps=%8.4f\n", nbytes, elapsed,
+           nbytes / ((1 << 20) * elapsed));
   }
   return rv;
 }

+inline int memset_block_aligned(uint8_t *dst, int val, uint64_t nbytes) {
+  return memset_aligned(dst, val, nbytes, 64, false);
+}
+
+inline int memset_page_aligned(uint8_t *dst, int val, uint64_t nbytes) {
+  return memset_aligned(dst, val, nbytes, getpagesize(), false);
+}
+
+inline int memcopy_block_aligned(uint8_t *dst, uint8_t *src, uint64_t nbytes) {
+  return memcopy_aligned(dst, src, nbytes, 64, false);
+}
+
+inline int memcopy_page_aligned(uint8_t *dst, uint8_t *src, uint64_t nbytes) {
+  return memcopy_aligned(dst, src, nbytes, getpagesize(), false);
+}
+
 } // namespace arrow

 #endif // ARROW_UTIL_MEM_UTIL_H
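Two observations on the final shape of the header. First, the wrappers fix the block size and disable timing, so call sites dispatch only on size and granularity. Second, the "reusable threadpool" is a file-static vector of std::thread slots: each call still constructs and joins fresh threads, so what is actually reused is the vector's storage rather than the workers, while the prefix and suffix are now handled on the calling thread instead of on two extra threads. A hypothetical caller, assuming the header is on the include path as arrow/util/memory-util.h, might look like this:

    #include <cstdint>
    #include <vector>
    #include "arrow/util/memory-util.h"

    int main() {
      // MB and KB are defined by the header; the sizes here are arbitrary.
      std::vector<uint8_t> src(8 * MB), dst(8 * MB);
      // Large copy: 64-byte block-aligned parallel path, the same wrapper
      // PrimitiveBuilder::Append uses for payloads of MB or more.
      arrow::memcopy_block_aligned(dst.data(), src.data(), src.size());
      // Large zeroing: page-aligned parallel path, as the Resize paths use
      // for growth of 32*KB or more.
      arrow::memset_page_aligned(dst.data(), 0, dst.size());
      return 0;
    }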