From bad35849349647b4e4d1f7ba1714e200ae859d61 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 15 Oct 2025 13:22:26 +0800 Subject: [PATCH 1/3] GH-109: Implement Vector Validators for StringView --- .../validate/ValidateVectorBufferVisitor.java | 10 +++++++++- .../validate/ValidateVectorDataVisitor.java | 3 ++- .../validate/ValidateVectorTypeVisitor.java | 9 ++++++++- .../vector/validate/ValidateVectorVisitor.java | 9 +++++++-- .../vector/TestVariableWidthViewVector.java | 17 ++++++++++++++++- 5 files changed, 42 insertions(+), 6 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index 5c7215437f..f1eef5c04c 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -158,7 +158,15 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { @Override public Void visit(BaseVariableWidthViewVector vector, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + final int valueCount = vector.getValueCount(); + validateOrThrow( + vector.getValueCount() >= 0, + "Vector valueCount %s is negative.", + vector.getValueCapacity()); + validateOrThrow(vector.getFieldBuffers().size() >= 2, "Expected at least 2 buffers."); + validateValidityBuffer(vector, valueCount); + validateDataBuffer(vector, (long) valueCount * BaseVariableWidthViewVector.ELEMENT_SIZE); + return null; } @Override diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java index c62bff79f7..9da8cc813e 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java @@ -121,7 +121,8 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { @Override public Void visit(BaseVariableWidthViewVector vector, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + vector.validateScalars(); + return null; } @Override diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java index daad41dbdc..395852ef79 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorTypeVisitor.java @@ -61,6 +61,8 @@ import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.ViewVarBinaryVector; +import org.apache.arrow.vector.ViewVarCharVector; import org.apache.arrow.vector.compare.VectorVisitor; import org.apache.arrow.vector.complex.DenseUnionVector; import org.apache.arrow.vector.complex.FixedSizeListVector; @@ -380,7 +382,12 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { @Override public Void visit(BaseVariableWidthViewVector vector, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + if (vector instanceof ViewVarCharVector) { + validateVectorCommon(vector, ArrowType.Utf8View.class); + } else if (vector instanceof ViewVarBinaryVector) { + validateVectorCommon(vector, ArrowType.BinaryView.class); + } + return null; } @Override diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java index 5004ba488c..2111410016 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorVisitor.java @@ -107,8 +107,13 @@ public Void visit(BaseLargeVariableWidthVector left, Void value) { } @Override - public Void visit(BaseVariableWidthViewVector left, Void value) { - throw new UnsupportedOperationException("View vectors are not supported."); + public Void visit(BaseVariableWidthViewVector vector, Void value) { + if (vector.getValueCount() > 0) { + if (vector.getDataBuffer() == null || vector.getDataBuffer().capacity() == 0) { + throw new IllegalArgumentException("valueBuffer is null or capacity is 0"); + } + } + return null; } @Override diff --git a/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java b/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java index f7c66a00be..baf5e672c8 100644 --- a/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java +++ b/vector/src/test/java/org/apache/arrow/vector/TestVariableWidthViewVector.java @@ -61,6 +61,7 @@ import org.apache.arrow.vector.util.ReusableByteArray; import org.apache.arrow.vector.util.Text; import org.apache.arrow.vector.util.TransferPair; +import org.apache.arrow.vector.validate.ValidateUtil; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -2445,7 +2446,7 @@ public void testSplitAndTransferWithLongStringsOnValiditySplit() { final ViewVarBinaryVector sourceVector = newViewVarBinaryVector(EMPTY_SCHEMA_PATH, allocator)) { testSplitAndTransferOnValiditySplitHelper( - targetVector, sourceVector, startIndex, length, data); + targetVector, sourceVector, startIndex, length, binaryData); } } @@ -2852,4 +2853,18 @@ public void testVectorLoadUnloadOnMixedTypes() { } } } + + @Test + public void testValidate() { + try (final ViewVarCharVector vector = new ViewVarCharVector("v", allocator)) { + vector.validateFull(); + setVector(vector, STR1, STR2, STR3); + vector.validateFull(); + + vector.getDataBuffer().capacity(0); + ValidateUtil.ValidateException e = + assertThrows(ValidateUtil.ValidateException.class, () -> vector.validate()); + assertTrue(e.getMessage().contains("Not enough capacity for data buffer")); + } + } } From bd6b92f6a3daa9504a0bcf3b2f1415ded0b04e43 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 29 Oct 2025 17:33:46 +0800 Subject: [PATCH 2/3] isVariableBuffer --- .../org/apache/arrow/vector/TypeLayout.java | 139 ++++++++++++++++++ .../validate/ValidateVectorBufferVisitor.java | 27 ++-- 2 files changed, 155 insertions(+), 11 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index fa75ef0457..56542fa17a 100644 --- a/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -37,7 +37,10 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeList; +import org.apache.arrow.vector.types.pojo.ArrowType.LargeListView; import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; +import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; @@ -458,6 +461,142 @@ public Integer visit(RunEndEncoded type) { }); } + public static boolean isVariableBuffer(final ArrowType arrowType) { + return arrowType.accept( + new ArrowTypeVisitor<>() { + + @Override + public Boolean visit(Null type) { + return false; + } + + @Override + public Boolean visit(Struct type) { + return false; + } + + @Override + public Boolean visit(ArrowType.List type) { + return false; + } + + @Override + public Boolean visit(LargeList type) { + return false; + } + + @Override + public Boolean visit(FixedSizeList type) { + return false; + } + + @Override + public Boolean visit(Union type) { + return false; + } + + @Override + public Boolean visit(Map type) { + return false; + } + + @Override + public Boolean visit(Int type) { + return false; + } + + @Override + public Boolean visit(FloatingPoint type) { + return false; + } + + @Override + public Boolean visit(Utf8 type) { + return false; + } + + @Override + public Boolean visit(Utf8View type) { + return true; + } + + @Override + public Boolean visit(LargeUtf8 type) { + return false; + } + + @Override + public Boolean visit(Binary type) { + return false; + } + + @Override + public Boolean visit(BinaryView type) { + return true; + } + + @Override + public Boolean visit(LargeBinary type) { + return false; + } + + @Override + public Boolean visit(FixedSizeBinary type) { + return false; + } + + @Override + public Boolean visit(Bool type) { + return false; + } + + @Override + public Boolean visit(Decimal type) { + return false; + } + + @Override + public Boolean visit(Date type) { + return false; + } + + @Override + public Boolean visit(Time type) { + return false; + } + + @Override + public Boolean visit(Timestamp type) { + return false; + } + + @Override + public Boolean visit(Interval type) { + return false; + } + + @Override + public Boolean visit(Duration type) { + return false; + } + + @Override + public Boolean visit(ListView type) { + return false; + } + + @Override + public Boolean visit(LargeListView type) { + return false; + } + + @Override + public Boolean visit(RunEndEncoded type) { + return false; + } + }); + } + private final List bufferLayouts; private final boolean isFixedBufferCount; diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index f1eef5c04c..2a144f3429 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -52,14 +52,22 @@ private void validateVectorCommon(ValueVector vector) { if (vector instanceof FieldVector) { FieldVector fieldVector = (FieldVector) vector; - // TODO: https://github.com/apache/arrow/issues/41734 int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); - validateOrThrow( - fieldVector.getFieldBuffers().size() == typeBufferCount, - "Expected %s buffers in vector of type %s, got %s.", - typeBufferCount, - vector.getField().getType().toString(), - fieldVector.getFieldBuffers().size()); + if (TypeLayout.isVariableBuffer(arrowType)) { + validateOrThrow( + fieldVector.getFieldBuffers().size() >= typeBufferCount, + "Expected at least %s buffers in vector of type %s, got %s.", + typeBufferCount, + vector.getField().getType().toString(), + fieldVector.getFieldBuffers().size()); + } else { + validateOrThrow( + fieldVector.getFieldBuffers().size() == typeBufferCount, + "Expected %s buffers in vector of type %s, got %s.", + typeBufferCount, + vector.getField().getType().toString(), + fieldVector.getFieldBuffers().size()); + } } } @@ -159,10 +167,7 @@ public Void visit(BaseLargeVariableWidthVector vector, Void value) { @Override public Void visit(BaseVariableWidthViewVector vector, Void value) { final int valueCount = vector.getValueCount(); - validateOrThrow( - vector.getValueCount() >= 0, - "Vector valueCount %s is negative.", - vector.getValueCapacity()); + validateVectorCommon(vector); validateOrThrow(vector.getFieldBuffers().size() >= 2, "Expected at least 2 buffers."); validateValidityBuffer(vector, valueCount); validateDataBuffer(vector, (long) valueCount * BaseVariableWidthViewVector.ELEMENT_SIZE); From b68d467502946812215c39aa99a7eecdd7ecc647 Mon Sep 17 00:00:00 2001 From: "chenweiguo.vc" Date: Wed, 29 Oct 2025 17:36:56 +0800 Subject: [PATCH 3/3] check isFixedBufferCount --- .../org/apache/arrow/vector/TypeLayout.java | 139 ------------------ .../validate/ValidateVectorBufferVisitor.java | 10 +- 2 files changed, 5 insertions(+), 144 deletions(-) diff --git a/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java b/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java index 56542fa17a..fa75ef0457 100644 --- a/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java +++ b/vector/src/main/java/org/apache/arrow/vector/TypeLayout.java @@ -37,10 +37,7 @@ import org.apache.arrow.vector.types.pojo.ArrowType.Int; import org.apache.arrow.vector.types.pojo.ArrowType.Interval; import org.apache.arrow.vector.types.pojo.ArrowType.LargeBinary; -import org.apache.arrow.vector.types.pojo.ArrowType.LargeList; -import org.apache.arrow.vector.types.pojo.ArrowType.LargeListView; import org.apache.arrow.vector.types.pojo.ArrowType.LargeUtf8; -import org.apache.arrow.vector.types.pojo.ArrowType.ListView; import org.apache.arrow.vector.types.pojo.ArrowType.Map; import org.apache.arrow.vector.types.pojo.ArrowType.Null; import org.apache.arrow.vector.types.pojo.ArrowType.RunEndEncoded; @@ -461,142 +458,6 @@ public Integer visit(RunEndEncoded type) { }); } - public static boolean isVariableBuffer(final ArrowType arrowType) { - return arrowType.accept( - new ArrowTypeVisitor<>() { - - @Override - public Boolean visit(Null type) { - return false; - } - - @Override - public Boolean visit(Struct type) { - return false; - } - - @Override - public Boolean visit(ArrowType.List type) { - return false; - } - - @Override - public Boolean visit(LargeList type) { - return false; - } - - @Override - public Boolean visit(FixedSizeList type) { - return false; - } - - @Override - public Boolean visit(Union type) { - return false; - } - - @Override - public Boolean visit(Map type) { - return false; - } - - @Override - public Boolean visit(Int type) { - return false; - } - - @Override - public Boolean visit(FloatingPoint type) { - return false; - } - - @Override - public Boolean visit(Utf8 type) { - return false; - } - - @Override - public Boolean visit(Utf8View type) { - return true; - } - - @Override - public Boolean visit(LargeUtf8 type) { - return false; - } - - @Override - public Boolean visit(Binary type) { - return false; - } - - @Override - public Boolean visit(BinaryView type) { - return true; - } - - @Override - public Boolean visit(LargeBinary type) { - return false; - } - - @Override - public Boolean visit(FixedSizeBinary type) { - return false; - } - - @Override - public Boolean visit(Bool type) { - return false; - } - - @Override - public Boolean visit(Decimal type) { - return false; - } - - @Override - public Boolean visit(Date type) { - return false; - } - - @Override - public Boolean visit(Time type) { - return false; - } - - @Override - public Boolean visit(Timestamp type) { - return false; - } - - @Override - public Boolean visit(Interval type) { - return false; - } - - @Override - public Boolean visit(Duration type) { - return false; - } - - @Override - public Boolean visit(ListView type) { - return false; - } - - @Override - public Boolean visit(LargeListView type) { - return false; - } - - @Override - public Boolean visit(RunEndEncoded type) { - return false; - } - }); - } - private final List bufferLayouts; private final boolean isFixedBufferCount; diff --git a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java index 2a144f3429..5cfe64b14e 100644 --- a/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java +++ b/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java @@ -53,17 +53,17 @@ private void validateVectorCommon(ValueVector vector) { if (vector instanceof FieldVector) { FieldVector fieldVector = (FieldVector) vector; int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); - if (TypeLayout.isVariableBuffer(arrowType)) { + if (TypeLayout.getTypeLayout(arrowType).isFixedBufferCount()) { validateOrThrow( - fieldVector.getFieldBuffers().size() >= typeBufferCount, - "Expected at least %s buffers in vector of type %s, got %s.", + fieldVector.getFieldBuffers().size() == typeBufferCount, + "Expected %s buffers in vector of type %s, got %s.", typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size()); } else { validateOrThrow( - fieldVector.getFieldBuffers().size() == typeBufferCount, - "Expected %s buffers in vector of type %s, got %s.", + fieldVector.getFieldBuffers().size() >= typeBufferCount, + "Expected at least %s buffers in vector of type %s, got %s.", typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size());