@@ -42,7 +42,7 @@ use crate::encryption::decrypt::FileDecryptionProperties;
4242use crate :: errors:: { ParquetError , Result } ;
4343use crate :: file:: metadata:: {
4444 PageIndexPolicy , ParquetMetaData , ParquetMetaDataOptions , ParquetMetaDataReader ,
45- RowGroupMetaData ,
45+ ParquetStatisticsPolicy , RowGroupMetaData ,
4646} ;
4747use crate :: file:: reader:: { ChunkReader , SerializedPageReader } ;
4848use crate :: schema:: types:: SchemaDescriptor ;
@@ -557,6 +557,30 @@ impl ArrowReaderOptions {
557557 self
558558 }
559559
560+ /// Set whether to convert the [`encoding_stats`] in the Parquet `ColumnMetaData` to a bitmask
561+ /// (defaults to `false`).
562+ ///
563+ /// See [`ColumnChunkMetaData::page_encoding_stats_mask`] for an explanation of why this
564+ /// might be desirable.
565+ ///
566+ /// [`ColumnChunkMetaData::page_encoding_stats_mask`]:
567+ /// crate::file::metadata::ColumnChunkMetaData::page_encoding_stats_mask
568+ /// [`encoding_stats`]:
569+ /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
570+ pub fn with_encoding_stats_as_mask ( mut self , val : bool ) -> Self {
571+ self . metadata_options . set_encoding_stats_as_mask ( val) ;
572+ self
573+ }
574+
575+ /// Sets the decoding policy for [`encoding_stats`] in the Parquet `ColumnMetaData`.
576+ ///
577+ /// [`encoding_stats`]:
578+ /// https://github.com/apache/parquet-format/blob/786142e26740487930ddc3ec5e39d780bd930907/src/main/thrift/parquet.thrift#L917
579+ pub fn with_encoding_stats_policy ( mut self , policy : ParquetStatisticsPolicy ) -> Self {
580+ self . metadata_options . set_encoding_stats_policy ( policy) ;
581+ self
582+ }
583+
560584 /// Provide the file decryption properties to use when reading encrypted parquet files.
561585 ///
562586 /// If encryption is enabled and the file is encrypted, the `file_decryption_properties` must be provided.
@@ -1420,7 +1444,7 @@ pub(crate) mod tests {
14201444 FloatType , Int32Type , Int64Type , Int96 , Int96Type ,
14211445 } ;
14221446 use crate :: errors:: Result ;
1423- use crate :: file:: metadata:: ParquetMetaData ;
1447+ use crate :: file:: metadata:: { ParquetMetaData , ParquetStatisticsPolicy } ;
14241448 use crate :: file:: properties:: { EnabledStatistics , WriterProperties , WriterVersion } ;
14251449 use crate :: file:: writer:: SerializedFileWriter ;
14261450 use crate :: schema:: parser:: parse_message_type;
@@ -1474,6 +1498,69 @@ pub(crate) mod tests {
14741498 assert_eq ! ( expected. as_ref( ) , builder. metadata. as_ref( ) ) ;
14751499 }
14761500
/// With `with_encoding_stats_as_mask(true)`, the page encoding stats of
/// `alltypes_tiny_pages.parquet` are exposed through `page_encoding_stats_mask`.
#[test]
fn test_page_encoding_stats_mask() {
    let data_dir = arrow::util::test_util::parquet_test_data();
    let file = File::open(format!("{data_dir}/alltypes_tiny_pages.parquet")).unwrap();

    let reader_builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
        file,
        ArrowReaderOptions::new().with_encoding_stats_as_mask(true),
    )
    .unwrap();

    let first_row_group = reader_builder.metadata.row_group(0);

    // Column 0 is expected to report only PLAIN pages.
    let mask = first_row_group
        .column(0)
        .page_encoding_stats_mask()
        .unwrap();
    assert!(mask.is_only(Encoding::PLAIN));

    // Column 2 is expected to report only PLAIN_DICTIONARY pages.
    let mask = first_row_group
        .column(2)
        .page_encoding_stats_mask()
        .unwrap();
    assert!(mask.is_only(Encoding::PLAIN_DICTIONARY));
}
1525+
/// Checks that the encoding stats policy controls which columns carry page encoding
/// stats, both when everything is skipped and when a single column is kept as a mask.
#[test]
fn test_page_encoding_stats_skipped() {
    let data_dir = arrow::util::test_util::parquet_test_data();
    let file = File::open(format!("{data_dir}/alltypes_tiny_pages.parquet")).unwrap();

    // Case 1: skip the encoding stats for every column.
    let skip_all =
        ArrowReaderOptions::new().with_encoding_stats_policy(ParquetStatisticsPolicy::SkipAll);
    let reader_builder =
        ParquetRecordBatchReaderBuilder::try_new_with_options(file.try_clone().unwrap(), skip_all)
            .unwrap();

    for column in reader_builder.metadata.row_group(0).columns() {
        // Neither the full stats nor the mask may be present.
        assert!(column.page_encoding_stats().is_none());
        assert!(column.page_encoding_stats_mask().is_none());
    }

    // Case 2: keep the stats for column 0 only, and convert them to a mask.
    let keep_first_as_mask = ArrowReaderOptions::new()
        .with_encoding_stats_as_mask(true)
        .with_encoding_stats_policy(ParquetStatisticsPolicy::skip_except(&[0]));
    let reader_builder = ParquetRecordBatchReaderBuilder::try_new_with_options(
        file.try_clone().unwrap(),
        keep_first_as_mask,
    )
    .unwrap();

    for (idx, column) in reader_builder
        .metadata
        .row_group(0)
        .columns()
        .iter()
        .enumerate()
    {
        // The full stats are never present here; only column 0 has a mask.
        assert!(column.page_encoding_stats().is_none());
        assert_eq!(column.page_encoding_stats_mask().is_some(), idx == 0);
    }
}
1563+
14771564 #[ test]
14781565 fn test_arrow_reader_single_column ( ) {
14791566 let file = get_test_file ( "parquet/generated_simple_numerics/blogs.parquet" ) ;
0 commit comments