Skip to content

Commit

Permalink
cleanup
Browse files: browse the repository at this point in the history
  • Loading branch information
galipremsagar committed May 14, 2024
1 parent 359be26 commit e909e92
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 18 deletions.
32 changes: 15 additions & 17 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -762,27 +762,25 @@ cdef class ParquetWriter:
cdef class ParquetReader:
cdef bool initialized
cdef unique_ptr[cpp_chunked_parquet_reader] reader
cdef cudf_io_types.source_info source
cdef table_input_metadata tbl_meta
cdef cudf_io_types.sink_info sink
cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sink
cdef cudf_io_types.statistics_freq stat_freq
cdef cudf_io_types.compression_type comp_type
cdef object index
cdef size_t chunk_read_limit
cdef size_t row_group_size_bytes
cdef size_type row_group_size_rows
cdef size_t max_page_size_bytes
cdef size_type max_page_size_rows
cdef size_t max_dictionary_size
cdef cudf_io_types.dictionary_policy dict_policy
cdef table_metadata result_meta
cdef vector[unordered_map[string, string]] per_file_user_data
cdef object pandas_meta
cdef list pa_buffers
cdef bool allow_range_index
cdef object row_groups
cdef object filepaths_or_buffers
cdef object names
cdef object column_index_type
cdef object index_col_names
cdef bool is_range_index
cdef object index_col
cdef bool cpp_use_pandas_metadata

def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None,
use_pandas_metadata=True,
Expression filters=None, int chunk_read_limit=100000):
Expression filters=None, int chunk_read_limit=1024000000):

# Convert NativeFile buffers to NativeFileDatasource,
# but save original buffers in case we need to use
Expand All @@ -798,7 +796,7 @@ cdef class ParquetReader:
cdef cudf_io_types.source_info source = make_source_info(
filepaths_or_buffers)

cdef bool cpp_use_pandas_metadata = use_pandas_metadata
self.cpp_use_pandas_metadata = use_pandas_metadata

cdef vector[vector[size_type]] cpp_row_groups
cdef data_type cpp_timestamp_type = cudf_types.data_type(
Expand All @@ -813,7 +811,7 @@ cdef class ParquetReader:
builder = (
parquet_reader_options.builder(source)
.row_groups(cpp_row_groups)
.use_pandas_metadata(cpp_use_pandas_metadata)
.use_pandas_metadata(self.cpp_use_pandas_metadata)
.timestamp_type(cpp_timestamp_type)
)
if filters is not None:
Expand Down Expand Up @@ -882,8 +880,8 @@ cdef class ParquetReader:
move(c_result.tbl),
column_names=self.names,
))
if not self.initialized:
self.initialized = True

self.initialized = True
return df

def read(self):
Expand Down
9 changes: 8 additions & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -858,12 +858,19 @@ def _read_parquet(
"cudf engine doesn't support the "
f"following positional arguments: {list(args)}"
)
return libparquet.read_parquet(
x = libparquet.ParquetReader(
filepaths_or_buffers,
columns=columns,
row_groups=row_groups,
use_pandas_metadata=use_pandas_metadata,
)
return x.read()
# return libparquet.read_parquet(
# filepaths_or_buffers,
# columns=columns,
# row_groups=row_groups,
# use_pandas_metadata=use_pandas_metadata,
# )
else:
if (
isinstance(filepaths_or_buffers, list)
Expand Down

0 comments on commit e909e92

Please sign in to comment.