-
Notifications
You must be signed in to change notification settings - Fork 3.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-237: Implement parquet-cpp's abstract IO interfaces for memory allocation and file reading #101
ARROW-237: Implement parquet-cpp's abstract IO interfaces for memory allocation and file reading #101
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#include "arrow/parquet/io.h" | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
|
||
#include "parquet/api/io.h" | ||
|
||
#include "arrow/parquet/utils.h" | ||
#include "arrow/util/memory-pool.h" | ||
#include "arrow/util/status.h" | ||
|
||
// To assist with readability | ||
using ArrowROFile = arrow::io::RandomAccessFile; | ||
|
||
namespace arrow { | ||
namespace parquet { | ||
|
||
// ---------------------------------------------------------------------- | ||
// ParquetAllocator | ||
|
||
ParquetAllocator::ParquetAllocator() : pool_(default_memory_pool()) {} | ||
|
||
ParquetAllocator::ParquetAllocator(MemoryPool* pool) : pool_(pool) {} | ||
|
||
// Nothing to release explicitly; the pool pointer is left untouched.
ParquetAllocator::~ParquetAllocator() = default;
|
||
// Allocates `size` bytes from the wrapped memory pool and returns the buffer.
// A non-OK Status from the pool is handed to PARQUET_THROW_NOT_OK instead of
// being returned to the caller.
uint8_t* ParquetAllocator::Malloc(int64_t size) {
  // Start from nullptr so the pointer never holds an indeterminate value,
  // regardless of what the pool does with its out-parameter.
  uint8_t* result = nullptr;
  PARQUET_THROW_NOT_OK(pool_->Allocate(size, &result));
  return result;
}
|
||
// Hands `buffer`, together with its `size`, back to the wrapped memory pool.
void ParquetAllocator::Free(uint8_t* buffer, int64_t size) {
  // The pool's Free does not report a Status, so there is nothing to check.
  pool_->Free(buffer, size);
}
|
||
// ---------------------------------------------------------------------- | ||
// ParquetReadSource | ||
|
||
ParquetReadSource::ParquetReadSource( | ||
const std::shared_ptr<ArrowROFile>& file, ParquetAllocator* allocator) | ||
: file_(file), allocator_(allocator) {} | ||
|
||
// Closes the underlying Arrow file. The resulting Status is passed through
// PARQUET_THROW_NOT_OK rather than returned.
void ParquetReadSource::Close() {
  PARQUET_THROW_NOT_OK(file_->Close());
}
|
||
// Returns the position reported by the underlying Arrow file's Tell().
// A non-OK Status is passed through PARQUET_THROW_NOT_OK.
int64_t ParquetReadSource::Tell() const {
  // Initialized so the value is well-defined even if the file implementation
  // does not write to its out-parameter.
  int64_t position = 0;
  PARQUET_THROW_NOT_OK(file_->Tell(&position));
  return position;
}
|
||
// Forwards the seek request for `position` to the underlying Arrow file.
// A non-OK Status is passed through PARQUET_THROW_NOT_OK.
void ParquetReadSource::Seek(int64_t position) {
  PARQUET_THROW_NOT_OK(file_->Seek(position));
}
|
||
// Asks the underlying Arrow file to read `nbytes` into `out` and returns the
// byte count the file reports. A non-OK Status is passed through
// PARQUET_THROW_NOT_OK.
int64_t ParquetReadSource::Read(int64_t nbytes, uint8_t* out) {
  // Initialized so the returned count is well-defined even if the file
  // implementation does not write to its out-parameter.
  int64_t bytes_read = 0;
  PARQUET_THROW_NOT_OK(file_->Read(nbytes, &bytes_read, out));
  return bytes_read;
}
|
||
std::shared_ptr<::parquet::Buffer> ParquetReadSource::Read(int64_t nbytes) { | ||
// TODO(wesm): This code is duplicated from parquet/util/input.cc; suggests | ||
// that there should be more code sharing amongst file-like sources | ||
auto result = std::make_shared<::parquet::OwnedMutableBuffer>(0, allocator_); | ||
result->Resize(nbytes); | ||
|
||
int64_t bytes_read = Read(nbytes, result->mutable_data()); | ||
if (bytes_read < nbytes) { result->Resize(bytes_read); } | ||
return result; | ||
} | ||
|
||
} // namespace parquet | ||
} // namespace arrow |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
// Bridges Arrow's IO interfaces and Parquet-cpp's IO interfaces | ||
|
||
#ifndef ARROW_PARQUET_IO_H | ||
#define ARROW_PARQUET_IO_H | ||
|
||
#include <cstdint> | ||
#include <memory> | ||
|
||
#include "parquet/api/io.h" | ||
|
||
#include "arrow/io/interfaces.h" | ||
#include "arrow/util/visibility.h" | ||
|
||
namespace arrow { | ||
|
||
class MemoryPool; | ||
|
||
namespace parquet { | ||
|
||
// An implementation of the Parquet MemoryAllocator API that plugs into an
// existing Arrow memory pool. This way we can direct all allocations to a
// single place rather than tracking allocations in different locations (for
// example: without utilizing parquet-cpp's default allocator)
class ARROW_EXPORT ParquetAllocator : public ::parquet::MemoryAllocator {
 public:
  // Uses the default memory pool
  ParquetAllocator();

  // Uses the given memory pool; only the pointer is stored.
  explicit ParquetAllocator(MemoryPool* pool);
  virtual ~ParquetAllocator();

  // Allocates `size` bytes from the wrapped pool.
  uint8_t* Malloc(int64_t size) override;
  // Returns `buffer` of `size` bytes to the wrapped pool.
  void Free(uint8_t* buffer, int64_t size) override;

  // The pool allocations are directed to. const-qualified so it can also be
  // queried through a const ParquetAllocator.
  MemoryPool* pool() const { return pool_; }

 private:
  MemoryPool* pool_;
};
|
||
class ARROW_EXPORT ParquetReadSource : public ::parquet::RandomAccessSource { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm actually a bit bothered if it really is a good idea to have the same export macro for different shared libs. Sadly this thought did not come up with the previous review but it would probably be better to have [a separate export macro for each library]. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting. From my point of view, what matters is not conflicting with any 3rd party macros that you may encounter in headers. Unless you expect to have some differing interpretation of visibility between leaf libraries (does not seem too likely). Since we can control name conflicts within Arrow at least, it's not really a problem. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, I cannot make a problem up (yet). I was just bothered if it could be one. But as long as things work fine, we don't need to have these macros. |
||
public: | ||
ParquetReadSource( | ||
const std::shared_ptr<io::RandomAccessFile>& file, ParquetAllocator* allocator); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Using a shared_ptr here is potentially inflexible, but currently the HDFS classes return shared_ptrs so that's why I chose that here. |
||
|
||
void Close() override; | ||
int64_t Tell() const override; | ||
void Seek(int64_t pos) override; | ||
int64_t Read(int64_t nbytes, uint8_t* out) override; | ||
std::shared_ptr<::parquet::Buffer> Read(int64_t nbytes) override; | ||
|
||
private: | ||
// An Arrow readable file of some kind | ||
std::shared_ptr<io::RandomAccessFile> file_; | ||
|
||
// The allocator is required for creating managed buffers | ||
ParquetAllocator* allocator_; | ||
}; | ||
|
||
} // namespace parquet | ||
} // namespace arrow | ||
|
||
#endif // ARROW_PARQUET_IO_H |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1 for making all these `int64_t`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I originally only used `int32_t` because libhdfs has some size limits; fixed now.