当前位置:网站首页>Arrow parquet 之 String Reader
Arrow parquet 之 String Reader
2022-08-09 15:46:00 【zhixingheyi_tian】
Switch
cpp/src/parquet/column_reader.cc
/// Constructs a record reader for the column described by `descr`.
///
/// All counters and positions start at zero, the level and validity buffers
/// are allocated from `pool`, and Reset() is invoked before returning.
///
/// \param descr      descriptor of the column being read
/// \param leaf_info  level information for the leaf; also decides whether
///                   values are treated as nullable
/// \param pool       memory pool used for every buffer allocated here
TypedRecordReader(const ColumnDescriptor* descr, LevelInfo leaf_info, MemoryPool* pool)
    : BASE(descr, pool) {
  // Level / nullability configuration.
  leaf_info_ = leaf_info;
  nullable_values_ = leaf_info.HasNullableValues();

  // Record and value bookkeeping: nothing has been read or written yet.
  at_record_start_ = true;
  records_read_ = 0;
  null_count_ = 0;
  values_written_ = 0;
  values_capacity_ = 0;

  // Definition / repetition level bookkeeping.
  levels_written_ = 0;
  levels_position_ = 0;
  levels_capacity_ = 0;

  // Every physical type except BYTE_ARRAY uses the values_ buffer, so it is
  // only allocated for those types.
  const bool is_byte_array = descr->physical_type() == Type::BYTE_ARRAY;
  uses_values_ = !is_byte_array;
  if (uses_values_) {
    values_ = AllocateBuffer(pool);
  }

  valid_bits_ = AllocateBuffer(pool);
  def_levels_ = AllocateBuffer(pool);
  rep_levels_ = AllocateBuffer(pool);

  Reset();
}
NextBatch 顺序
以下逻辑均位于 cpp/src/parquet/arrow/reader.cc 中:
// Builds an iterator of iterators: each call of the outer function reads one
// chunk of at most batch_size rows from every column reader and returns an
// inner iterator over the record batches of that chunk.
::arrow::Iterator<RecordBatchIterator> batches = ::arrow::MakeFunctionIterator(
// Captured by value and declared `mutable` because num_rows is decremented on
// every invocation and must persist between calls.
[readers, batch_schema, num_rows,
this]() mutable -> ::arrow::Result<RecordBatchIterator> {
// One output slot per column reader.
::arrow::ChunkedArrayVector columns(readers.size());
// Don't reserve more rows than necessary: cap the batch at the rows remaining.
int64_t batch_size = std::min(properties().batch_size(), num_rows);
num_rows -= batch_size;
// Ask every column reader for its next batch, filling columns[i].
// NOTE(review): presumably runs in parallel when use_threads() is true -- confirm
// against OptionalParallelFor.
RETURN_NOT_OK(::arrow::internal::OptionalParallelFor(
reader_properties_.use_threads(), static_cast<int>(readers.size()),
[&](int i) { return readers[i]->NextBatch(batch_size, &columns[i]); }));
// A missing or empty column ends the whole iteration.
for (const auto& column : columns) {
if (column == nullptr || column->length() == 0) {
return ::arrow::IterationTraits<RecordBatchIterator>::End();
}
}
// Assemble the columns into a table and expose it batch by batch.
auto table = ::arrow::Table::Make(batch_schema, std::move(columns));
auto table_reader = std::make_shared<::arrow::TableBatchReader>(*table);
// NB: explicitly preserve table so that table_reader doesn't outlive it
return ::arrow::MakeFunctionIterator(
[table, table_reader] { return table_reader->Next(); });
});
先通过 metadata 累加各个 row group 的行数,算出 num_rows:
// Total number of rows across the selected row groups, taken from the file
// metadata of each group.
int64_t num_rows = 0;
for (const int group_index : row_groups) {
  const int64_t rows_in_group =
      parquet_reader()->metadata()->RowGroup(group_index)->num_rows();
  num_rows += rows_in_group;
}
然后计算真实的 batch_size(不超过剩余的 num_rows):
// Don't reserve more rows than necessary: the batch is capped at the number of
// rows still remaining, and that remainder is then reduced by the batch taken.
int64_t batch_size = std::min(properties().batch_size(), num_rows);
num_rows -= batch_size;
/// Loads the next batch of records and hands it back as a chunked array.
///
/// Calls LoadBatch() and then BuildArray(); afterwards every chunk of the
/// resulting array is checked with Validate(). The first non-OK status from
/// any of these steps is returned immediately.
///
/// \param batch_size  number of records requested
/// \param[out] out    receives the chunked array produced by BuildArray()
/// \return Status::OK() when all steps and all chunk validations succeed
::arrow::Status NextBatch(int64_t batch_size,
                          std::shared_ptr<::arrow::ChunkedArray>* out) final {
  RETURN_NOT_OK(LoadBatch(batch_size));
  RETURN_NOT_OK(BuildArray(batch_size, out));

  // Validate each chunk of the freshly built array.
  const ::arrow::ChunkedArray& built = **out;
  for (int chunk_index = 0; chunk_index < built.num_chunks(); ++chunk_index) {
    RETURN_NOT_OK(built.chunk(chunk_index)->Validate());
  }
  return Status::OK();
}
NextRowGroup 和 TransferColumnData 的调用位置
/// Reads up to `records_to_read` records into the record reader and transfers
/// the result into out_.
///
/// out_ is cleared and the record reader is reset first. Reading continues
/// until the requested number of records has been read or the record reader
/// reports no more data; whenever a read returns zero records, NextRowGroup()
/// is called. The accumulated data is then passed to TransferColumnData().
///
/// \param records_to_read  number of records requested for this batch
/// \return the status of TransferColumnData(), or Status::OK() on success
Status LoadBatch(int64_t records_to_read) final {
  BEGIN_PARQUET_CATCH_EXCEPTIONS
  out_ = nullptr;
  record_reader_->Reset();
  // Pre-allocation gives much better performance for flat columns
  record_reader_->Reserve(records_to_read);

  while (records_to_read > 0 && record_reader_->HasMoreData()) {
    const int64_t num_read = record_reader_->ReadRecords(records_to_read);
    if (num_read == 0) {
      // Nothing came back from this read: move on to the next row group.
      NextRowGroup();
    } else {
      records_to_read -= num_read;
    }
  }

  RETURN_NOT_OK(TransferColumnData(record_reader_.get(), field_->type(), descr_,
                                   ctx_->pool, &out_));
  return Status::OK();
  END_PARQUET_CATCH_EXCEPTIONS
}
边栏推荐
- A50 - 基于51单片机的太阳能充电路灯设计
- 【Web渗透】信息收集篇——Google搜索引擎(一)
- SQL trill interview: send you a universal template, to?(key, each user to log on to the maximum number of consecutive monthly)
- STM32课设-智能物联网家居系统(UCOSIII+STEMWIN)
- 计组——大端方式和小端方式相关题目
- OpenCV 图像变换之 —— 拉伸、收缩、扭曲和旋转
- 插入一个数并排序「建议收藏」
- Sigrity PowerSI Characteristic Impedance and Coupling Simulation
- 无需支付688苹果开发者账号,xcode13打包导出ipa,提供他人进行内测
- 微信开发者工具报错,提示 未找到入口 app.json 文件
猜你喜欢
随机推荐
The Chinese Academy of Sciences slaps Google in the face: ordinary computers catch up with quantum superiority, and can solve calculations that would have taken 10,000 years in a few hours...
NFT+IDO预售代币合约模式系统开发
【嵌入式入门篇】嵌入式0基础沉浸式刷题篇1
B40 - 基于STM32单片机的电热蚊香蓝牙控制系统
Video chat source code - how to improve the quality of one-to-one live broadcast?
【燃】是时候展现真正的实力了!一文看懂2022华为开发者大赛技术亮点
视频聊天源码——一对一直播如何提高直播质量?
网络——数据交换方式
Volatile:JVM 我警告你,我的人你别乱动
网络——IPv6 vs IPv4
「我觉得AI领域乙烷」网友:你说的太多了,让AI来总结一下
STM32课设-智能物联网家居系统(UCOSIII+STEMWIN)
B45 - 基于STM32单片机的家庭防火防盗系统的设计
5G NR Paging
Knowledge Bits - How to Write a Project Summary
现在,怎么挑选舞台租赁LED显示屏?
yolov5训练并生成rknn模型以及3588平台部署
网络——涉及的相关协议和设备汇总
二分法
B49 - 基于STM32单片机的心率血氧检测与远程定位报警装置









