我有一些代码,它迭代地接收转储到HDF5文件中的数据。这是我要实现的玩具示例:
#include <HDFql.hpp>
// Creates (truncating if it exists) an HDF5 file containing the groups
// events/, frames/ and optic_flow/, plus four unlimited chunked 1-D
// datasets under events/ for the event stream (x, y, timestamp, polarity).
// @param filepath  path of the HDF5 file to (re)create
void createHDF(const std::string & filepath)
{
    char script_[1024];
    // snprintf bounds the write to the buffer; the original sprintf would
    // overflow script_ for a sufficiently long filepath.
    snprintf(script_, sizeof(script_), "CREATE TRUNCATE FILE %s", filepath.c_str());
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "USE FILE %s", filepath.c_str());
    HDFql::execute(script_);
    // Constant scripts need no formatting step — pass the literal directly.
    HDFql::execute("CREATE GROUP events");
    HDFql::execute("CREATE CHUNKED DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED DATASET events/ps AS TINYINT(UNLIMITED)");
    HDFql::execute("CREATE GROUP frames");
    HDFql::execute("CREATE GROUP optic_flow");
}
// Appends one batch of events (x, y, timestamp, polarity) to the currently
// configured HDF5 file: grows each unlimited dataset by the batch size,
// then bulk-inserts each input vector into the newly added tail region.
// NOTE(review): script_, HDF5_path_, total_events_added_ and events_idx_
// are not declared in this snippet — presumably members of an enclosing
// class; confirm against the full source.
// NOTE(review): ts_v is a const reference, so &ts_v[0] is a pointer to
// const — verify HDFql::variableRegister accepts it.
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v,
std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
//Input arrays are all the same size
const int data_size = ts_v.size();
//Open file
// Re-opens (re-selects) the file on every call — hoisting this out of the
// per-batch path would avoid repeated open overhead.
sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
HDFql::execute(script_);
//Add events
// Extend each unlimited dataset by data_size elements before inserting.
sprintf(script_, "ALTER DIMENSION events/xs TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ys TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ts TO +%d", data_size);
HDFql::execute(script_);
sprintf(script_, "ALTER DIMENSION events/ps TO +%d", data_size);
HDFql::execute(script_);
// For each dataset: register the source buffer with HDFql, insert into the
// last data_size positions via the hyperslab -data_size:1:1:data_size,
// then unregister. variableGetNumber resolves the cursor number HDFql
// assigned to the registered buffer.
HDFql::variableRegister(&xs_v[0]);
sprintf(script_, "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&xs_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&xs_v[0]);
HDFql::variableRegister(&ys_v[0]);
sprintf(script_, "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ys_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ys_v[0]);
HDFql::variableRegister(&ts_v[0]);
sprintf(script_, "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ts_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ts_v[0]);
HDFql::variableRegister(&ps_v[0]);
sprintf(script_, "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY %d", data_size,
data_size, HDFql::variableGetNumber(&ps_v[0]));
HDFql::execute(script_);
HDFql::variableUnregister(&ps_v[0]);
// Running totals — members of the enclosing class (not shown here).
total_events_added_ += data_size;
events_idx_++;
}
// Toy driver: creates the file, builds one batch of 1000 synthetic events,
// and appends that same batch 10000 times (10M events total).
int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);
    const int data_size = 1000;
    const int iterations = 10000;
    // BUG FIX: the original constructed the vectors with data_size elements
    // AND push_back'ed data_size more (ending up with 2*data_size values,
    // half of them zero), and pushed into undeclared names (ts_v vs ts).
    // Construct empty, reserve, then fill exactly data_size elements.
    std::vector<double> ts;
    std::vector<int16_t> xs;
    std::vector<int16_t> ys;
    std::vector<int8_t> ps;
    ts.reserve(data_size);
    xs.reserve(data_size);
    ys.reserve(data_size);
    ps.reserve(data_size);
    for(int i=0; i<data_size; i++)
    {
        ts.push_back(i);
        xs.push_back(i);
        ys.push_back(i);
        ps.push_back(1);
    }
    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }
    return 0;
}
此代码运行得极其缓慢。而使用诸如 cnpy
之类的其他二进制库时,同样的过程眨眼之间就能完成,因此问题不在于写入的数据量。我想知道这是 HDFql 本身的特性,还是我的代码某处存在错误。
非常感谢!
cnpy
是否执行与HDFql中相同的操作(例如,使用等于1的块大小来扩展数据集events/xs
,events/ys
,events/ts
和events/ps
的维度)?
查看您的代码,您可能希望把数据集的块大小(chunk size)显式指定为等于 ts_v.size()
的值,因为这很可能会大大提高性能。目前,为方便起见,HDFql 会自动(使用最佳猜测的启发式方法)为您计算块大小,但这未必能带来最佳性能。您可以像这样显式指定块大小,例如 CREATE CHUNKED(1000) DATASET events/xs AS SMALLINT(UNLIMITED)
。
下面是您的代码的一个更优化的版本:
#include <HDFql.hpp>
// Creates (truncating if it exists) an HDF5 file with the groups events/,
// frames/ and optic_flow/, plus four unlimited chunked 1-D datasets under
// events/ sized for batch appends.
// @param filepath  path of the HDF5 file to (re)create
void createHDF(const std::string & filepath)
{
    char script_[1024];
    // snprintf bounds the write; sprintf would overflow for a long filepath.
    snprintf(script_, sizeof(script_), "CREATE TRUNCATE FILE %s", filepath.c_str());
    HDFql::execute(script_);
    snprintf(script_, sizeof(script_), "USE FILE %s", filepath.c_str());
    HDFql::execute(script_);
    // One script can create all three groups at once.
    HDFql::execute("CREATE GROUP events, frames, optic_flow");
    // Explicit chunk size equal to the per-call batch size (1000 events, the
    // data_size used in main), as recommended above — HDFql's automatic
    // best-guess chunking can be far slower for this append pattern.
    HDFql::execute("CREATE CHUNKED(1000) DATASET events/xs AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED(1000) DATASET events/ys AS SMALLINT(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED(1000) DATASET events/ts AS DOUBLE(UNLIMITED)");
    HDFql::execute("CREATE CHUNKED(1000) DATASET events/ps AS TINYINT(UNLIMITED)");
}
// Appends one batch of events to the HDF5 file: grows all four unlimited
// datasets in a single script, then bulk-inserts each input vector into the
// newly added tail region.
// NOTE(review): script_, HDF5_path_, total_events_added_ and events_idx_
// are not declared in this snippet — presumably members of an enclosing
// class; confirm against the full source.
// NOTE(review): the hard-coded memory numbers 0-3 below assume the caller
// registered the vectors with HDFql::variableRegister in exactly the order
// xs, ys, ts, ps (as main does) — this coupling is fragile; verify.
void writeData(const std::vector<double>& ts_v, std::vector<int16_t>& xs_v, std::vector<int16_t>& ys_v, std::vector<int8_t>& ps_v)
{
//Input arrays are all the same size
const int data_size = ts_v.size();
//Open file
// Still re-selects the file every call; the closing remark in this post
// suggests hoisting this out so the file is opened only once.
sprintf(script_, "USE FILE %s", HDF5_path_.c_str());
HDFql::execute(script_);
//Add events
// One script extends all four unlimited datasets by data_size elements.
sprintf(script_, "ALTER DIMENSION events/xs, events/ys, events/ts, events/ps TO +%d", data_size);
HDFql::execute(script_);
// Insert into the last data_size positions of each dataset (hyperslab
// -data_size:1:1:data_size) from the pre-registered memory buffers.
sprintf(script_, "INSERT INTO events/xs(-%d:1:1:%d) VALUES FROM MEMORY 0", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ys(-%d:1:1:%d) VALUES FROM MEMORY 1", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ts(-%d:1:1:%d) VALUES FROM MEMORY 2", data_size, data_size);
HDFql::execute(script_);
sprintf(script_, "INSERT INTO events/ps(-%d:1:1:%d) VALUES FROM MEMORY 3", data_size, data_size);
HDFql::execute(script_);
// Running totals — members of the enclosing class (not shown here).
total_events_added_ += data_size;
events_idx_++;
}
// Toy driver: creates the file, builds one batch of 1000 synthetic events,
// registers the buffers with HDFql once, then appends the batch 10000 times.
int main (int argc, const char * argv[]) {
    std::string path = "/tmp/test.h5";
    createHDF(path);
    const int data_size = 1000;
    const int iterations = 10000;
    // BUG FIX: the original constructed the vectors with data_size elements
    // AND push_back'ed data_size more (2*data_size total), and referenced
    // undeclared names (ts_v vs ts). Construct empty, reserve, then fill.
    std::vector<double> ts;
    std::vector<int16_t> xs;
    std::vector<int16_t> ys;
    std::vector<int8_t> ps;
    ts.reserve(data_size);
    xs.reserve(data_size);
    ys.reserve(data_size);
    ps.reserve(data_size);
    for(int i=0; i<data_size; i++)
    {
        ts.push_back(i);
        xs.push_back(i);
        ys.push_back(i);
        ps.push_back(1);
    }
    // Register each buffer once, outside the write loop. The registration
    // ORDER must match the hard-coded memory numbers used in writeData:
    // xs -> 0, ys -> 1, ts -> 2, ps -> 3.
    HDFql::variableRegister(&xs);
    HDFql::variableRegister(&ys);
    HDFql::variableRegister(&ts);
    HDFql::variableRegister(&ps);
    for(int i=0; i<iterations; i++)
    {
        writeData(ts, xs, ys, ps);
    }
    return 0;
}
此外,如果可能的话,请将 sprintf(script_, "USE FILE %s", HDF5_path_.c_str()); 以及紧随其后的 HDFql::execute(script_); 这两行移到 writeData
函数之外,使文件只被打开一次。