Main repository:
https://github.com/hosseinmoein/DataFrame
Installation is as follows.
Configure vcpkg.json:
{
    "name": "cppwork",
    "version-string": "",
    "homepage": "",
    "description": "",
    "dependencies": [
        {
            "name": "dataframe",
            "version>=": "1.17.0#1"
        }
    ],
    "builtin-baseline": "8275003fd8c5119cb9438fea1425ebe774f2e527"
}
Switch to the ports project directory, then run vcpkg install.
The main idea of this example: the C++ DataFrame library does not support multi-column join or multi-column remove_duplicates, so when converting the JSON into a DataFrame we add one extra column named keys_, into which the string representations of all of a row's columns are concatenated. To keep things generic, the keys_ column is itself stored as the json type.
join and remove_duplicates then simply operate on the keys_ column.
This makes handling JSON data from the network quite convenient: there is no need to write a struct per JSON shape to map it onto the DataFrame, since every field is a json object (think of it as a std::any).
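For example, here is a minimal standalone sketch of how one row's keys_ value is built (nlohmann::json iterates object keys in sorted order, so "age" comes before "name"):
#include "json/json.hpp"
#include <iostream>
#include <string>

using json = nlohmann::json;

int main() {
    json row = json::parse(R"({"name": "Lisi", "age": 20})");
    std::string key;
    // dump() renders each field as its JSON text: 20 -> 20, "Lisi" -> "Lisi" with quotes.
    for (auto it = row.begin(); it != row.end(); ++it) {
        key += it.value().dump();
    }
    std::cout << key << "\n";  // prints: 20"Lisi"
    return 0;
}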
The code structure is as follows:
CMakeLists.txt
df/df.h
impl/df.cpp
test/df_test.cpp
The code is as follows.
CMakeLists.txt
# CMAKE_CXX_STANDARD needs CMake >= 3.1; 2.6 is long deprecated.
cmake_minimum_required(VERSION 3.10)
project(df_test)
# DataFrame needs C++17; set it once here rather than also passing a
# conflicting -std=c++14 through add_definitions.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
add_definitions(-g)
if(APPLE)
    message(STATUS "This is Apple, do nothing.")
elseif(UNIX)
    message(STATUS "This is linux, set CMAKE_PREFIX_PATH.")
    set(CMAKE_PREFIX_PATH /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/share)
endif(APPLE)
find_package(ZLIB)
find_package(glog REQUIRED)
find_package(OpenCV REQUIRED)
find_package(Boost REQUIRED COMPONENTS
    system
    filesystem
    serialization
    program_options
    thread
)
find_package(DataFrame REQUIRED)
if(APPLE)
    MESSAGE(STATUS "This is APPLE, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include /usr/local/iODBC/include /opt/snowflake/snowflakeodbc/include/ ${CMAKE_CURRENT_SOURCE_DIR}/../../)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include ${CMAKE_CURRENT_SOURCE_DIR}/../../)
endif(APPLE)
if(APPLE)
    MESSAGE(STATUS "This is APPLE, set LINK_DIRS")
    set(LINK_DIRS /usr/local/lib /usr/local/iODBC/lib /opt/snowflake/snowflakeodbc/lib/universal)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set LINK_DIRS")
    set(LINK_DIRS ${Boost_LIBRARY_DIRS} /usr/local/lib /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/lib)
endif(APPLE)
if(APPLE)
    MESSAGE(STATUS "This is APPLE, set ODBC_LIBS")
    set(ODBC_LIBS iodbc iodbcinst)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set ODBC_LIBS")
    set(ODBC_LIBS odbc odbcinst ltdl)
endif(APPLE)
include_directories(${INCLUDE_DIRS})
LINK_DIRECTORIES(${LINK_DIRS})
file(GLOB APP_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h
    ${CMAKE_CURRENT_SOURCE_DIR}/../impl/*.cpp)
# Build one executable per *test.cpp file found in this directory.
foreach(sourcefile ${APP_SOURCES})
    file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${sourcefile})
    string(FIND "${filename}" "test.cpp" TEMP)
    if(NOT "${TEMP}" STREQUAL "-1")
        string(REPLACE ".cpp" "" file ${filename})
        add_executable(${file} ${APP_SOURCES})
        target_link_libraries(${file} ${Boost_LIBRARIES} glog::glog DataFrame::DataFrame)
        target_link_libraries(${file} ssl crypto libgtest.a libgtest_main.a libgmock.a ${ODBC_LIBS} pthread)
    endif()
endforeach(sourcefile)
df.h
#ifndef _FREDRIC_DF_H_
#define _FREDRIC_DF_H_

#include "json/json.hpp"

#include <DataFrame/DataFrame.h>
#include <DataFrame/DataFrameFinancialVisitors.h>
#include <DataFrame/DataFrameMLVisitors.h>
#include <DataFrame/DataFrameOperators.h>
#include <DataFrame/DataFrameStatsVisitors.h>

#include <string>
#include <vector>

using json = nlohmann::json;
using CDataFrame = hmdf::StdDataFrame<unsigned int>;
using concat_policy = hmdf::concat_policy;
using join_policy = hmdf::join_policy;

// Name of the synthetic column holding the concatenated string form of every field.
const std::string Keys = "keys_";

struct df_op {
    static CDataFrame convert_json_to_df(const json& js);
    static std::vector<std::string> get_df_keys(const json& js);
    static CDataFrame remove_duplicate(const CDataFrame& df, const std::vector<std::string>& keys_);
    static bool write_to_csv(const CDataFrame& df, const std::string& csv_file_name);
};

#endif  // _FREDRIC_DF_H_
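Before the implementation, a minimal usage sketch of the df_op interface (not from the original post; ./out.csv is a hypothetical path):
#include "df/df.h"

int main() {
    // Two identical rows: after dedup with keep_none, the frame ends up empty.
    auto js = json::parse(R"([{"name": "Lisi", "age": 20}, {"name": "Lisi", "age": 20}])");
    auto df = df_op::convert_json_to_df(js);            // columns: age, name, keys_
    auto deduped = df_op::remove_duplicate(df, {Keys}); // dedup on the synthetic keys_ column
    df_op::write_to_csv(deduped, "./out.csv");
    return 0;
}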
impl/df.cpp
#include "df/df.h"
#include <glog/logging.h>
CDataFrame df_op::convert_json_to_df(const json& js) {
    CDataFrame::set_thread_level(10);
    CDataFrame df;
    // Index type matches StdDataFrame<unsigned int>.
    unsigned int idx = 1u;
    std::vector<unsigned int> ulidxs{};
    std::vector<std::string> keys_{};
    std::map<std::string, std::vector<json>> columns{};

    if (js.size() == 0) {
        LOG(ERROR) << "Json list size is zero, empty list!!" << "\n";
        return df;
    }

    // Take the column names from the first element; nlohmann::json keeps
    // object keys sorted, so the column order is deterministic.
    auto ele_0 = js[0];
    for (auto it = ele_0.begin(); it != ele_0.end(); ++it) {
        keys_.emplace_back(it.key());
    }

    for (auto&& ele_js : js) {
        // Build the synthetic key from the dump of every field, then append
        // each field to its column.
        std::string key{};
        for (const auto& column_key : keys_) {
            key += ele_js[column_key].dump();
            columns[column_key].emplace_back(ele_js[column_key]);
        }
        columns[Keys].emplace_back(json(key));
        ulidxs.emplace_back(idx++);
    }

    df.load_index(ulidxs.begin(), ulidxs.end());
    for (auto&& key : keys_) {
        df.load_column<json>(key.c_str(), {columns[key].begin(), columns[key].end()}, hmdf::nan_policy::pad_with_nans);
    }
    df.load_column<json>(Keys.c_str(), {columns[Keys].begin(), columns[Keys].end()}, hmdf::nan_policy::pad_with_nans);
    return df;
}
std::vector<std::string> df_op::get_df_keys(const json& js) {
    std::vector<std::string> keys_{};
    if (js.size() == 0) {
        LOG(ERROR) << "Json list size is zero, empty list!!" << "\n";
        return keys_;
    }
    // Column names come from the first element, in nlohmann::json's sorted key order.
    auto ele_0 = js[0];
    for (auto it = ele_0.begin(); it != ele_0.end(); ++it) {
        keys_.emplace_back(it.key());
    }
    return keys_;
}
// remove_duplicates takes one template type argument per key column, so the
// runtime key count has to be dispatched to the matching instantiation.
CDataFrame df_op::remove_duplicate(const CDataFrame& df, const std::vector<std::string>& keys_) {
    auto size_ = keys_.size();
    if (size_ == 1) {
        return df.remove_duplicates<json>(keys_[0].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if (size_ == 2) {
        return df.remove_duplicates<json, json>(keys_[0].c_str(), keys_[1].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if (size_ == 3) {
        return df.remove_duplicates<json, json, json>(keys_[0].c_str(), keys_[1].c_str(), keys_[2].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if (size_ == 4) {
        return df.remove_duplicates<json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(), keys_[2].c_str(), keys_[3].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if (size_ == 5) {
        return df.remove_duplicates<json, json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(), keys_[2].c_str(), keys_[3].c_str(), keys_[4].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if (size_ == 6) {
        return df.remove_duplicates<json, json, json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(), keys_[2].c_str(), keys_[3].c_str(), keys_[4].c_str(), keys_[5].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else {
        throw std::runtime_error("Unsupported number of key columns: more than 6!");
    }
}
bool df_op::write_to_csv(const CDataFrame& df, const std::string& csv_file_name) {
    std::fstream fs{csv_file_name, std::ios::out | std::ios::trunc};
    if (!fs.is_open()) {
        LOG(ERROR) << "Open file failed" << "\n";
        return false;
    }
    df.write<std::ostream, json>(fs, hmdf::io_format::csv2, true);
    fs.close();
    return true;
}
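The if/else ladder in remove_duplicate above is forced by the API, since the key count must become a compile-time arity. As a sketch (using only the remove_duplicates overloads already called above), the repetition could be collapsed with std::index_sequence; a small switch is still needed to choose the arity:
#include <utility>

// Expands to remove_duplicates<json, json, ...>(keys[0].c_str(), ..., false, keep_none),
// with one json type and one column name per element of the index sequence.
template <std::size_t... Is>
static CDataFrame remove_dup_n(const CDataFrame& df,
                               const std::vector<std::string>& keys,
                               std::index_sequence<Is...>) {
    return df.remove_duplicates<decltype((void(Is), json{}))...>(
        keys[Is].c_str()..., false, hmdf::remove_dup_spec::keep_none);
}

// Usage: case 3 of the dispatch switch becomes
//   return remove_dup_n(df, keys_, std::make_index_sequence<3>{});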
test/df_test.cpp
#include "df/df.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "json/json.hpp"
#include <fstream>
#include <sstream>
using json = nlohmann::json;
int main(int argc, char** argv) {
FLAGS_log_dir = "./";
FLAGS_alsologtostderr = true;
google::InitGoogleLogging("./logs.log");
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
GTEST_TEST(DFTests, ConcatTest) {
    auto js_str =
        R"([{"name": "Lisi", "age": 20}, {"name": "Zhangsan", "age": 18}, {"name": "WangWu", "age": 28}])";
    auto js_str1 =
        R"([{"name": "WangWu", "age": 28}, {"name": "Lisi", "age": 20}, {"name": "Zhangsan", "age": 18}])";
    auto js = json::parse(js_str);
    auto js1 = json::parse(js_str1);
    auto df = df_op::convert_json_to_df(js);
    auto df1 = df_op::convert_json_to_df(js1);
    auto df3 = df.concat<decltype(df1), json>(df1, concat_policy::all_columns);
    df3.write<std::ostream, json>(LOG(INFO));

    // The two frames hold the same three rows, so after concat every row is
    // duplicated; keep_none drops all of them and the result is empty.
    std::vector<std::string> keys_{Keys};
    auto df4 = df_op::remove_duplicate(df3, keys_);
    df4.write<std::ostream, json>(LOG(INFO));
    LOG(INFO) << df4.shape().first << " " << df4.shape().second << "\n";
    auto df_rows = df4.shape().first;
    ASSERT_EQ(0, df_rows);
}
GTEST_TEST(DFTests, JoinTest) {
    auto js_str =
        R"([{"name": "Lisi", "age": 20}, {"name": "Zhangsan", "age": 18}, {"name": "WangWu", "age": 28}])";
    auto js_str1 =
        R"([{"name": "WangWu", "age": 38}, {"name": "Lisi", "age": 20}, {"name": "Zhangsan", "age": 18}])";
    auto js = json::parse(js_str);
    auto js1 = json::parse(js_str1);
    auto df = df_op::convert_json_to_df(js);
    auto df1 = df_op::convert_json_to_df(js1);

    // WangWu's age differs between the two frames, so its keys_ values do not
    // match; the inner join on keys_ keeps only the Lisi and Zhangsan rows.
    auto df3 = df.join_by_column<decltype(df1), json, json, json>(df1, Keys.c_str(), join_policy::inner_join);
    df3.write<std::ostream, json>(LOG(INFO));
    df_op::write_to_csv(df3, "./2.csv");
}
The program output is as follows.