hosseinmoein DataFrame: custom columns, similar to Spark

Author: FredricZhu | Published 2022-02-10 13:44

CMakeLists.txt


cmake_minimum_required(VERSION 3.8) # CMAKE_CXX_STANDARD 17 requires CMake >= 3.8
project(df_test)

set(CMAKE_CXX_STANDARD 17)
add_definitions(-g)

if(APPLE)
    message(STATUS "This is Apple, do nothing.")
elseif(UNIX)
    message(STATUS "This is linux, set CMAKE_PREFIX_PATH.")
    set(CMAKE_PREFIX_PATH /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/share)
endif(APPLE)

find_package(ZLIB)

find_package(glog REQUIRED)
find_package(OpenCV REQUIRED )

find_package(Boost REQUIRED COMPONENTS
    system
    filesystem
    serialization
    program_options
    thread
    )

find_package(DataFrame REQUIRED)

if(APPLE)
    MESSAGE(STATUS "This is APPLE, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include /usr/local/iODBC/include /opt/snowflake/snowflakeodbc/include/ ${CMAKE_CURRENT_SOURCE_DIR}/../../)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include ${CMAKE_CURRENT_SOURCE_DIR}/../../)
endif(APPLE)


if(APPLE)
    MESSAGE(STATUS "This is APPLE, set LINK_DIRS")
    set(LINK_DIRS /usr/local/lib /usr/local/iODBC/lib /opt/snowflake/snowflakeodbc/lib/universal)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set LINK_DIRS")
    set(LINK_DIRS ${Boost_INCLUDE_DIRS} /usr/local/lib /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/lib)
endif(APPLE)

if(APPLE)
    MESSAGE(STATUS "This is APPLE, set ODBC_LIBS")
    set(ODBC_LIBS iodbc iodbcinst)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set LINK_DIRS")
    set(ODBC_LIBS odbc odbcinst ltdl)
endif(APPLE)

include_directories(${INCLUDE_DIRS})
LINK_DIRECTORIES(${LINK_DIRS})

file( GLOB test_file_list ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) 


file( GLOB APP_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.h
    ${CMAKE_CURRENT_SOURCE_DIR}/../impl/*.cpp)

add_library(${PROJECT_NAME}_lib SHARED ${APP_SOURCES})
target_link_libraries(${PROJECT_NAME}_lib ${Boost_LIBRARIES} ZLIB::ZLIB libgtest.a glog::glog DataFrame::DataFrame ${OpenCV_LIBS})

foreach( test_file ${test_file_list} )
    file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${test_file})
    string(REPLACE ".cpp" "" file ${filename})
    add_executable(${file}  ${test_file})
    target_link_libraries(${file} ${PROJECT_NAME}_lib)
endforeach( test_file ${test_file_list})

df/df.h

#ifndef _FREDRIC_DF_H_
#define _FREDRIC_DF_H_

#include "json/json.hpp"

#include <DataFrame/DataFrame.h>
#include <DataFrame/DataFrameFinancialVisitors.h>
#include <DataFrame/DataFrameMLVisitors.h>
#include <DataFrame/DataFrameOperators.h>
#include <DataFrame/DataFrameStatsVisitors.h>

#include <string>
#include <vector>


using json = nlohmann::json;

// DataFrame keyed by an unsigned int index
using CDataFrame = hmdf::StdDataFrame<unsigned int>;
// DataFrame keyed by an unsigned long index
using CLDataFrame = hmdf::StdDataFrame<unsigned long>;

// DataFrame Iterator type
template <typename T>
using CDFIdxIteratorT = typename hmdf::StdDataFrame<T>::IndexVecType::iterator;

using CDataFrameIdxItType = CDFIdxIteratorT<unsigned int>;
using CLDataFrameIdxItType = CDFIdxIteratorT<unsigned long>;

using concat_policy = hmdf::concat_policy;
using join_policy = hmdf::join_policy;

const std::string Keys = "keys_";

struct df_op {
    static CDataFrame convert_json_to_df(const json& js, const std::vector<std::string>& pri_keys_);
    static std::vector<std::string> get_df_keys(const json& js);
    static CDataFrame remove_duplicate(const CDataFrame& df, const std::vector<std::string>& keys_);
    static bool write_to_csv(const CDataFrame& df, const std::string& csv_file_name);
}; 

#endif
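
For reference, a minimal usage sketch of this interface (the JSON payload and the output file name below are hypothetical, not from the original post):

#include "df/df.h"

// Hypothetical example: build a DataFrame from a JSON array, drop duplicated
// rows on the two listed key columns, and dump the result to CSV.
void df_op_usage_example() {
    json js = json::parse(R"([
        {"name": "a", "value": 1},
        {"name": "a", "value": 1},
        {"name": "b", "value": 2}
    ])");

    auto df = df_op::convert_json_to_df(js, {"name", "value"});
    // keep_none removes every row whose key combination occurs more than once.
    auto deduped = df_op::remove_duplicate(df, {"name", "value"});
    df_op::write_to_csv(deduped, "deduped.csv");
}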

df/impl/df.cpp

#include "df/df.h"

#include <glog/logging.h>

CDataFrame df_op::convert_json_to_df(const json& js, const std::vector<std::string>& pri_keys_) {
    CDataFrame::set_thread_level(10);
    CDataFrame df;

    unsigned long idx = 1ul;
    std::vector<unsigned long> ulidxs{};

    std::vector<std::string> keys_ = get_df_keys(js);
    if(keys_.size() == 0) {
        return df;
    }
    
    std::map<std::string, std::vector<json>> columns {};

    for (auto&& ele_js : js) {
        std::string key {};
        for(auto column_key: keys_) {
            if(columns.find(column_key) == columns.end()) {
                std::vector<json> tmp_v {ele_js[column_key]};
                columns[column_key] = std::move(tmp_v);
            } else {
                columns[column_key].emplace_back(std::move(ele_js[column_key]));
            }   
            // If no primary keys were specified, every column is treated as part of the primary key
            if(pri_keys_.size() == 0) {
                key +=  ele_js[column_key].dump();
            } else {
                auto key_it_ = std::find(pri_keys_.begin(), pri_keys_.end(), column_key);
                if(key_it_ != pri_keys_.end()) {
                    key +=  ele_js[column_key].dump();
                }
            }
        }

        if(columns.find(Keys) == columns.end()) {
            std::vector<json> tmp_v {json(key)};
            columns[Keys] = std::move(tmp_v);
        } else {
            columns[Keys].emplace_back(std::move(json(key)));
        }

        ulidxs.emplace_back(idx++);
    }
    
    df.load_index(ulidxs.begin(), ulidxs.end());
    for(auto&& key: keys_) {
        df.load_column<json>(key.c_str(), {columns[key].begin(), columns[key].end()}, hmdf::nan_policy::pad_with_nans);
    }

    df.load_column<json>(Keys.c_str(), {columns[Keys].begin(), columns[Keys].end()}, hmdf::nan_policy::pad_with_nans);
    return df;
}

std::vector<std::string> df_op::get_df_keys(const json& js) {
    std::vector<std::string> keys_{};
    if(js.size() == 0) {
        LOG(ERROR) << "Json list size is zero, empty list!!" << "\n";
        return keys_;
    }

    auto ele_0 = js[0];
    for (auto &&begin = ele_0.begin(), end = ele_0.end(); begin != end; ++begin) {
        auto key = begin.key();
        keys_.emplace_back(key);
    }
    return keys_;
}

CDataFrame df_op::remove_duplicate(const CDataFrame& df, const std::vector<std::string>& keys_) {
    auto size_ = keys_.size();
    if(size_ == 1) {
        return df.remove_duplicates<json>(keys_[0].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if(size_ == 2) {
        return df.remove_duplicates<json, json>(keys_[0].c_str(), keys_[1].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if(size_ == 3) {
        return df.remove_duplicates<json, json, json>(keys_[0].c_str(), keys_[1].c_str(),keys_[2].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if(size_ == 4) {
        return df.remove_duplicates<json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(),keys_[2].c_str(), keys_[3].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if(size_ == 5) {
        return df.remove_duplicates<json, json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(),keys_[2].c_str(), keys_[3].c_str(), keys_[4].c_str(), false, hmdf::remove_dup_spec::keep_none);
    } else if(size_ == 6) {
        return df.remove_duplicates<json, json, json, json, json, json>(keys_[0].c_str(), keys_[1].c_str(),keys_[2].c_str(), keys_[3].c_str(), keys_[4].c_str(), keys_[5].c_str() , false, hmdf::remove_dup_spec::keep_none);
    } else {
        throw std::runtime_error("Not supported argument length, greater than 6!");
    }
}

bool df_op::write_to_csv(const CDataFrame& df, const std::string& csv_file_name) {
    std::fstream fs {csv_file_name, std::ios::out | std::ios::trunc};
    if(!fs.is_open()) {
        LOG(ERROR) << "Open file failed" << "\n";
        return false;
    }
    
    df.write<std::ostream, json>(fs, hmdf::io_format::csv2, true);
    fs.close();
    return true;
}
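
Since convert_json_to_df() already concatenates the primary-key values into the keys_ column, the size-based dispatch in remove_duplicate() could in principle be collapsed into a single-column call. A sketch of that alternative (an assumption about equivalent behaviour, not what the code above does):

// Alternative sketch: de-duplicate on the synthetic "keys_" column only.
// This mirrors the single-column branch above and avoids capping the key
// count at six, as long as keys_ was built from the same primary-key columns.
CDataFrame remove_duplicate_by_keys(const CDataFrame& df) {
    return df.remove_duplicates<json>(Keys.c_str(), false,
                                      hmdf::remove_dup_spec::keep_none);
}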

df_combine_col_test.cpp

#include "df/df.h"

#include <glog/logging.h>
#include <gtest/gtest.h>

#include "json/json.hpp"

#include <fstream>
#include <sstream>

using json = nlohmann::json;

int main(int argc, char** argv) {
    FLAGS_log_dir = "./";
    FLAGS_alsologtostderr = true;
    google::InitGoogleLogging("./logs.log");
    testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
}

static double my_max_3(double const& d1, double const& d2, double const& d3) {
    return std::max<double>({d1, d2, d3});
}

static double my_max_4(double const& d1, double const& d2, double const& d3, double const& d4) {
    return std::max<double>({d1, d2, d3, d4});
}

GTEST_TEST(DFCombineColTests, CombineThreeCol) {
    // Combine three columns, taking the element-wise maximum
    LOG(INFO) << "\nTesting combine() three cols ...\n";

    std::vector<unsigned long> idx1 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx2 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx3 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx4 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<double> d1 {
        1, 2, 100, 4, 5, 6, 7, 8, 9, 10, 11, 300, 13, 14, 15, 16, 17, 18, 19, 20, 200
    };
    std::vector<double> d2 {
        1, 2, 1000, 4, 5, 6, 7, 8, 9, 10, 11, 3000, 13, 14, 15, 16, 17, 18, 19, 20, 2000
    };
    std::vector<double> d3 {
        1, 2, 5000, 4, 5, 6, 7, 8, 9, 10, 11, 7000, 13, 14, 15, 16, 17, 18, 19, 20, 8000 
    };
    std::vector<double> d4 {
        1, 2, 10000, 4, 5, 6, 7, 8, 9, 10, 11, 20000, 13, 14, 15, 16, 17, 18, 19, 20, 30000
    };

    CLDataFrame df1, df2, df3, df4;

    df1.load_data(
        std::move(idx1),
        std::make_pair("d1_col", d1)
    );

    df2.load_data(
        std::move(idx2),
        std::make_pair("d1_col", d2)
    );

    df3.load_data(
        std::move(idx3),
        std::make_pair("d1_col", d3)
    );

    df4.load_data(
        std::move(idx4),
        std::make_pair("d1_col", d4)
    );

    df1.load_column("d2_col", std::move(df1.combine<double>("d1_col", df2, df3, my_max_3)));

    df1.write<std::ostream, double>(std::cout);

    std::vector<double> result {1,2,5000,4,5,6,7,8,9,10,11,7000,13,14,15,16,17,18,19,20,8000};
    ASSERT_EQ(result, df1.get_column<double>("d2_col"));
}

GTEST_TEST(DFCombineColTests, CombineFourCol) {
    // Combine four columns, taking the element-wise maximum
    LOG(INFO) << "\nTesting combine() four cols ...\n";

    std::vector<unsigned long> idx1 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx2 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx3 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<unsigned long> idx4 {
        123450, 123451, 123452, 123453, 123454, 123455, 123456,
        123457, 123458, 123459, 123460, 123461, 123462, 123466,
        123467, 123468, 123469, 123470, 123471, 123472, 123473
    };

    std::vector<double> d1 {
        1, 2, 100, 4, 5, 6, 7, 8, 9, 10, 11, 300, 13, 14, 15, 16, 17, 18, 19, 20, 200
    };
    std::vector<double> d2 {
        1, 2, 1000, 4, 5, 6, 7, 8, 9, 10, 11, 3000, 13, 14, 15, 16, 17, 18, 19, 20, 2000
    };
    std::vector<double> d3 {
        1, 2, 5000, 4, 5, 6, 7, 8, 9, 10, 11, 7000, 13, 14, 15, 16, 17, 18, 19, 20, 8000 
    };
    std::vector<double> d4 {
        1, 2, 10000, 4, 5, 6, 7, 8, 9, 10, 11, 20000, 13, 14, 15, 16, 17, 18, 19, 20, 30000
    };

    CLDataFrame df1, df2, df3, df4;

    df1.load_data(
        std::move(idx1),
        std::make_pair("d1_col", d1)
    );

    df2.load_data(
        std::move(idx2),
        std::make_pair("d1_col", d2)
    );

    df3.load_data(
        std::move(idx3),
        std::make_pair("d1_col", d3)
    );

    df4.load_data(
        std::move(idx4),
        std::make_pair("d1_col", d4)
    );

    df1.load_column("d2_col", std::move(df1.combine<double>("d1_col", df2, df3, df4, my_max_4)));

    df1.write<std::ostream, double>(std::cout);

    std::vector<double> result {1, 2, 10000, 4, 5, 6, 7, 8, 9, 10, 11, 20000, 13, 14, 15, 16, 17, 18, 19, 20, 30000};
    ASSERT_EQ(result, df1.get_column<double>("d2_col"));
}
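
The same pattern works with any callable, not just the free functions my_max_3 / my_max_4. A minimal sketch using a lambda (and assuming combine() also has an overload that takes a single other DataFrame plus a 2-argument callable, mirroring the 3- and 4-frame forms above):

GTEST_TEST(DFCombineColTests, CombineTwoColWithLambda) {
    std::vector<unsigned long> idx1 {1, 2, 3};
    std::vector<unsigned long> idx2 {1, 2, 3};
    std::vector<double> d1 {1.0, 2.0, 3.0};
    std::vector<double> d2 {10.0, 20.0, 30.0};

    CLDataFrame df1, df2;
    df1.load_data(std::move(idx1), std::make_pair("d1_col", d1));
    df2.load_data(std::move(idx2), std::make_pair("d1_col", d2));

    // Any callable works here; a named lambda replaces the free functions above.
    auto my_sum_2 = [](double const& a, double const& b) { return a + b; };
    df1.load_column("sum_col", df1.combine<double>("d1_col", df2, my_sum_2));

    std::vector<double> expected {11.0, 22.0, 33.0};
    ASSERT_EQ(expected, df1.get_column<double>("sum_col"));
}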

The program output, a dump of df1 including the combined d2_col column, is shown below.

[image: program output]
