本文使用的C++标准是C++17,使用了两个知识点,
- 第一个是使用std::filesystem遍历文件目录;
- 第二个是使用nlohmann::json库做json解析。
文件目录结构如下,
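(原文此处为目录结构截图 image.png,图片已缺失;根据下文给出的文件路径,目录结构如下)
├── include/utils/conv_util.h
├── include/utils/zepplin_mig_cfg.h
├── impl/utils/conv_util.cpp
├── impl/utils/zepplin_mig_cfg.cpp
├── test/CMakeLists.txt
└── test/conv_nb_2_py_test.cpp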
CMakeLists.txt文件如下,
# CMAKE_CXX_STANDARD 17 and the imported targets linked further down
# (ZLIB::ZLIB, glog::glog, OpenSSL::SSL, ...) need a much newer CMake than 2.6;
# 3.8 is the first release that understands C++17 as a standard level.
cmake_minimum_required(VERSION 3.8)

# Point CMake at the vcpkg-installed package configuration files for the
# current platform so the find_package() calls below can locate them.
if(APPLE)
    message(STATUS "This is Apple, do nothing.")
    set(CMAKE_MACOSX_RPATH 1)
    set(CMAKE_PREFIX_PATH /Users/aabjfzhu/software/vcpkg/ports/cppwork/vcpkg_installed/x64-osx/share )
elseif(UNIX)
    message(STATUS "This is linux, set CMAKE_PREFIX_PATH.")
    set(CMAKE_PREFIX_PATH /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/share)
endif()

project(zepplin_mig)

# The sources use std::filesystem, so C++17 is mandatory: refuse to silently
# fall back to an older standard when the compiler does not support it.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# -g is a compiler option, not a preprocessor definition, so it belongs in
# add_compile_options() rather than add_definitions().
add_compile_options(-g)
# Third-party dependencies. Every package whose targets or variables are used
# in target_link_libraries() is REQUIRED so that a missing package stops the
# configure step here instead of surfacing later as an unknown-target error.
find_package(ZLIB REQUIRED)
find_package(OpenCV REQUIRED )
find_package(Arrow CONFIG REQUIRED)
find_package(unofficial-brotli REQUIRED)
find_package(unofficial-utf8proc CONFIG REQUIRED)
find_package(Thrift CONFIG REQUIRED)
find_package(glog REQUIRED)
find_package(OpenSSL REQUIRED)
find_package(Boost REQUIRED COMPONENTS
    system
    filesystem
    serialization
    program_options
    thread
)
find_package(DataFrame REQUIRED)
# Platform-specific include directories, linker search directories and ODBC
# driver-manager libraries (iODBC on macOS, unixODBC on Linux).
if(APPLE)
    MESSAGE(STATUS "This is APPLE, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include /usr/local/iODBC/include /opt/snowflake/snowflakeodbc/include/ ${CMAKE_CURRENT_SOURCE_DIR}/../include/ ${CMAKE_CURRENT_SOURCE_DIR}/../../../include)

    MESSAGE(STATUS "This is APPLE, set LINK_DIRS")
    set(LINK_DIRS /usr/local/lib /usr/local/iODBC/lib /opt/snowflake/snowflakeodbc/lib/universal)

    MESSAGE(STATUS "This is APPLE, set ODBC_LIBS")
    set(ODBC_LIBS iodbc iodbcinst)
elseif(UNIX)
    MESSAGE(STATUS "This is linux, set INCLUDE_DIRS")
    set(INCLUDE_DIRS ${Boost_INCLUDE_DIRS} /usr/local/include ${CMAKE_CURRENT_SOURCE_DIR}/../include/ ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/)

    MESSAGE(STATUS "This is linux, set LINK_DIRS")
    # Linker search paths must come from Boost's library directories; the
    # include directories that were listed here before contain no libraries.
    set(LINK_DIRS ${Boost_LIBRARY_DIRS} /usr/local/lib /vcpkg/ports/cppwork/vcpkg_installed/x64-linux/lib)

    # This branch sets ODBC_LIBS, so the status message now says so
    # (it previously repeated the LINK_DIRS message).
    MESSAGE(STATUS "This is linux, set ODBC_LIBS")
    set(ODBC_LIBS odbc odbcinst ltdl)
endif()

include_directories(${INCLUDE_DIRS})
LINK_DIRECTORIES(${LINK_DIRS})
# Every .cpp next to this CMakeLists.txt is a stand-alone test program.
file(GLOB test_file_list ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)

# Project sources plus the shared helper implementations they depend on.
file(GLOB APP_SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/../impl/utils/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../include/utils/*.h
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/arr_/impl/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/http/impl/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/yaml/impl/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/df/impl/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/death_handler/impl/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/api_accuracy/utils/*.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/../../../include/api_accuracy/impl/*.cpp
)

# The library is built from APP_SOURCES only. The former trailing ${test_file}
# referred to the foreach() loop variable before the loop existed, so it
# always expanded to nothing; it has been dropped.
add_library(${PROJECT_NAME}_lib SHARED ${APP_SOURCES})
target_link_libraries(${PROJECT_NAME}_lib ${Boost_LIBRARIES} ZLIB::ZLIB glog::glog DataFrame::DataFrame ${OpenCV_LIBS})
target_link_libraries(${PROJECT_NAME}_lib OpenSSL::SSL OpenSSL::Crypto libgtest.a pystring libyaml-cpp.a libgmock.a ${ODBC_LIBS} libnanodbc.a pthread dl backtrace libzstd.a libbz2.a libsnappy.a re2::re2 parquet lz4 unofficial::brotli::brotlidec-static unofficial::brotli::brotlienc-static unofficial::brotli::brotlicommon-static utf8proc thrift::thrift arrow arrow_dataset)

# One executable per test source, named after the file without its extension.
foreach(test_file ${test_file_list})
    file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${test_file})
    # Strip only the trailing ".cpp"; a plain REPLACE would also remove
    # ".cpp" occurring in the middle of a file name.
    string(REGEX REPLACE "\\.cpp$" "" test_name ${filename})
    add_executable(${test_name} ${test_file})
    target_link_libraries(${test_name} ${PROJECT_NAME}_lib)
endforeach()
include/utils/conv_util.h
// Include guard: identifiers that begin with an underscore followed by an
// upper-case letter are reserved for the implementation, so the guard name
// carries no leading underscore.
#ifndef FREDRIC_ZEPPLIN_MIG_CONV_UTIL_H_
#define FREDRIC_ZEPPLIN_MIG_CONV_UTIL_H_
#include <vector>
#include <string>

/// One directory entry that may be converted to a Python script.
struct Notebook {
    /// Path of the directory entry as reported by the directory scan.
    std::string path;
    /// Output script name: the entry's file name up to its first '.', plus ".py".
    std::string py_name;
};

/// Stateless helpers that turn notebook JSON files into Python scripts.
struct ConvUtil {
    /// Lists every entry found directly inside `note_path` (non-recursive).
    ///
    /// @param note_path Directory to scan.
    /// @return One Notebook per directory entry, regardless of its extension.
    static std::vector<Notebook> get_all_note_books(std::string const& note_path);

    /// Converts each notebook whose path ends in ".json" to a script written
    /// to "../pybooks/<py_name>"; other entries are skipped.
    ///
    /// @param note_books Notebooks to convert.
    /// @return true once all notebooks have been processed.
    static bool conv_notebook_2_py(std::vector<Notebook> const& note_books);
};

#endif  // FREDRIC_ZEPPLIN_MIG_CONV_UTIL_H_
include/utils/zepplin_mig_cfg.h
// Include guard: identifiers that begin with an underscore followed by an
// upper-case letter are reserved for the implementation, so the guard name
// carries no leading underscore.
#ifndef FREDRIC_ZEPPLIN_MIG_CFG_H_
#define FREDRIC_ZEPPLIN_MIG_CFG_H_
#include <string>

/// Directory that holds the notebook JSON files to convert.
/// Declared here; the value is defined in the matching .cpp file.
extern std::string zepplin_js_path;

#endif  // FREDRIC_ZEPPLIN_MIG_CFG_H_
impl/utils/conv_util.cpp
#include "utils/conv_util.h"
#include "pystring/pystring.h"
#include "json/json.hpp"
#include "api_accuracy/utils/io_util.h"
#include <filesystem>
#include <iostream>
#include <sstream>
namespace fs = std::filesystem;
using json = nlohmann::json;
std::vector<Notebook> ConvUtil::get_all_note_books(std::string const& note_path) {
std::vector<Notebook> ret_books{};
for(auto const& entry: fs::directory_iterator(note_path)) {
std::string path_ = entry.path();
std::vector<std::string> path_list;
pystring::split(path_, path_list, "/");
std::vector<std::string> path_sep_list;
pystring::split(path_list[path_list.size()-1], path_sep_list, ".");
auto book_name = path_sep_list[0] + ".py";
ret_books.emplace_back(std::move(Notebook{path_, book_name}));
}
return ret_books;
}
/// Converts notebook JSON files into Python scripts.
///
/// For every notebook whose path ends in ".json" the file is parsed, and each
/// element of its "paragraphs" array that has a string "text" not starting
/// with "%sh" is emitted as a numbered "## Section N" block, with every
/// "%pyspark" occurrence removed. The result is written to
/// "../pybooks/<py_name>". Notebooks with any other extension are skipped.
///
/// @param note_books Notebooks to convert.
/// @return Always true; parse failures propagate as nlohmann::json exceptions.
bool ConvUtil::conv_notebook_2_py(std::vector<Notebook> const& note_books) {
    for (auto const& nb : note_books) {
        if (!pystring::endswith(nb.path, ".json")) {
            continue;
        }

        auto const nb_content = IOUtil::read_file(nb.path);
        // Deliberately non-const: operator[] on a mutable json object yields a
        // null value for a missing "paragraphs" key, which iterates as empty.
        auto nb_js = json::parse(nb_content);
        // Bind by reference; copying would deep-copy every paragraph.
        auto const& nb_paragraphs = nb_js["paragraphs"];

        std::stringstream ss;
        int section_no{0};
        for (auto const& nb_para : nb_paragraphs) {
            // Skip paragraphs without usable text instead of letting
            // get<std::string>() throw on a null or non-string value.
            if (!nb_para.contains("text") || !nb_para["text"].is_string()) {
                continue;
            }
            auto un_format_para = nb_para["text"].get<std::string>();
            if (pystring::startswith(un_format_para, "%sh")) {
                continue;  // "%sh" paragraphs are left out of the script.
            }
            un_format_para = pystring::replace(un_format_para, "%pyspark", "");
            ss << "## Section " << ++section_no << " : \r\n" << un_format_para << "\r\n\r\n";
        }

        std::string out_file_name = "../pybooks/" + nb.py_name;
        IOUtil::write_file(out_file_name, ss.str());
    }
    return true;
}
impl/utils/zepplin_mig_cfg.cpp
#include "utils/zepplin_mig_cfg.h"
// Directory scanned for notebook JSON files; the test passes it to
// ConvUtil::get_all_note_books(). It is a relative path.
std::string zepplin_js_path = "../nbooks";
test/conv_nb_2_py_test.cpp
#include "death_handler/death_handler.h"
#include "json/json.hpp"
#include <glog/logging.h>
#include "utils/conv_util.h"
#include "utils/zepplin_mig_cfg.h"
#include <gtest/gtest.h>
#include "df/df.h"
using json = nlohmann::json;
// Test-runner entry point: configures glog, then hands control to GoogleTest.
int main(int argc, char** argv) {
    // glog flags are assigned before InitGoogleLogging() is called.
    FLAGS_log_dir = "./";
    FLAGS_alsologtostderr = true;
    // Log severities INFO, WARNING, ERROR and FATAL have the values 0, 1, 2
    // and 3 respectively; 0 keeps every message.
    FLAGS_minloglevel = 0;

    // NOTE(review): presumably installs crash/signal handlers for the
    // lifetime of `dh` - confirm against death_handler.h.
    Debug::DeathHandler dh;

    google::InitGoogleLogging("./logs.log");
    testing::InitGoogleTest(&argc, argv);

    // The GoogleTest result becomes the process exit code.
    return RUN_ALL_TESTS();
}
// Runs the full pipeline: list the entries under zepplin_js_path, convert
// them, and expect the converter to report success.
GTEST_TEST(ConvNB2PyTests, NB2Py) {
    std::vector<Notebook> const note_books = ConvUtil::get_all_note_books(zepplin_js_path);
    bool const conv_res = ConvUtil::conv_notebook_2_py(note_books);
    ASSERT_TRUE(conv_res);
}
程序输出如下,
![](https://img.haomeiwen.com/i8982195/91e077227905d85b.png)
![](https://img.haomeiwen.com/i8982195/f3962831c91635e3.png)
![](https://img.haomeiwen.com/i8982195/e1cfe7c70739de40.png)
网友评论