本例来自《C++性能优化指南》一书,对C++ 11常用的并发原语做了简单测试。
其实本例子从性能来讲意义不大,但是对于了解C++ 11都有哪些并发原语有一些意义。
因为不同的并发原语适用于不同的场景,强行对API做性能测试没有多大意义。
本例测量了在不同线程数下计算fibonacci(30)的效率。
对比了std::atomic变量和普通变量赋值的巨大性能差异。
本例还包含一个基于condition_variable的简单生产者/消费者模式示例。
对于多线程程序如何做性能测试有一定的启发意义。
但是现实业务中的场景往往比这些简单场景要复杂得多。
程序代码如下,
conanfile.txt
# Conan dependency manifest for this example: one required package and one generator.
[requires]
boost/1.72.0
# NOTE(review): presumably the `cmake` generator is what emits the
# conanbuildinfo.cmake file that CMakeLists.txt includes - confirm.
[generators]
cmake
CMakeLists.txt
# Build script for the examples: every *.cpp in the project root becomes its
# own executable, and each one is linked together with all sources in impl/.

# CMAKE_CXX_STANDARD 17 is only understood by CMake 3.8 and newer.
cmake_minimum_required(VERSION 3.8)
project(optimize)

set(ENV{PKG_CONFIG_PATH} "$ENV{PKG_CONFIG_PATH}:/usr/local/lib/pkgconfig/")

# Append -pthread instead of discarding flags supplied by the user/toolchain.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# -g is a compiler option, not a preprocessor definition.
# No optimization level is requested here, so timings come from whatever the
# selected build type / compiler default is.
add_compile_options(-g)

include(${CMAKE_BINARY_DIR}/conanbuildinfo.cmake)
conan_basic_setup()

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
link_directories(${LINK_DIRS})

file(GLOB main_file_list ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
file(GLOB source_file_list ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp)

# One executable per top-level source file, named after the file without ".cpp".
foreach(main_file ${main_file_list})
  file(RELATIVE_PATH filename ${CMAKE_CURRENT_SOURCE_DIR} ${main_file})
  string(REPLACE ".cpp" "" file ${filename})
  add_executable(${file} ${main_file} ${source_file_list})
  target_link_libraries(${file} ${CONAN_LIBS} pthread atomic)
endforeach()
concurrency_test.cpp
#include "test_driver.h"
#include <iostream>
// Test entry point; its definition lives in concurrency_impl.cpp.
int test_atomic(int, unsigned long);

// Table of test entry points passed to test_driver() from main().
// NOTE(review): the trailing null entry presumably marks the end of the
// table for test_driver - confirm against test_driver.h.
int (*func[])(int, unsigned long) = {
    test_atomic,
    nullptr
};
// Program entry point: forwards the test table `func` and the raw
// command-line arguments to test_driver() (presumably declared in
// test_driver.h, the only project header included here).
// The process always exits with EXIT_SUCCESS; nothing test_driver may
// return is inspected.
// NOTE(review): EXIT_SUCCESS comes from <cstdlib>, which this file does not
// include directly - it is presumably pulled in transitively.
int main(int argc, char* argv[]) {
test_driver(func, argc, argv);
return EXIT_SUCCESS;
}
concurrency_impl.cpp
#include "stopwatch11.h"
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
// Integer type used for the loop counters and counted values in the timing tests.
using counter_t = unsigned long long;
bool atomic_test_1() {
struct foo {
int i;
int j;
int k;
int l;
};
std::atomic<foo> f;
foo f1 {2, 3, 4, 5};
f.store(f1);
foo f2 = f.load();
return (f2.i == 2 && f2.j == 3 && f2.k == 4 && f2.l == 5 && f.is_lock_free() == false);
}
// Thread body used by thread_example(): writes "thread <n>" to std::cout,
// then a newline and a flush (std::endl).
void f1(int n) {
    std::cout << "thread ";
    std::cout << n;
    std::cout << std::endl;
}
void thread_example() {
std::thread t1;
t1 = std::thread(f1, 1);
t1.join();
std::thread t2(f1, 2);
std::thread t3(std::move(t2));
std::thread t4([](){
return;
});
t4.detach();
t3.join();
}
// Returns a value from another thread through std::promise / std::future
// and prints it as "the meaning of life: 42".
void promise_future_example_1() {
    auto meaning = [](std::promise<int>& prom) {
        prom.set_value(42);
    };

    std::promise<int> prom;
    std::future<int> result = prom.get_future();

    // The worker only holds a reference to `prom`, which lives in this stack
    // frame. It is joined rather than detached so it can never still be
    // inside set_value() when `prom` is destroyed on return.
    std::thread worker(meaning, std::ref(prom));

    std::cout << "the meaning of life: " << result.get() << "\n";
    worker.join();
}
// Returns a value from another thread through std::packaged_task: the task
// is moved into a thread, and its future delivers the result that is printed.
void promise_future_example_2() {
    std::packaged_task<int(int)> task([](int value) { return value; });
    std::future<int> answer = task.get_future();

    // packaged_task is move-only, so it has to be moved into the thread.
    std::thread runner(std::move(task), 42);

    std::cout << "the meaning of life: " << answer.get() << "\n";
    runner.join();
}
// Returns a value through std::async called without a launch policy, which
// behaves like std::launch::async | std::launch::deferred: the implementation
// picks whether the callable runs on another thread or lazily inside get().
void promise_future_example_3() {
    auto echo = [](int value) { return value; };
    std::future<int> answer = std::async(std::move(echo), 42);
    const int life = answer.get();
    std::cout << "the meaning of life: " << life << "\n";
}
// Times three ways of launching and finishing a thread that does nothing,
// 10 * multiplier launches per variant. Each variant sits in its own scope
// so that its StopWatch covers exactly that loop. Always returns true.
bool thread_start_stop_test(unsigned long multiplier) {
    const unsigned long rounds = 10 * multiplier;
    const auto noop = [] {};

    std::thread worker;
    {
        // Variant 1: move-assign into a named thread object, then join it.
        StopWatch timer("thread start/stop");
        for (unsigned round = 0; round < rounds; ++round) {
            worker = std::thread(noop);
            worker.join();
        }
    }
    {
        // Variant 2: join the temporary thread in the same expression.
        StopWatch timer("thread start/stop 2");
        for (unsigned round = 0; round < rounds; ++round) {
            std::thread(noop).join();
        }
    }
    {
        // Variant 3: detach the temporary thread instead of waiting for it.
        StopWatch timer("thread start/detach");
        for (unsigned round = 0; round < rounds; ++round) {
            std::thread(noop).detach();
        }
    }
    return true;
}
// Times creating std::async tasks under each explicit launch policy,
// 10 * multiplier tasks per policy. Always returns true.
bool async_startstop_test(unsigned long multiplier) {
    const unsigned long rounds = 10 * multiplier;
    const auto noop = [] {};

    {
        // The returned future is a temporary; for std::launch::async its
        // destructor waits for the task, so every iteration covers a full
        // start + finish.
        StopWatch timer("async task start/stop");
        for (unsigned round = 0; round < rounds; ++round) {
            std::async(std::launch::async, noop);
        }
    }
    {
        // A deferred task only runs when get()/wait() is called on its
        // future. Nothing asks for the result here, so this loop times task
        // creation and destruction only.
        StopWatch timer("deferred async task start/stop");
        for (unsigned round = 0; round < rounds; ++round) {
            std::async(std::launch::deferred, noop);
        }
    }
    return true;
}
void time_waster(unsigned iterations) {
unsigned fibonacci(unsigned);
for(counter_t i=0; i<iterations; ++i) {
fibonacci(30);
}
}
// Deliberately naive, exponential-time recursive Fibonacci. It exists to burn
// CPU for the benchmarks, so it must not be "optimized".
// Returns 1 for every i <= 2 (including 0), otherwise fib(i-1) + fib(i-2).
unsigned fibonacci(unsigned i) {
    if (i <= 2) {
        return 1;
    }
    return fibonacci(i - 1) + fibonacci(i - 2);
}
// Runs `iterations` rounds of time_waster work spread over `threads` threads
// and waits for all of them.
//
// The total amount of work is always exactly `iterations`: each thread gets
// iterations / threads rounds and the first iterations % threads threads get
// one extra. A plain truncating division would drop the remainder, and would
// give every thread zero work whenever threads > iterations, so runs with
// different thread counts would not be comparable.
//
// With threads == 0 nothing is started and the function returns immediately.
void multithreaded_timewaster(unsigned iterations, unsigned threads) {
    if (threads == 0) {
        return;  // nothing to run on; also keeps the division below defined
    }

    const unsigned base_share = iterations / threads;
    const unsigned leftover = iterations % threads;

    std::vector<std::thread> workers;
    workers.reserve(threads);
    for (unsigned i = 0; i < threads; ++i) {
        const unsigned share = base_share + (i < leftover ? 1u : 0u);
        workers.emplace_back(time_waster, share);
    }
    for (std::thread& worker : workers) {
        worker.join();
    }
}
// Times the same fibonacci workload first on the calling thread and then via
// multithreaded_timewaster() with increasing thread counts. Each measurement
// has its own StopWatch whose scope covers exactly one run.
// Always returns true.
bool multi_thread_test(unsigned long multiplier) {
    {
        StopWatch sw("unthreaded test");
        time_waster(multiplier);
    }

    struct Run {
        const char* label;
        unsigned threads;
    };
    // TODO: the 100- and 1000-thread runs may need to be removed on machines
    // that are not powerful enough for them.
    const Run runs[] = {
        {"1 thread", 1},
        {"2 threads", 2},
        {"3 threads", 3},
        {"4 threads", 4},
        {"5 threads", 5},
        {"10 threads", 10},
        {"100 threads", 100},
        {"1000 threads", 1000},
    };

    for (const Run& run : runs) {
        StopWatch sw(run.label);
        multithreaded_timewaster(multiplier, run.threads);
    }
    return true;
}
// Producer/consumer hand-off built on one mutex and one condition_variable.
//
// The producer copies an increasing `counter` into `shared_data` whenever the
// slot is empty (0); the consumer empties the slot whenever it is non-zero.
// After roughly one second the main thread sets `terminate` under the mutex
// and wakes both threads so they leave their loops.
//
// The value printed at the end is the producer's `counter`, i.e. the number
// of the item it last produced or was about to produce.
void cv_example() {
    std::mutex m;
    std::condition_variable cv;
    bool terminate = false;
    int shared_data = 0;
    int counter = 0;

    // Both lambdas hold the mutex for their whole lifetime except while
    // blocked inside cv.wait(), which releases it for the duration of the wait.
    auto consumer = [&]() {
        std::unique_lock<std::mutex> lk(m);
        while (true) {
            cv.wait(lk, [&]() {
                return terminate || shared_data != 0;
            });
            if (terminate) {
                break;
            }
            shared_data = 0;   // consume the item
            cv.notify_one();   // tell the producer the slot is free again
        }
    };
    auto producer = [&]() {
        std::unique_lock<std::mutex> lk(m);
        for (counter = 1; true; ++counter) {
            cv.wait(lk, [&]() {
                return terminate || shared_data == 0;
            });
            if (terminate) {
                break;
            }
            shared_data = counter;  // publish the next item
            cv.notify_one();        // tell the consumer an item is ready
        }
    };

    auto p = std::thread(producer);
    auto c = std::thread(consumer);

    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
    {
        std::lock_guard<std::mutex> l(m);
        terminate = true;
    }
    cv.notify_all();
    p.join();
    c.join();

    // `counter` is written by the producer thread, so it is read only after
    // both threads have been joined; reading it earlier without the mutex
    // would be unsynchronized.
    std::cout << "Total items consumed " << counter << std::endl;
}
// Test entry point referenced from the `func` table in concurrency_test.cpp.
//
// test_no selects what runs:
//   0      returns 4 (cases 1..4 exist below; presumably the test count the
//          driver asks for - confirm against test_driver)
//   1      functional checks: atomic properties, fibonacci values, and the
//          thread / condition_variable / future examples
//   2      timed loop of `multiplier` fibonacci(30) evaluations
//   3      timed stores to a std::atomic<counter_t> and to a plain counter_t,
//          10'000'000 * multiplier stores each
//   4      thread and async start/stop timings plus the multi-thread
//          fibonacci timing
//   other  returns -1
//
// For 1..4 the result is 1 when every check passed and 0 otherwise.
int test_atomic(int test_no, unsigned long multiplier) {
    bool rc = true;
    switch (test_no) {
    default:
        return -1;
    case 0:
        return 4;
    case 1: {
        {
            // Properties of atomic variables.
            std::atomic<counter_t> x{0};
            rc &= x.is_lock_free();
            rc &= atomic_test_1();
        }
        {
            // Sanity-check the fibonacci implementation.
            rc &= (fibonacci(1) == 1);
            rc &= (fibonacci(2) == 1);
            rc &= (fibonacci(3) == 2);
            rc &= (fibonacci(4) == 3);
            rc &= (fibonacci(5) == 5);
            rc &= (fibonacci(6) == 8);
            rc &= (fibonacci(7) == 13);
            rc &= (fibonacci(40) > 100000000);
        }
        {
            // Ways of constructing threads.
            thread_example();
            // Condition-variable usage.
            cv_example();
            promise_future_example_1();
            promise_future_example_2();
            promise_future_example_3();
        }
    } break;
    case 2: {
        // NOTE(review): volatile presumably keeps the compiler from folding
        // fibonacci(30) into a constant - confirm intent.
        volatile unsigned n = 30;
        {
            // fibonacci(30) < 2 ^ 30
            StopWatch sw("fibonacci test");
            for (counter_t i = 0, iterations = multiplier; i < iterations; ++i) {
                rc &= (fibonacci(n) < (1u << n));
            }
        }
    } break;
    case 3: {
        // Both counters start at 0 so that the final `x != 0` check reads a
        // defined value even when multiplier == 0 and the loop body never
        // runs (the check then fails and 0 is returned).
        {
            // Speed of assigning to a std::atomic variable.
            StopWatch sw("atomic test");
            std::atomic<counter_t> x{0};
            for (counter_t i = 0, iterations = 10'000'000 * multiplier; i < iterations; ++i) {
                x = i;
            }
            rc &= (x != 0);
        }
        {
            // Speed of assigning to a plain variable.
            StopWatch sw("non-atomic test");
            counter_t x = 0;
            for (counter_t i = 0, iterations = 10'000'000 * multiplier; i < iterations; ++i) {
                x = i;
            }
            rc &= (x != 0);
        }
    } break;
    case 4: {
        // Thread join/detach timings.
        rc &= thread_start_stop_test(multiplier);
        // std::async timings for std::launch::async and std::launch::deferred.
        rc &= async_startstop_test(multiplier);
        // Multi-threaded fibonacci timing.
        rc &= multi_thread_test(multiplier);
    } break;
    }
    return (rc) ? 1 : 0;
}
StopWatch相关源码可以查看之前的代码列表。
程序输出如下,
可以看到,原子变量(这里是lock free的原子变量,不是用锁模拟的)的赋值速度比非原子变量慢10倍以上。但是那又如何呢?该用还是得用。
image.png
对于计算fibonacci数列这种简单的任务,线程更多更占优势。
image.png
网友评论