我正在尝试利用多线程来优化 C++ 代码的运行时间。我尝试了几种不同的解决方案。
我有这个使用 boost 的代码:
#include <iostream>
#include <vector>
#include <chrono>
#include <atomic>
#include <thread>
#include <boost/asio.hpp>
#include <boost/bind/bind.hpp>
// One unit of work for the Boost-based benchmark: builds a random 3x3 start
// state, hands it to the engine to solve, then bumps the solved-games counter.
void Test::boost_worker_task() {
    char board[3][3];

    MyEngine::random_start_state(board);
    MyEngine::solve_game(board);

    ++games_solved_counter;
}
// Benchmarks solving `games_to_solve` games and prints the solved count plus
// the elapsed wall-clock time in seconds and in milliseconds.
// `use_mul_thread` selects std::thread::hardware_concurrency() as the pool
// size; otherwise the pool is created with a single thread.
void Test::run(const unsigned int games_to_solve, const bool use_mul_thread) {
MyEngine::init_rand();
const auto start = std::chrono::high_resolution_clock::now();
const unsigned int num_threads = use_mul_thread ? std::thread::hardware_concurrency() : 1;
std::cout << "Using " << num_threads << " threads to solve " << games_to_solve << " games" << std::endl;
boost::asio::io_service io_service;
boost::asio::thread_pool pool(num_threads);
// NOTE(review): every task is posted to `io_service`, not to `pool`.
for (unsigned int i = 0; i < games_to_solve; ++i) {
io_service.post([] { return Test::boost_worker_task(); });
}
// Run and wait for all tasks to complete
// NOTE(review): run() is called from this thread only, so presumably all
// posted handlers execute here one after another - confirm against the
// Boost.Asio documentation.
io_service.run();
// `pool` was never handed any work in this function; this only joins it.
pool.join();
const auto end = std::chrono::high_resolution_clock::now();
// `elapsed` is a double count of milliseconds; dividing by 1000 prints seconds.
const std::chrono::duration<double, std::milli> elapsed = end - start;
std::cout << "Solved " << games_solved_counter << " games!" << std::endl;
std::cout << "Elapsed time: " << elapsed.count() / 1000.0 << " seconds" << std::endl;
std::cout << "Elapsed time: " << elapsed.count() << " milliseconds\n" << std::endl;
}
还有这段代码:
#include <iostream>
#include <vector>
#include <thread>
#include <mutex>
#include <chrono>
#include <atomic>
#include <future>
// Number of games solved so far; incremented by the worker tasks.
std::atomic<int> games_solved{0};
// Solves `num_iterations` randomly generated games on the calling thread and
// counts each one in `games_solved`.
// `games_solved_mutex` is accepted for the caller's benefit but is not touched
// in this body.
void TestMulThread::worker_task(const unsigned int num_iterations, std::mutex& games_solved_mutex) {
    for (unsigned int left = num_iterations; left != 0; --left) {
        char board[3][3];

        MyEngine::random_start_state(board);
        MyEngine::solve_game(board);

        ++games_solved;
    }
}
// Solves `total_games_to_solve` games by splitting them as evenly as possible
// across one std::async task per hardware thread, then prints the solved count
// and the elapsed wall-clock time in seconds and milliseconds.
void TestMulThread::run(const unsigned int total_games_to_solve) {
    MyEngine::init_rand();
    const auto start_time = std::chrono::high_resolution_clock::now();

    // hardware_concurrency() is allowed to return 0 when the value is not
    // computable; fall back to one worker so the divisions below stay defined.
    const unsigned int detected_threads = std::thread::hardware_concurrency();
    const unsigned int num_threads = detected_threads != 0 ? detected_threads : 1;

    const unsigned int games_per_thread = total_games_to_solve / num_threads;
    // The leftover is the remainder of the division by the thread count.
    // (Taking it modulo games_per_thread loses games, e.g. 10 games on 4
    // threads, and divides by zero when there are fewer games than threads.)
    const unsigned int remaining_games = total_games_to_solve % num_threads;
    std::cout << "Using " << num_threads << " threads to solve " << total_games_to_solve << " games" << std::endl;

    // Distribute the remaining games: the first `remaining_games` workers take one extra.
    std::vector<unsigned int> games_for_each_thread(num_threads, games_per_thread);
    for (unsigned int i = 0; i < remaining_games; ++i) {
        ++games_for_each_thread[i];
    }

    std::vector<std::future<void>> futures;
    futures.reserve(num_threads);
    std::mutex games_solved_mutex;
    for (const unsigned int games : games_for_each_thread) {
        futures.push_back(std::async(std::launch::async, worker_task, games, std::ref(games_solved_mutex)));
    }
    // get() blocks until the worker finishes and rethrows anything it threw.
    for (auto& future : futures) {
        future.get();
    }

    const auto end_time = std::chrono::high_resolution_clock::now();
    const auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();
    std::cout << "Solved " << games_solved << " games!" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time / 1000.0 << " seconds" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time << " milliseconds\n" << std::endl;
}
我的问题有两个:
boost 版本的运行速度比第二段代码慢得多,甚至比只用一个简单 for 循环、完全不使用多线程的版本还要慢。我知道并行运行任务会带来各种开销,但第二段代码运行得非常快,我想了解其中的原因。
使用 Visual Studio 时,第二段代码运行得非常快(使用 Release 配置并开启 -O2 优化标志编译),但当我在 Linux 机器上用 g++ 编译并运行相同的代码时,它又比用 for 循环求解相同数量的游戏还要慢。我尝试过几种不同的编译设置,例如:
g++ -O2 -o test Test.cpp -std=c++20 -lpthread
g++ -O2 -o test Test.cpp -std=c++20 -pthread
对于为什么会出现这种情况有什么想法吗?
谢谢!
第一个程序从未真正使用线程池——除了对它调用 join 之外。
您没有查看/显示 `hardware_concurrency` 的值,因此无法确定自己是否在做同类比较(苹果对苹果)。
只要至少修复第一个程序,两者的性能就相当:
文件
test.h
#pragma once
#include <atomic>
// Declarations for the benchmark that runs one task per game.
// The functions are defined in a .cpp file, so they are declared with external
// linkage: `static` in a header would give every including translation unit
// its own undefined copy of each function and its own private counter.
namespace Test {
    // Solves one game and increments games_solved_counter.
    void boost_worker_task();
    // Runs the benchmark: (number of games, whether to use multiple threads).
    void run(unsigned, bool);
    // Games solved so far; `inline` gives a single definition shared by all includers.
    inline std::atomic_uint games_solved_counter{0};
} // namespace Test
// Declarations for the benchmark that gives each worker a batch of games.
// The functions are defined in a .cpp file, so they are declared with external
// linkage: `static` in a header would give every including translation unit
// its own undefined copy of each function and its own private counter.
namespace TestMulThread {
    // Solves the given number of games on the calling thread.
    void worker_task(unsigned);
    // Runs the benchmark for the given total number of games.
    void run(unsigned);
    // Games solved so far; `inline` gives a single definition shared by all includers.
    inline std::atomic_uint games_solved{0};
} // namespace TestMulThread
// Interface of the game engine used by both benchmarks. All members are static.
struct MyEngine {
    // A game position: a 3x3 grid of chars.
    using State = char[3][3];

    // Called before solving starts.
    static void init_rand();

    // Receives the state to be filled with a starting position.
    static void random_start_state(State& state);

    // Receives the state of the game to solve.
    static void solve_game(State& state);
};
文件
test.cpp
#include "test.h"
#include <boost/asio.hpp>
#include <chrono>
#include <future>
#include <iostream>
#include <random>
#include <thread>
#include <vector>
using namespace std::chrono_literals;

// Shorthand for reading the high-resolution clock: `now()` yields a time_point.
// Written as a lambda rather than a pointer to the library function, because
// standard-library functions are not guaranteed to be addressable.
static constexpr auto now = [] { return std::chrono::high_resolution_clock::now(); };
// Stand-in implementation: deliberately does nothing.
void MyEngine::init_rand() {
}
// Stand-in implementation: leaves the passed state untouched.
void MyEngine::random_start_state(State& /*state*/) {
}
// Stand-in implementation: simulates the cost of solving a game by sleeping
// for a random whole number of milliseconds in [3, 8]. The state is ignored.
void MyEngine::solve_game(State& /*state*/) {
    // One generator and distribution per thread, so workers never share RNG state.
    thread_local std::mt19937 generator{std::random_device{}()};
    thread_local std::uniform_int_distribution<int> delay_ms(3, 8);

    const std::chrono::milliseconds pause{delay_ms(generator)};
    std::this_thread::sleep_for(pause);
}
void TestMulThread::worker_task(unsigned num_iterations) {
MyEngine::init_rand();
for (unsigned int i = 0; i < num_iterations; ++i) {
char new_state[3][3];
MyEngine::random_start_state(new_state);
MyEngine::solve_game(new_state);
++games_solved;
}
}
// Solves `total_games_to_solve` games by splitting them as evenly as possible
// across one std::async task per hardware thread, then prints the solved count
// and the elapsed wall-clock time in seconds and milliseconds.
void TestMulThread::run(unsigned total_games_to_solve) {
    auto const start_time = now();

    // hardware_concurrency() is allowed to return 0 when the value is not
    // computable; fall back to one worker so the divisions below stay defined.
    unsigned const detected_threads = std::thread::hardware_concurrency();
    unsigned const num_threads = detected_threads != 0 ? detected_threads : 1;

    unsigned const games_per_thread = total_games_to_solve / num_threads;
    // The leftover is the remainder of the division by the thread count.
    // (Taking it modulo games_per_thread loses games, e.g. 10 games on 4
    // threads, and divides by zero when there are fewer games than threads.)
    unsigned const remaining_games = total_games_to_solve % num_threads;
    std::cout << "Using " << num_threads << " threads to solve " << total_games_to_solve << " games"
              << std::endl;

    // Distribute the remaining games: the first `remaining_games` workers take one extra.
    std::vector<unsigned> games_for_each_thread(num_threads, games_per_thread);
    for (unsigned i = 0; i < remaining_games; ++i)
        ++games_for_each_thread[i];

    std::vector<std::future<void>> futures;
    futures.reserve(num_threads);
    for (unsigned const games : games_for_each_thread)
        futures.push_back(std::async(std::launch::async, worker_task, games));

    // get() blocks until the worker finishes and rethrows anything it threw.
    for (auto& future : futures)
        future.get();

    auto const elapsed_time = now() - start_time;
    std::cout << "Solved " << games_solved << " games!" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time / 1.s << " seconds" << std::endl;
    std::cout << "Elapsed time: " << elapsed_time / 1ms << " milliseconds\n" << std::endl;
}
void Test::boost_worker_task() {
char new_state[3][3];
MyEngine::random_start_state(new_state);
MyEngine::solve_game(new_state);
++games_solved_counter;
}
// Solves `games_to_solve` games by posting one task per game to a Boost.Asio
// thread_pool, waits for the pool to drain, then prints the solved count and
// the elapsed wall-clock time in seconds and milliseconds.
// `threaded` selects one pool thread per hardware thread; otherwise one thread.
void Test::run(unsigned games_to_solve, bool threaded) {
    auto const start = now();

    // hardware_concurrency() is allowed to return 0 when the value is not
    // computable; a zero-sized pool would leave the posted tasks without a
    // worker thread, so fall back to a single thread in that case.
    unsigned const detected_threads = std::thread::hardware_concurrency();
    unsigned const num_threads = (threaded && detected_threads != 0) ? detected_threads : 1;
    std::cout << "Using " << num_threads << " threads to solve " << games_to_solve << " games" << std::endl;

    boost::asio::thread_pool pool(num_threads);
    // Tasks go directly to the pool, so its worker threads pick them up.
    for (unsigned i = 0; i < games_to_solve; ++i)
        boost::asio::post(pool, Test::boost_worker_task);

    // Run and wait for all tasks to complete
    pool.join();

    auto const elapsed = now() - start;
    std::cout << "Solved " << games_solved_counter << " games!" << std::endl;
    std::cout << "Elapsed time: " << elapsed / 1.s << " seconds" << std::endl;
    std::cout << "Elapsed time: " << elapsed / 1ms << " milliseconds\n" << std::endl;
}
// Runs both benchmarks back to back with the same number of games.
int main() {
    constexpr unsigned game_count = 10'000u;

    Test::run(game_count, true);
    TestMulThread::run(game_count);
}
在线运行的输出:
Using 4 threads to solve 10000 games
Solved 10000 games!
Elapsed time: 14.04 seconds
Elapsed time: 14039 milliseconds
Using 4 threads to solve 10000 games
Solved 10000 games!
Elapsed time: 14.1067 seconds
Elapsed time: 14106 milliseconds
在我本地机器上的输出:
Using 8 threads to solve 10000 games
Solved 10000 games!
Elapsed time: 6.99751 seconds
Elapsed time: 6997 milliseconds
Using 8 threads to solve 10000 games
Solved 10000 games!
Elapsed time: 7.03147 seconds
Elapsed time: 7031 milliseconds