有人能建议一种更快的方法，在下面这个函数中完成矩阵与向量的乘法吗？
/// @brief Computes the matrix-vector product `result = matrix * vector`.
///
/// Each output element is the dot product of one matrix row with `vector`.
/// Rows are independent of each other, so the outer loop carries an OpenMP
/// `parallel for` directive; when the code is built without OpenMP support
/// the pragma has no effect and the loop simply runs serially.
///
/// @param matrix  Matrix stored as a vector of rows. Every row must hold at
///                least `vector.size()` elements (not checked).
/// @param vector  Right-hand operand; its size is the number of columns used.
/// @param result  Output. Resized to `matrix.size()` and fully overwritten,
///                so any previous contents are discarded. Must not be the
///                same object as `vector`.
inline void multiply(
    const std::vector< std::vector<double> > &matrix,
    const std::vector<double> &vector,
    std::vector<double> &result
){
    // Signed int indices keep the loop in the canonical form accepted by
    // every OpenMP version.
    const int rows = static_cast<int>(matrix.size());
    const int cols = static_cast<int>(vector.size());

    // resize() keeps the values of already-existing elements, so each output
    // slot is assigned below rather than accumulated into.
    result.resize(rows);

    // Hoist the raw buffers once so the inner loop does plain pointer reads.
    const double *const rhs = vector.data();
    double *const out = result.data();

    #pragma omp parallel for
    for(int i = 0; i < rows; ++i){
        const double *const row = matrix[i].data();

        // Accumulate in a thread-private local: one store per row instead of
        // a read-modify-write of shared memory on every step.
        double sum = 0.0;
        for(int j = 0; j < cols; ++j){
            sum += row[j] * rhs[j];
        }
        out[i] = sum;
    }
}
程序运行期间，此函数会被多次调用，因此它对总计算时间有非常关键的影响。
根据您的硬件,使用GPU并行化(例如:CUDA)可能会有很大帮助。