我目前正在尝试使用一种简单的运动结构方法来重建相机姿势。思路如下:
cv::xfeatures2d::SURF::detectAndCompute()
cv::DescriptorMatcher::knnMatch()
cv::findEssentialMat()
cv::recoverPose()
作为输入数据,我使用了来自免费提供的New Tsukuba Stereo Dataset 左侧摄像头的一系列图片。这个数据集还包含一个带有真实姿势的 CSV 文件,我用它来缩放重建的姿势并绘制它们。
对于实现,我使用的是 OpenCV,我大致遵循 Epipolar Geometry Example。然而,我的重建过程中有些东西没有正确实施。我的结果到处都是,我不太确定这里出了什么问题:
白线是实际轨迹,红线是重建的轨迹。我创建了一个最小的代码示例来演示我的过程:
#include <cstdlib>
#include <iostream>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/calib3d.hpp>
#include <opencv2/viz.hpp>
#include <opencv2/xfeatures2d/nonfree.hpp>
namespace fs = std::filesystem;
constexpr double LOWE_THRESHOLD = 0.8;
constexpr double RANSAC_CONFIDENCE_LVL = 0.99;
constexpr double MAX_DIST_TO_EPIPOLAR = 3;
constexpr int MAX_RANSAC_ITERS = 500;
cv::Matx33d eulerToRotationMat(const cv::Vec3d& euler) {
cv::Matx33d Rx(1, 0, 0, 0, cos(euler[0]), -sin(euler[0]), 0, sin(euler[0]), cos(euler[0]));
cv::Matx33d Ry(cos(euler[1]), 0, sin(euler[1]),0, 1, 0, -sin(euler[1]), 0, cos(euler[1]));
cv::Matx33d Rz(cos(euler[2]), -sin(euler[2]), 0, sin(euler[2]),cos(euler[2]), 0, 0, 0, 1);
return Rz * Ry * Rx;
}
void loadData(const fs::path& rootDir, const std::size_t num, std::vector<cv::Mat>& pictures,
std::vector<cv::Affine3d>& poses) {
// load true poses
std::ifstream inputFile(rootDir / "groundtruth" / "camera_track.txt");
if (!inputFile.is_open())
throw std::runtime_error("Unable to open ground truth file");
else {
std::string line, cell;
for(int row = 0; std::getline(inputFile, line) && row < num; ++row) {
std::stringstream lineStream(line);
cv::Vec3d translation, eulerAngles;
for(int col = 0; std::getline(lineStream, cell, ',') && col < 6; ++col) {
if(!cell.empty()) {
if(col < 3) {
translation[col] = std::atof(cell.c_str());
} else {
eulerAngles[col - 3] = std::atof(cell.c_str()) * CV_PI / 180;
}
}
}
poses.push_back(cv::Affine3d(eulerToRotationMat(eulerAngles), translation));
}
}
// load pictures
char fileName[sizeof("L_00001.png")];
for(std::size_t i = 1; i <= num; ++i) {
std::snprintf(fileName, sizeof(fileName), "L_%05d.png", i);
pictures.push_back(cv::imread(rootDir / "illumination" / "fluorescent" / fileName, cv::IMREAD_GRAYSCALE));
}
}
cv::Affine3d reconstructPose(const cv::Mat& firstPic, const cv::Mat& secondPic, const cv::Matx33d& cameraMat, const double scale) {
// initialize data structures
std::vector<cv::KeyPoint> firstKeyPoints, secondKeyPoints;
cv::Mat firstDescriptors, secondDescriptors, inlierMask;
cv::Matx33d essentialMat, rotation;
cv::Vec3d translation;
std::vector<std::vector<cv::DMatch>> knnFeatureMatches;
std::vector<cv::Point2f> firstInlierPts, secondInlierPts;
// initialize algorithms
cv::Ptr<cv::xfeatures2d::SURF> detector = cv::xfeatures2d::SURF::create(400);
cv::Ptr<cv::DescriptorMatcher> matcher = cv::DescriptorMatcher::create(cv::DescriptorMatcher::FLANNBASED);
// compute features
detector->detectAndCompute(firstPic, cv::noArray(), firstKeyPoints, firstDescriptors);
detector->detectAndCompute(secondPic, cv::noArray(), secondKeyPoints, secondDescriptors);
// find matching features
matcher->knnMatch(firstDescriptors, secondDescriptors, knnFeatureMatches, 2);
for(std::size_t i = 0; i < knnFeatureMatches.size(); ++i) {
if(knnFeatureMatches[i][0].distance < LOWE_THRESHOLD * knnFeatureMatches[i][1].distance) {
const cv::DMatch& m = knnFeatureMatches[i][0];
firstInlierPts.push_back(firstKeyPoints[m.queryIdx].pt);
secondInlierPts.push_back(secondKeyPoints[m.trainIdx].pt);
}
}
// estimate essential matrix
essentialMat = cv::findEssentialMat(firstInlierPts, secondInlierPts, cameraMat, cv::RANSAC,
RANSAC_CONFIDENCE_LVL, MAX_DIST_TO_EPIPOLAR, MAX_RANSAC_ITERS, inlierMask);
// estimate pose from the essential matrix
cv::recoverPose(essentialMat, firstInlierPts, secondInlierPts, cameraMat, rotation, translation, inlierMask);
// apply scaling to translation
translation *= scale;
// DEBUG CODE (see below)
if (numPoints > 5) {
return cv::Affine3d(rotation, translation);
} else {
return cv::Affine3d(cv::Matx33d::eye(), cv::Vec3d::zeros());
}
}
int main(int argc, char* argv[]) {
// using the tsukuba stereo dataset
const fs::path dataSet = "NewTsukubaStereoDataset";
const std::size_t numberOfPictures = 90;
// loading the data
std::vector<cv::Mat> pictures;
std::vector<cv::Affine3d> groundTruth;
loadData(dataSet, numberOfPictures, pictures, groundTruth);
// reconstruct poses
cv::Matx33d cameraMat(615, 0, 640, 0, 615, 480, 0, 0, 1);
std::vector<cv::Affine3d> reconstructed;
for(std::size_t i = 1; i < numberOfPictures; ++i) {
const double trueScale = cv::norm(groundTruth[i].translation());
reconstructed.push_back(reconstructPose(pictures[0], pictures[i], cameraMat, trueScale));
}
// visualize results
cv::viz::Viz3d window = cv::viz::getWindowByName("3D Visualization");
window.setBackgroundColor();
window.showWidget("CoordinateSystem", cv::viz::WCoordinateSystem());
window.showWidget("GroundTruth", cv::viz::WTrajectory(groundTruth, cv::viz::WTrajectory::PATH));
cv::viz::WTrajectory trj(reconstructed, cv::viz::WTrajectory::PATH);
trj.setColor(cv::viz::Color::red());
window.showWidget("CameraTrajectory", trj);
window.spin();
return EXIT_SUCCESS;
}
在 Linux 上,我这样编译这段代码:
g++ $(pkg-config --libs --cflags opencv4) -std=c++17 e_epipolar_example.cpp -o runnable
然后在与解压数据集相同的目录中执行runnable。我已经通过并排绘制图像对并在其上绘制匹配特征来在每一帧显示结果来测试特征检测和特征匹配过程:
cv::Affine3d reconstructPose(const cv::Mat& firstPic, const cv::Mat& secondPic, const cv::Matx33d& cameraMat, const double scale) {
...
//DEBUG
cv::Mat combinedPic = cv::Mat::zeros(firstPic.rows, 2 * firstPic.cols, CV_8UC1);
firstPic.copyTo(combinedPic.colRange(0, firstPic.cols));
secondPic.copyTo(combinedPic.colRange(firstPic.cols, combinedPic.cols));
std::vector<cv::Mat> layers;
for(std::size_t i = 0; i < 3; ++i) {
layers.push_back(combinedPic);
}
cv::Mat dbgPic;
cv::merge(layers, dbgPic);
for(std::size_t i = 0, j = 0; i < firstInlierPts.size() && j < 25; ++i) {
if(inlierMask.at<bool>(i)) {
++j;
cv::Point2f shiftedPt = secondInlierPts[i] + cv::Point2f(firstPic.cols, 0);
cv::circle(dbgPic, firstInlierPts[i], 5, {255, 255, 0});
cv::circle(dbgPic, shiftedPt, 5, {0, 255, 255});
cv::line(dbgPic, firstInlierPts[i], shiftedPt, {255, 255, 255});
}
}
cv::imshow("Debug", dbgPic);
cv::waitKey();
if (numPoints > 5) {
return cv::Affine3d(rotation, translation);
} else {
return cv::Affine3d(cv::Matx33d::eye(), cv::Vec3d::zeros());
}
}
结果图片显示了
cv::recoverPose()
函数的所有内点,这意味着,在大多数帧中匹配至少5个关键点的帧之间存在有效转换(我这里只显示最多25个内点以保持清洁,但平均有 600-900 个):
这个结果让我感到困惑,因为 3D 场景清楚地显示了无意义的姿势,即使
cv::recoverPose()
函数能够恢复具有数百个内点的姿势。有谁知道为什么我在 3D 视图中显示的姿势如此错误?