在我的小型 3D 重建脚本中找不到错误,OpenCV 的 recoverPose() 方法产生了错误的结果 [关闭]

  • 加载一张初始图片,并使用 cv::xfeatures2d::SURF 计算其 SURF 特征
  • 加载 N 张额外的图片,并使用 cv::DescriptorMatcher(FLANN)将它们的特征与初始图片匹配
  • 使用 cv::findEssentialMat() 估计每个图片对的本质矩阵
  • 使用 cv::recoverPose() 从每个本质矩阵中恢复姿势,并使用地面实况数据缩放平移向量
  • 绘制重建轨迹和真实轨迹并进行比较
  • 绘制重建轨迹和真实轨迹并进行比较

作为输入数据,我使用了来自免费提供的New Tsukuba Stereo Dataset 左侧摄像头的一系列图片。这个数据集还包含一个带有真实姿势的 CSV 文件,我用它来缩放重建的姿势并绘制它们。

对于实现,我使用的是 OpenCV,并大致遵循 Epipolar Geometry Example。然而,我的重建过程中有些地方没有正确实现。我的结果杂乱无章,我不太确定这里出了什么问题:


#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <opencv2/calib3d.hpp>
#include <opencv2/core.hpp>
#include <opencv2/core/affine.hpp>
#include <opencv2/features2d.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/viz.hpp>
#include <opencv2/xfeatures2d/nonfree.hpp>

namespace fs = std::filesystem;

/// Ratio-test factor: a best match is kept only if its distance is below this
/// fraction of the second-best match's distance.
constexpr double LOWE_THRESHOLD{0.8};
/// Confidence level passed to the RANSAC essential-matrix estimation.
constexpr double RANSAC_CONFIDENCE_LVL{0.99};
/// RANSAC threshold passed to cv::findEssentialMat (distance to the epipolar line).
constexpr double MAX_DIST_TO_EPIPOLAR{3.0};
/// Upper bound on the number of RANSAC iterations.
constexpr int MAX_RANSAC_ITERS{500};

/// @brief Builds a rotation matrix from three Euler angles.
///
/// The result is the product Rz * Ry * Rx, where Rx, Ry and Rz are the
/// elementary rotations about the X, Y and Z axes respectively.
///
/// @param euler Angles in radians: euler[0] about X, euler[1] about Y,
///              euler[2] about Z.
/// @return The combined 3x3 rotation matrix.
cv::Matx33d eulerToRotationMat(const cv::Vec3d& euler) {
    // Evaluate each trigonometric term once instead of several times per matrix.
    const double cosX = std::cos(euler[0]);
    const double sinX = std::sin(euler[0]);
    const double cosY = std::cos(euler[1]);
    const double sinY = std::sin(euler[1]);
    const double cosZ = std::cos(euler[2]);
    const double sinZ = std::sin(euler[2]);

    const cv::Matx33d Rx(1, 0, 0,
                         0, cosX, -sinX,
                         0, sinX, cosX);
    const cv::Matx33d Ry(cosY, 0, sinY,
                         0, 1, 0,
                         -sinY, 0, cosY);
    const cv::Matx33d Rz(cosZ, -sinZ, 0,
                         sinZ, cosZ, 0,
                         0, 0, 1);
    return Rz * Ry * Rx;
}

/// @brief Loads ground-truth camera poses and the matching grayscale pictures.
///
/// Poses are read from `<rootDir>/groundtruth/camera_track.txt`, one pose per
/// line with comma-separated columns: the first three columns are stored as the
/// translation, the next three are Euler angles in degrees (converted to
/// radians here). Pictures are read from
/// `<rootDir>/illumination/fluorescent/L_00001.png`, `L_00002.png`, ...
///
/// @param rootDir  Root directory of the data set.
/// @param num      Maximum number of poses and exact number of pictures to load.
/// @param pictures Output: loaded pictures are appended here.
/// @param poses    Output: parsed poses are appended here (fewer than `num`
///                 if the ground-truth file is shorter).
/// @throws std::runtime_error if the ground-truth file cannot be opened or a
///         picture cannot be read.
void loadData(const fs::path& rootDir, const std::size_t num, std::vector<cv::Mat>& pictures,
              std::vector<cv::Affine3d>& poses) {
    // load true poses
    std::ifstream inputFile(rootDir / "groundtruth" / "camera_track.txt");
    if (!inputFile.is_open())
        throw std::runtime_error("Unable to open ground truth file");

    std::string line;
    // Check the row limit first so no extra line is consumed once `num` is reached.
    for (std::size_t row = 0; row < num && std::getline(inputFile, line); ++row) {
        std::stringstream lineStream(line);
        std::string cell;
        cv::Vec3d translation;
        cv::Vec3d eulerAngles;
        for (int col = 0; col < 6 && std::getline(lineStream, cell, ','); ++col) {
            if (cell.empty())
                continue;  // an empty cell leaves the component at zero
            const double value = std::atof(cell.c_str());
            if (col < 3) {
                translation[col] = value;
            } else {
                eulerAngles[col - 3] = value * CV_PI / 180;  // degrees -> radians
            }
        }
        poses.push_back(cv::Affine3d(eulerToRotationMat(eulerAngles), translation));
    }

    // load pictures
    pictures.reserve(pictures.size() + num);
    for (std::size_t i = 1; i <= num; ++i) {
        // Large enough for any std::size_t index; %zu matches the argument type.
        char fileName[32];
        std::snprintf(fileName, sizeof(fileName), "L_%05zu.png", i);
        const fs::path picturePath = rootDir / "illumination" / "fluorescent" / fileName;
        cv::Mat picture = cv::imread(picturePath.string(), cv::IMREAD_GRAYSCALE);
        if (picture.empty())
            throw std::runtime_error("Unable to read picture " + picturePath.string());
        pictures.push_back(picture);
    }
}

/// @brief Estimates the relative pose between two pictures from SURF matches.
///
/// SURF features are detected in both pictures, matched with a FLANN-based
/// matcher and filtered with the ratio test (LOWE_THRESHOLD). The essential
/// matrix is estimated with RANSAC and decomposed by cv::recoverPose().
///
/// Per the OpenCV documentation of cv::recoverPose(), the returned rotation and
/// translation perform the change of basis from the first camera's coordinate
/// system to the second camera's one; the translation has unit length before it
/// is multiplied by `scale`.
///
/// @param firstPic   First (reference) picture.
/// @param secondPic  Second picture.
/// @param cameraMat  3x3 camera intrinsic matrix shared by both pictures.
/// @param scale      Factor applied to the recovered translation.
/// @return The scaled transform, or the identity transform when too few
///         correspondences are available or the estimation fails.
cv::Affine3d reconstructPose(const cv::Mat& firstPic, const cv::Mat& secondPic, const cv::Matx33d& cameraMat, const double scale) {
    // The five-point algorithm needs at least five correspondences.
    constexpr std::size_t MIN_CORRESPONDENCES = 5;
    const cv::Affine3d noMotion(cv::Matx33d::eye(), cv::Vec3d::zeros());

    // initialize algorithms
    cv::Ptr<cv::xfeatures2d::SURF> detector = cv::xfeatures2d::SURF::create(400);
    cv::Ptr<cv::DescriptorMatcher> matcher = cv::DescriptorMatcher::create(cv::DescriptorMatcher::FLANNBASED);

    // compute features
    std::vector<cv::KeyPoint> firstKeyPoints;
    std::vector<cv::KeyPoint> secondKeyPoints;
    cv::Mat firstDescriptors;
    cv::Mat secondDescriptors;
    detector->detectAndCompute(firstPic, cv::noArray(), firstKeyPoints, firstDescriptors);
    detector->detectAndCompute(secondPic, cv::noArray(), secondKeyPoints, secondDescriptors);
    if (firstDescriptors.empty() || secondDescriptors.empty())
        return noMotion;  // nothing to match

    // find matching features
    std::vector<std::vector<cv::DMatch>> knnFeatureMatches;
    matcher->knnMatch(firstDescriptors, secondDescriptors, knnFeatureMatches, 2);

    std::vector<cv::Point2f> firstInlierPts;
    std::vector<cv::Point2f> secondInlierPts;
    for (const std::vector<cv::DMatch>& candidates : knnFeatureMatches) {
        // knnMatch may return fewer than two neighbours for a query descriptor.
        if (candidates.size() < 2)
            continue;
        if (candidates[0].distance < LOWE_THRESHOLD * candidates[1].distance) {
            const cv::DMatch& m = candidates[0];
            firstInlierPts.push_back(firstKeyPoints[m.queryIdx].pt);
            secondInlierPts.push_back(secondKeyPoints[m.trainIdx].pt);
        }
    }
    if (firstInlierPts.size() < MIN_CORRESPONDENCES)
        return noMotion;

    // estimate essential matrix
    cv::Mat inlierMask;
    const cv::Mat essentialMat = cv::findEssentialMat(firstInlierPts, secondInlierPts, cameraMat, cv::RANSAC,
                                                      RANSAC_CONFIDENCE_LVL, MAX_DIST_TO_EPIPOLAR, MAX_RANSAC_ITERS,
                                                      inlierMask);
    if (essentialMat.rows != 3 || essentialMat.cols != 3)
        return noMotion;  // estimation failed or was ambiguous

    // estimate pose from the essential matrix; the return value is the number
    // of inliers that pass the cheirality check
    cv::Matx33d rotation;
    cv::Vec3d translation;
    const int numPoints = cv::recoverPose(essentialMat, firstInlierPts, secondInlierPts, cameraMat, rotation,
                                          translation, inlierMask);
    // apply scaling to translation
    translation *= scale;

    // DEBUG CODE (see below)

    if (numPoints > 5) {
        return cv::Affine3d(rotation, translation);
    } else {
        return noMotion;
    }
}

int main(int argc, char* argv[]) {
    // using the tsukuba stereo dataset
    const fs::path dataSet = "NewTsukubaStereoDataset";
    const std::size_t numberOfPictures = 90;

    // loading the data
    std::vector<cv::Mat> pictures;
    std::vector<cv::Affine3d> groundTruth;
    loadData(dataSet, numberOfPictures, pictures, groundTruth);

    // reconstruct poses
    cv::Matx33d cameraMat(615, 0, 640, 0, 615, 480, 0, 0, 1);
    std::vector<cv::Affine3d> reconstructed;
    for(std::size_t i = 1; i < numberOfPictures; ++i) {
        const double trueScale = cv::norm(groundTruth[i].translation());
        reconstructed.push_back(reconstructPose(pictures[0], pictures[i], cameraMat, trueScale));

    // visualize results
    cv::viz::Viz3d window = cv::viz::getWindowByName("3D Visualization");
    window.showWidget("CoordinateSystem", cv::viz::WCoordinateSystem());
    window.showWidget("GroundTruth", cv::viz::WTrajectory(groundTruth, cv::viz::WTrajectory::PATH));
    cv::viz::WTrajectory trj(reconstructed, cv::viz::WTrajectory::PATH);
    window.showWidget("CameraTrajectory", trj);

    return EXIT_SUCCESS;

在 Linux 上,我这样编译这段代码:

g++ $(pkg-config --libs --cflags opencv4) -std=c++17 e_epipolar_example.cpp -o runnable


// Excerpt: the debug drawing that replaces the "DEBUG CODE (see below)" marker
// in the full reconstructPose() listing earlier in this post. It relies on that
// listing's locals: firstInlierPts, secondInlierPts, inlierMask, rotation,
// translation and numPoints (the value returned by cv::recoverPose()).
cv::Affine3d reconstructPose(const cv::Mat& firstPic, const cv::Mat& secondPic, const cv::Matx33d& cameraMat, const double scale) {
    // ... feature detection, matching and pose recovery as in the full listing ...

    // Place both pictures side by side in one single-channel canvas.
    cv::Mat combinedPic = cv::Mat::zeros(firstPic.rows, 2 * firstPic.cols, CV_8UC1);
    firstPic.copyTo(combinedPic.colRange(0, firstPic.cols));
    secondPic.copyTo(combinedPic.colRange(firstPic.cols, combinedPic.cols));

    // Replicate the gray canvas into three channels so coloured markers can be drawn.
    const std::vector<cv::Mat> layers(3, combinedPic);
    cv::Mat dbgPic;
    cv::merge(layers, dbgPic);

    // Draw at most 25 inlier correspondences; j counts the ones drawn so far.
    for (std::size_t i = 0, j = 0; i < firstInlierPts.size() && j < 25; ++i) {
        // The mask holds 8-bit values, so read it as uchar rather than bool.
        if (inlierMask.at<uchar>(static_cast<int>(i))) {
            const cv::Point2f shiftedPt = secondInlierPts[i] + cv::Point2f(firstPic.cols, 0);
            cv::circle(dbgPic, firstInlierPts[i], 5, {255, 255, 0});
            cv::circle(dbgPic, shiftedPt, 5, {0, 255, 255});
            cv::line(dbgPic, firstInlierPts[i], shiftedPt, {255, 255, 255});
            ++j;
        }
    }
    cv::imshow("Debug", dbgPic);
    // imshow only renders once the GUI event loop runs; press any key to continue.
    cv::waitKey(0);

    if (numPoints > 5) {
        return cv::Affine3d(rotation, translation);
    } else {
        return cv::Affine3d(cv::Matx33d::eye(), cv::Vec3d::zeros());
    }
}


上面的调试代码会绘制 cv::recoverPose() 函数的内点,这意味着在大多数帧中,至少匹配到 5 个关键点的帧之间存在有效变换(为保持画面清晰,我这里最多只显示 25 个内点,但平均有 600-900 个):

这个结果让我感到困惑,因为 3D 场景清楚地显示了无意义的姿势,即使 cv::recoverPose() 函数能够恢复具有数百个内点的姿势。有谁知道为什么我在 3D 视图中显示的姿势如此错误?

c++ opencv computer-vision augmented-reality structure-from-motion
