重复文件查找器算法的建议（使用C）

Question

我想编写一个程序来测试两个文件是否重复（具有完全相同的内容）。首先，我测试两个文件的大小是否相同；如果相同，再开始比较它们的内容。

我的第一个想法是将文件“拆分”为固定大小的块，然后为每个块启动一个线程，用fseek定位到每个块的起始字符，并行地进行比较。当某个线程的比较失败时，其他工作线程会被取消，程序退出创建线程的循环。

代码如下所示：dupf.h

#ifndef NM_DUPF_H
#define NM_DUPF_H

#include <stdio.h>          /* FILE, used by safe_fopen() below */

#define NUM_THREADS 15      /* Worker threads started per batch by check_dup() */
#define BLOCK_SIZE 8192     /* Bytes compared by one call to check_block_dup() */

/* Thread argument structure */
struct thread_arg_s {
    const char *name_f1;        /* First file name */
    const char *name_f2;        /* Second file name */
    long cursor;                /* Byte offset of the block; long because
                                 * that is the offset type fseek() takes */
};
typedef struct thread_arg_s thread_arg;

/**
 * Thread entry point; 'arg' is of type thread_arg.
 * Checks if the specified file blocks are duplicates.
 * The thread exits with the string "dup" or "notdup".
 */
void *check_block_dup(void *arg);

/**
 * Checks if two files are duplicates.
 * Returns 0 if they are, 1 if they are not, -1 on error.
 */
int check_dup(const char *name_f1, const char *name_f2);

/**
 * Returns a valid pointer to a file.
 * If the file (given by the path/name 'name') cannot be opened
 * in 'mode', an error message is shown and the program exits.
 */
FILE *safe_fopen(const char *name, const char *mode);

#endif

dupf.c

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"

/**
 * Opens 'fname' in 'mode' and returns the stream; never returns NULL.
 * On failure prints "FOPEN() <fname>\t: <reason>" to stderr and terminates
 * the program with exit(-1).
 */
FILE *safe_fopen(const char *fname, const char *mode)
{
    FILE *f = fopen(fname, mode);
    if (f == NULL) {
        /* Save errno first: later library calls may overwrite it. */
        int saved_errno = errno;
        /* Write straight to stderr instead of formatting into a fixed-size
         * buffer, which overflowed for long file names. The output has the
         * same layout perror() produced. */
        fprintf(stderr, "FOPEN() %s\t: %s\n", fname, strerror(saved_errno));
        exit(-1);
    }
    return f;
}

void *check_block_dup(void *arg)
{
    const char *name_f1 = NULL, *name_f2 = NULL;    /* File names */
    FILE *f1 = NULL, *f2 = NULL;                    /* Streams */
    int cursor = 0;                                 /* Reading cursor */
    char buff_f1[BLOCK_SIZE], buff_f2[BLOCK_SIZE];  /* Character buffers */
    int rchars_1, rchars_2;                         /* Readed characters */
    /* Initializing variables from 'arg' */
    name_f1 = ((thread_arg*)arg)->name_f1;
    name_f2 = ((thread_arg*)arg)->name_f2;
    cursor = ((thread_arg*)arg)->cursor;
    /* Opening files */
    f1 = safe_fopen(name_f1, "r");
    f2 = safe_fopen(name_f2, "r");
    /* Setup cursor in files */
    fseek(f1, cursor, SEEK_SET);
    fseek(f2, cursor, SEEK_SET);
    /* Initialize buffers */
    rchars_1 = fread(buff_f1, 1, BLOCK_SIZE, f1);
    rchars_2 = fread(buff_f2, 1, BLOCK_SIZE, f2);
    if (rchars_1 != rchars_2) {
        /* fread failed to read the same portion.
         * program cannot continue */
        perror("ERROR WHEN READING BLOCK");
        exit(-1);
    }
    while (rchars_1-->0) {
        if (buff_f1[rchars_1] != buff_f2[rchars_1]) {
            /* Different characters */
            fclose(f1);
            fclose(f2);
            pthread_exit("notdup");
        }
    }
    /* Close streams */
    fclose(f1);
    fclose(f2);
    pthread_exit("dup");
}

/* Requests cancellation of threads[from] .. threads[to - 1]. */
static void cancel_threads(pthread_t *threads, int from, int to)
{
    int i;
    for (i = from; i < to; i++) {
        pthread_cancel(threads[i]);
    }
}

/**
 * Checks whether two files have identical contents.
 *
 * The sizes are compared first. Files of equal size are then compared in
 * blocks of BLOCK_SIZE bytes, NUM_THREADS blocks at a time, one
 * check_block_dup() thread per block.
 *
 * Returns 0 if the files are duplicates, 1 if they are not, -1 on error
 * (invalid names, stat/thread failure, or an offset that does not fit the
 * 'cursor' field of thread_arg).
 */
int check_dup(const char *name_f1, const char *name_f2)
{
    pthread_t threads[NUM_THREADS];     /* Threads of the current batch */
    thread_arg args[NUM_THREADS];       /* One argument record per thread */
    struct stat buf_f1;
    struct stat buf_f2;
    off_t cursor = 0;                   /* Offset of the next block */
    int result = 0;                     /* 0 dup so far, 1 notdup, -1 error */

    if (name_f1 == NULL || name_f2 == NULL) {
        /* Invalid input parameters; errno is not involved here */
        fprintf(stderr, "INVALID FNAMES\n");
        return (-1);
    }

    if (stat(name_f1, &buf_f1) != 0 || stat(name_f2, &buf_f2) != 0) {
        /* Stat fails */
        int saved_errno = errno;
        fprintf(stderr, "STAT() ERROR: %s %s\t: %s\n",
                name_f1, name_f2, strerror(saved_errno));
        return (-1);
    }

    if (buf_f1.st_size != buf_f2.st_size) {
        /* File have different sizes */
        return (1);
    }

    /* Files have the same size: compare them batch by batch */
    while (result == 0 && cursor < buf_f1.st_size) {
        int started = 0;                /* Threads created in this batch */
        int i;

        while (started < NUM_THREADS && cursor < buf_f1.st_size) {
            int rc;
            args[started].name_f1 = name_f1;
            args[started].name_f2 = name_f2;
            args[started].cursor = cursor;
            /* Round-trip check: refuse offsets the field cannot hold
             * instead of silently comparing the wrong block. */
            if ((off_t)args[started].cursor != cursor) {
                fprintf(stderr, "FILE TOO LARGE FOR BLOCK OFFSET\n");
                result = -1;
                break;
            }
            rc = pthread_create(&threads[started], NULL, check_block_dup,
                                &args[started]);
            if (rc != 0) {
                /* pthread functions return the error number directly */
                fprintf(stderr, "THREAD CREATION FAILURE: %s\n", strerror(rc));
                result = -1;
                break;
            }
            started++;
            cursor += BLOCK_SIZE;
        }

        if (result != 0) {
            /* Batch setup failed: stop whatever was already started */
            cancel_threads(threads, 0, started);
        }

        /* Every created thread is joined, cancelled or not, so that none
         * is left running while 'args' goes out of scope. */
        for (i = 0; i < started; i++) {
            void *thread_res = NULL;
            int before = result;
            int rc = pthread_join(threads[i], &thread_res);
            if (rc != 0) {
                fprintf(stderr, "THREAD JOIN FAILURE: %s\n", strerror(rc));
                result = -1;
            } else if (result == 0 &&
                       strcmp((const char *)thread_res, "notdup") == 0) {
                /* Only threads we never cancelled reach this strcmp, so
                 * thread_res is one of the worker's string literals. */
                result = 1;
            }
            if (before == 0 && result != 0) {
                /* Outcome is decided: the remaining blocks are irrelevant */
                cancel_threads(threads, i + 1, started);
            }
        }
    }
    return result;
}

该功能正常（至少对于我测试过的情况）。不过，来自#C（freenode）的一些人认为这个解决方案过于复杂，并且由于在硬盘上并行读取，它的性能可能不佳。

我想知道的：

默认情况下，线程方法是否有缺陷？
fseek（）这么慢吗？
有没有办法以某种方式将文件映射到内存然后比较它们？

编辑：

今天我有一些时间，已经按照你们的建议做了修改。你们是对的，这个线程版本实际上比单线程版本更差，而这都是因为在硬盘上并行读取。

另一件事是我编写了一个使用mmap（）的函数，它是目前为止最好的实现。该函数最大的缺点仍然是当文件变得非常大时它会失败。

这是新的实现（非常粗暴和直接的代码）：

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"

/**
* Opens 'fname' in 'mode' and returns the stream; never returns NULL.
* If the file cannot be opened, "Cannot open file: <fname>\t: <reason>"
* is printed to stderr and the program terminates with exit(-1).
**/
FILE *safe_fopen(const char *fname, const char *mode)
{
    FILE *f = fopen(fname, mode);
    if (f == NULL) {
        /* Save errno first: later library calls may overwrite it. */
        int saved_errno = errno;
        /* Write straight to stderr instead of formatting into a fixed-size
         * buffer, which overflowed for long file names. The output has the
         * same layout perror() produced. */
        fprintf(stderr, "Cannot open file: %s\t: %s\n", fname, strerror(saved_errno));
        exit(-1);
    }
    return f;
}

/**
* Compares the sizes of two files as reported by stat().
* When non-NULL, 'f1_size' and 'f2_size' receive the respective sizes
* (only if both stat() calls succeeded).
* Returns:
* -1    Error (NULL file name or stat() failure).
* 0 If both files have the same size.
* 1 If the sizes differ.
**/
int check_same_size(const char *f1_name, const char *f2_name, off_t *f1_size, off_t *f2_size)
{
    struct stat info_a;
    struct stat info_b;
    int stat_failed;

    if (!f1_name || !f2_name) {
        fprintf(stderr, "Invalid filename passed to function [check_same_size].\n");
        return -1;
    }

    /* The second file is only examined when the first stat() succeeded. */
    stat_failed = (stat(f1_name, &info_a) != 0);
    if (!stat_failed) {
        stat_failed = (stat(f2_name, &info_b) != 0);
    }
    if (stat_failed) {
        fprintf(stderr, "Cannot apply stat. [check_same_size].\n");
        return -1;
    }

    /* Report the sizes to callers that asked for them. */
    if (f1_size) {
        *f1_size = info_a.st_size;
    }
    if (f2_size) {
        *f2_size = info_b.st_size;
    }

    if (info_a.st_size != info_b.st_size) {
        return 1;
    }
    return 0;
}

/**
* Test if two files are duplicates by reading both sequentially in
* chunks of 'block_size' bytes and comparing the chunks.
* Returns:
* -1    Error (NULL name, block_size <= 0, stat/allocation/read failure).
* 0 If they are duplicates.
* 1 If they are not duplicates.
* A file that cannot be opened still terminates the program, because
* the streams are obtained through safe_fopen().
**/
int check_dup_plain(char *f1_name, char *f2_name, int block_size)
{
    FILE *f1 = NULL, *f2 = NULL;
    char *f1_buff = NULL, *f2_buff = NULL;
    size_t chunk;
    int res;

    if ((f1_name == NULL) || (f2_name == NULL)){
        fprintf(stderr, "Invalid filename passed to function [check_dup_plain].\n");
        return (-1);
    }
    if (block_size <= 0){
        fprintf(stderr, "Invalid block size passed to function [check_dup_plain].\n");
        return (-1);
    }

    /* 1: sizes differ, so not duplicates; -1: stat failed, do not go on */
    res = check_same_size(f1_name, f2_name, NULL, NULL);
    if (res != 0){
        return res;
    }

    /* Heap buffers: a caller-sized array on the stack could overflow it */
    chunk = (size_t)block_size;
    f1_buff = malloc(chunk);
    f2_buff = malloc(chunk);
    if ((f1_buff == NULL) || (f2_buff == NULL)){
        fprintf(stderr, "Cannot allocate buffers. [check_dup_plain].\n");
        res = -1;
        goto out;
    }

    /* Binary mode: the comparison must see the raw bytes */
    f1 = safe_fopen(f1_name, "rb");
    f2 = safe_fopen(f2_name, "rb");

    for (;;){
        size_t rch1 = fread(f1_buff, 1, chunk, f1);
        size_t rch2 = fread(f2_buff, 1, chunk, f2);
        if ((rch1 != rch2) || ferror(f1) || ferror(f2)){
            fprintf(stderr, "Invalid reading from file. Cannot continue. [check_dup_plain].\n");
            res = -1;
            break;
        }
        if (rch1 == 0){
            /* Both streams are exhausted without a difference */
            break;
        }
        if (memcmp(f1_buff, f2_buff, rch1) != 0){
            res = 1;
            break;
        }
    }

out:
    /* Single exit: streams and buffers are released on every path */
    if (f1 != NULL){
        fclose(f1);
    }
    if (f2 != NULL){
        fclose(f2);
    }
    free(f1_buff);
    free(f2_buff);
    return res;
}

/**
* Test if two files are duplicates by mapping both into memory with
* mmap() and comparing the mappings.
* Returns:
* -1    Error (NULL name, stat/open/mmap failure, file too large to map).
* 0 If they are duplicates.
* 1 If they are not duplicates.
**/
int check_dup_memmap(char *f1_name, char *f2_name)
{
    void *f1_array = MAP_FAILED, *f2_array = MAP_FAILED;
    off_t f1_size = 0, f2_size = 0;
    size_t len = 0;
    int f1_des = -1, f2_des = -1;
    int res;

    if((f1_name == NULL) || (f2_name == NULL)){
        fprintf(stderr, "Invalid filename passed to function [check_dup_memmap].\n");
        return (-1);
    }

    /* 1: sizes differ; -1: stat failed, the sizes were not filled in */
    res = check_same_size(f1_name, f2_name, &f1_size, &f2_size);
    if(res != 0){
        return res;
    }
    if(f1_size == 0){
        /* Two empty files are duplicates; mmap() rejects a zero length */
        return (0);
    }

    /* Round-trip check: refuse sizes that do not fit in size_t */
    len = (size_t)f1_size;
    if((off_t)len != f1_size){
        fprintf(stderr, "File too large to map [check_dup_memmap].\n");
        return (-1);
    }

    res = -1;
    f1_des = open(f1_name, O_RDONLY);
    if(f1_des == -1){
        perror("Cannot open file");
        goto out;
    }
    f2_des = open(f2_name, O_RDONLY);
    if(f2_des == -1){
        perror("Cannot open file");
        goto out;
    }

    /* mmap() reports failure with MAP_FAILED, not with NULL */
    f1_array = mmap(0, len, PROT_READ, MAP_SHARED, f1_des, 0);
    if(f1_array == MAP_FAILED){
        fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
        goto out;
    }
    f2_array = mmap(0, len, PROT_READ, MAP_SHARED, f2_des, 0);
    if(f2_array == MAP_FAILED){
        fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
        goto out;
    }

    res = (memcmp(f1_array, f2_array, len) != 0) ? 1 : 0;

out:
    /* Single exit: mappings and descriptors are released on every path */
    if(f1_array != MAP_FAILED){
        munmap(f1_array, len);
    }
    if(f2_array != MAP_FAILED){
        munmap(f2_array, len);
    }
    if(f1_des != -1){
        close(f1_des);
    }
    if(f2_des != -1){
        close(f2_des);
    }
    return res;
}

/**
* Compares two files with check_dup_memmap() and prints its result
* (0 duplicates, 1 not duplicates, -1 error).
* Usage: prog [FILE1 FILE2]
* Without two file arguments the files "f2" and "f1" are compared.
**/
int main(int argc, char *argv[])
{
    char default_first[] = "f2";
    char default_second[] = "f1";
    char *first = default_first;
    char *second = default_second;

    if(argc >= 3){
        first = argv[1];
        second = argv[2];
    }
    printf("result: %d\n", check_dup_memmap(first, second));
    return (0);
}

我现在计划通过重新添加线程功能来扩展此代码，但这次读取将在内存中。

谢谢你的回答。

Answer 1

如果没有真正的系统进行测试，很难猜测性能（例如，如果你使用固态驱动器，那么没有磁头寻道时间，从不同线程读取不同扇区的成本几乎为零）。

如果这是在配有常规（旋转盘片）硬盘的普通标准计算机上运行，那么让多个线程争抢各自想要读取的磁盘区域可能会减慢速度（这同样取决于硬件和块的大小）。

如果计算块的“相同性”所花费的时间与从磁盘读取该块所花费的时间相比是快速的，那么拥有一个单独的线程将无济于事，因为第二个（或第三个......）线程将花费大部分时间都在等待IO完成。

另一个因素是CPU的高速缓存大小。如果您一次处理的所有内存都适合CPU缓存，那么事情会比不同的线程在执行指令时将不同的内存块加载到缓存中的速度快得多。

如果您拥有的线程数多于CPU核心数，则只会因为不必要的上下文切换（因为线程需要核心才能运行）而减慢速度。

阅读完所有内容之后，如果您仍然认为多线程对您的目标系统有所帮助，请考虑一个只执行IO的线程，将数据放入队列，并有两个或多个工作线程从队列中取出数据来处理。这样，您可以优化磁盘IO并利用多个内核来处理数字。

史蒂夫建议你可以在Unix上对文件进行内存映射。通过利用低级操作系统功能（与管理交换文件所用的机制相同），这将加快对底层数据的访问。这将为您带来一些性能改进，因为操作系统会负责把您正在处理的文件部分高效地加载到内存中，只要该文件能放进可用的地址空间即可。仅供参考，你也可以在Windows上做同样的事情。

Answer 2

限制因素是磁盘读取，（假设两个文件都在同一个磁盘上）无论如何都会序列化，所以我认为线程根本不会有多大帮助。

Answer 3

您可以通过使用散列来大大简化代码，而不是进行逐字节比较。假设你没有做任何重要的事情，比如删除，md5或类似的哈希函数应该是充足的。 Boost提供了不少，而且它们通常非常快。

if fileA.size == fileB.size
    if fileA.hash() == fileB.hash()
        flag(fileA, fileB, same);

在比较之后我不会删除文件，但是将它们移动到临时目录以进行进一步检查或者只是构建可能重复的列表是非常安全的。

Answer 4

在考虑并行磁盘读取和线程开销等性能影响之前......

有没有理由相信以块的形式扫描文件会比直接顺序扫描更快地发现差异？文件中包含的数据是否主要采用某种格式？如果是，拆分方案是否为其量身定制？如果没有，我看不出每隔n个字节跳着扫描文件（这正是所有多线程拆分实际上在做的事情）相比按字节在磁盘上的顺序读取能带来什么改进。

考虑两种限制情况 - 将文件“拆分”为一个块，并将文件拆分为与文件中的字节一样多的单字节“块”。这两种情况中的任何一种都会比另一种情况更有效，还是一些中间价值？如果没有您知道应该优化的中间值，那么您对数据如何存储在文件中一无所知，因此扫描它们的方式应该没有区别。

即使您将拆分设置为优化磁盘的性能（如块大小），您仍然必须返回读取下一个字节，这可能是非常不理想的位置。最后，无论你如何拆分文件，你都必须阅读文件中的每个字节。

Answer 5

因为您正在使用pthread，我假设您在Unix环境中工作 - 在这种情况下，您可以将两个文件mmap（2）到内存中并直接比较内存数组。

Answer 6

那么，有一个标准的内存映射mmap（）函数将文件映射到内存。你应该能够做类似的事情

int fd1;
int fd2;
int size1;
int size2;

fd1 = open(name1, O_RDONLY);
size1 = lseek(fd1, 0, SEEK_END); 

fd2 = open(name2, O_RDONLY);
size2 = lseek(fd2, 0, SEEK_END);

if ( size1 == size2 )
{
   char * data1 = mmap(0, size1, PROT_READ, MAP_SHARED, fd1, 0);
   char * data2 = mmap(0, size1, PROT_READ, MAP_SHARED, fd2, 0);
   int i;

   /* ...and this is, obviously, where you'd do something more clever */
   for ( i = 0; i < size1 && *data1 == *data2; i++, data1++, data2++ );

   if ( i == size1 )
       printf("Equal\n");
}

close(fd1);
close(fd2);

除此之外，是的，您的解决方案看起来过于复杂;-)线程方法不一定有缺陷，但您可能看不到并行访问提高了性能。对于SAN驱动器或ramdisks，它可能会提高性能，对于普通的旋转盘片驱动器，它可能会阻碍它。但更简单通常更好，除非你真的有性能问题。

关于fseek（）vs其他方法，它取决于您使用的操作系统。 Google是您的朋友，您可以轻松找到至少适用于Solaris和Linux的文章。

Answer 7

即使磁盘访问不是限制因素（它将是），除非你有一个多核处理器可以将不同的线程切换到不同的核心，你不会看到多线程的加速。基本上，你必须以这种或那种方式比较文件的所有N个字节，即使你使用线程，如果它们在同一个核心中执行，它将花费相同的时间而不使用线程。

有些环境可以将工作负载分散到各个内核中，但即使这样，CPU也能够比从磁盘中提取数据的速度快得多，磁盘I / O系统将成为限制因素。

重复文件查找器算法的建议（使用C）

问题描述投票：2回答：7

7个回答

最新问题

重复文件查找器算法的建议（使用C）

问题描述 投票：2回答：7

7个回答

最新问题

问题描述投票：2回答：7