如何在 C++ 中快速安全地从文本文件中读取极长的行？

Question

有一个 6.53 GiB 的大文本文件。它的每一行可以是数据行或注释行。注释行通常很短，少于 80 个字符，而数据行包含超过 200 万个字符，并且长度可变。

考虑到每个数据行都需要作为一个单元来处理，有没有一种简单的方法可以在C++中安全快速地读取行？

安全（对于长度可变的数据行是安全的）：该解决方案应与

std::getline()

一样易于使用。由于长度在变化，希望避免额外的内存管理。

快：解决方案可以达到与 python 3.6.0 中的 readline() 一样快，甚至与 stdio.h 中的 fgets() 一样快。

欢迎纯C解决方案。C和C++都提供了进一步处理的接口。

更新1：感谢Basile Starynkevitch的简短但宝贵的评论，完美的解决方案出现了：

POSIX getline()

。由于进一步的处理仅涉及从字符到数字的转换，并且不使用字符串类的许多功能，因此在此应用程序中，字符数组就足够了。

更新2：感谢Zulan和Galik的评论，他们都报告了

std::getline()

、

fgets()

和

POSIX getline()

之间的性能相当，另一种可能的解决方案是使用更好的标准库实现，例如

 libstdc++

。此外，这里有一个报告声称

std::getline

的Visual C++和libc++实现没有得到很好的优化。

从

libc++

切换到

libstdc++

会极大地改变结果。在不同平台上使用 libstdc++ 3.4.13 / Linux 2.6.32 时，

POSIX getline()

、

std::getline()

和

fgets()

显示出相当的性能。一开始，代码是在Xcode 8.3.2（8E2002）中默认的clang设置下运行的，因此使用了

libc++

。

更多细节和一些尝试（很长）：

<string> 中的 getline() 可以处理任意长的行，但速度有点慢。python 中的 readline() 在 C++ 中是否有替代方案？

// benchmark on Mac OS X with libc++ and SSD:
readline() of python                         ~550 MiB/s

fgets() of stdio.h, -O0 / -O2               ~1100 MiB/s

getline() of string, -O0                      ~27 MiB/s
getline() of string, -O2                     ~150 MiB/s
getline() of string + stack buffer, -O2      ~150 MiB/s

getline() of ifstream, -O0 / -O2             ~240 MiB/s
read() of ifstream, -O2                      ~340 MiB/s

wc -l                                        ~670 MiB/s

cat data.txt | ./read-cin-unsync              ~20 MiB/s

getline() of stdio.h (POSIX.1-2008), -O0    ~1300 MiB/s

速度值经过非常粗略的舍入，只是为了显示数量级；所有代码块都运行了多次，以确保这些值具有代表性。
'-O0 / -O2' 表示两个优化级别的速度非常相似
代码如下。

readline()
Python

# readline.py

import time
import os

# Benchmark: count the data lines (longer than 80 characters) in data.txt
# using file.readline(), then report the read throughput in MiB/s.
t_start = time.perf_counter()

fname = 'data.txt'
count = 0

# `with` guarantees the file is closed even if a read raises.
with open(fname, 'rt') as fin:
    while True:
        line = fin.readline()
        length = len(line)
        if length == 0:     # EOF: readline() returned an empty string
            break
        if length > 80:     # data line
            count += 1

t_end = time.perf_counter()
# Named `elapsed` (not `time`) so the `time` module is not shadowed.
elapsed = t_end - t_start

fsize = os.path.getsize(fname)/1024/1024   # file size in MiB
print("speed: %d MiB/s" %(fsize/elapsed))
print("reads %d data lines" %count)

# run as `python readline.py` with python 3.6.0

fgets()

stdio.h

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

/*
 * Benchmark: read the file named by argv[1] line by line with fgets() and
 * count the lines longer than 80 characters (data lines).
 *
 * Prints the elapsed processor time (as measured by clock()), the
 * throughput and the data-line count to stdout.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE on wrong usage, when the file
 * cannot be opened, or when the line buffer cannot be allocated.
 */
int main(int argc, char* argv[]){
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  FILE* fp = fopen(argv[1], "r");
  if(fp == NULL) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  // Heap-allocated: a 3 MiB automatic array can exceed a small default stack.
  // The cast keeps the snippet valid as both C and C++.
  char* line = (char*)malloc(SIZE);
  if(line == NULL) {
    perror("Failed to allocate line buffer");
    fclose(fp);
    return EXIT_FAILURE;
  }

  int count = 0;
  while(fgets(line, SIZE, fp) != NULL) {
    if(strlen(line) > 80) {
      count += 1;
    }
  }

  clock_t t_end = clock();

  // Release resources after the timed section so the measurement is unchanged.
  free(line);
  fclose(fp);

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}

getline()

<string>

// readline-string-getline.cpp
#include <string>
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark entry point: reads the file named by argv[1] line by line with
// std::getline() into a std::string, using a manually supplied stream
// buffer, counts the lines longer than 80 characters and prints timing.
// Returns EXIT_FAILURE on wrong usage or when the file cannot be opened.
int main(int argc, char* argv[]) {
  const clock_t started = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  // Give the stream a 3 MiB automatic buffer before the file is opened
  // (the author notes the stack is 8 MiB on their platform).
  // To benchmark the default buffer instead: std::ifstream fin(argv[1]);
  const int BUFFERSIZE = 1024*1024*3;
  char buffer[BUFFERSIZE];
  std::ifstream fin;
  fin.rdbuf()->pubsetbuf(buffer, BUFFERSIZE);
  fin.open(argv[1]);

  if(fin.fail()) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // Pre-size the string to the maximum line length (found earlier with python).
  const int SIZE = 1024*1024*3;
  std::string line;
  line.reserve(SIZE);

  int count = 0;
  for(;;) {
    if(!std::getline(fin, line)) {
      break;
    }
    if(line.size() > 80) {
      ++count;
    }
  }

  const clock_t finished = clock();

  const double fsize = 6685;  // file size in MiB
  const double seconds = static_cast<double>(finished - started) / CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", static_cast<int>(fsize/seconds));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}

getline()

ifstream

// readline-ifstream-getline.cpp
#include <fstream>
#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark entry point: reads the file named by argv[1] line by line with
// std::istream::getline() into a fixed char buffer and counts the lines
// longer than 80 characters.
//
// Prints the elapsed processor time, the throughput and the data-line count
// to stdout. Returns EXIT_SUCCESS; returns EXIT_FAILURE on wrong usage, when
// the file cannot be opened, or when reading stopped before end of file.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  std::ifstream fin(argv[1]);
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  // maximum length of lines, determined previously by python
  const int SIZE = 1024*1024*3;
  char line[SIZE];

  int count = 0;
  while(fin.getline(line, SIZE)) {
    if(strlen(line) > 80) {
      count += 1;
    }
  }

  clock_t t_end = clock();

  // getline() also fails (failbit without eofbit) when a line does not fit
  // into the buffer; report that instead of printing a truncated count.
  if(!fin.eof()) {
    fprintf(stderr,
            "stopped before end of file: line longer than %d characters or read error\n",
            SIZE-1);
    return EXIT_FAILURE;
  }

  const double fsize = 6685;  // file size in MiB

  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines\n", count);

  return EXIT_SUCCESS;
}

read()

ifstream

// seq-read-bin.cpp
// sequentially read the file to see the speed upper bound of
// ifstream

#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>

using namespace std;


// Benchmark entry point: reads the file named by argv[1] sequentially in
// 3 MiB binary chunks with std::istream::read() to estimate the upper bound
// of ifstream throughput.
//
// Prints the elapsed processor time and the throughput to stdout.
// Returns EXIT_SUCCESS, or EXIT_FAILURE on wrong usage or when the file
// cannot be opened.
int main(int argc, char* argv[]) {
  clock_t t_start = clock();

  if(argc != 2) {
    fprintf(stderr, "needs one input argument\n");
    return EXIT_FAILURE;
  }

  std::ifstream fin(argv[1], std::ios::binary);
  // Without this check a bad path would skip the loop and report a bogus speed.
  if(!fin) {
    perror("Failed to open file");
    return EXIT_FAILURE;
  }

  const int SIZE = 1024*1024*3;
  char str[SIZE];

  // read() sets failbit on the final short read, which ends the loop.
  while(fin) {
    fin.read(str,SIZE);
  }

  clock_t t_end = clock();
  double time = (t_end-t_start) / (double)CLOCKS_PER_SEC;

  const double fsize = 6685;  // file size in MiB

  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));

  return EXIT_SUCCESS;
}

使用 cat 通过管道输入，然后在调用 cin.sync_with_stdio(false) 后从 cin 读取

#include <iostream>
#include <ctime>
#include <cstdlib>

using namespace std;

// Benchmark entry point: drains standard input line by line through
// std::getline() with C stdio synchronisation switched off, then prints
// the elapsed processor time and the throughput.
int main(void) {
  const clock_t started = clock();

  std::string input_line;

  std::ios_base::sync_with_stdio(false);

  // Keep pulling lines until a read fails; the lines themselves are discarded.
  while(std::getline(std::cin, input_line)) {
  }

  const double fsize = 6685;  // file size in MiB
  const double seconds = static_cast<double>(clock() - started) / CLOCKS_PER_SEC;

  fprintf(stdout, "takes %.2f s\n", seconds);
  fprintf(stdout, "speed: %d MiB/s\n", static_cast<int>(fsize/seconds));

  return EXIT_SUCCESS;
}

POSIX getline()

// readline-c-getline.c
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Benchmark: read the file named by argv[1] line by line with POSIX
 * getline() and count the lines longer than 80 characters (data lines).
 *
 * Prints the elapsed processor time, the throughput and the data-line count
 * to stdout. Exits with EXIT_SUCCESS, or with EXIT_FAILURE on wrong usage,
 * when the file cannot be opened, or when reading ended on a stream error.
 */
int main(int argc, char *argv[]) {

  clock_t t_start = clock();

  char *line = NULL;   /* buffer managed by getline(); released with free() */
  size_t len = 0;
  ssize_t nread;

  if (argc != 2) {
    /* Print the program name: argv[1] is NULL when no argument was given. */
    fprintf(stderr, "Usage: %s <file>\n",
            argc > 0 ? argv[0] : "readline-c-getline");
    exit(EXIT_FAILURE);
  }

  FILE *stream = fopen(argv[1], "r");
  if (stream == NULL) {
    perror("fopen");
    exit(EXIT_FAILURE);
  }

  int count = 0;
  while ((nread = getline(&line, &len, stream)) != -1) {
    if (nread > 80) {
      count += 1;
    }
  }

  /* getline() returns -1 both at end of file and on error; tell them apart. */
  if (ferror(stream)) {
    perror("getline");
    free(line);
    fclose(stream);
    exit(EXIT_FAILURE);
  }

  free(line);
  fclose(stream);

  double time = (clock() - t_start) / (double)CLOCKS_PER_SEC;
  const double fsize = 6685;  // file size in MiB
  fprintf(stdout, "takes %.2f s\n", time);
  fprintf(stdout, "speed: %d MiB/s\n", (int)(fsize/time));
  fprintf(stdout, "reads %d data lines.\n", count);

  exit(EXIT_SUCCESS);
}

Answer 1

C 标准库是 C++ 标准库的子集。来自 C++ 2014 标准的 n4296 草案：

17.2 C 标准库 [library.c]

C++ 标准库还提供了 C 标准库的功能，并经过适当调整以确保静态类型安全。

因此，只要您在注释中说明这是性能瓶颈所需，那么在 C++ 程序中使用

fgets

是完全可以的 - 只是您应该仔细地将其封装在实用程序类中，以便保留 OO 高级结构。

Answer 2

正如我所评论的，在 Linux 和 POSIX 系统上，您可以考虑使用 getline(3);我猜想以下内容可以编译为 C 和 C++（假设您确实有一些有效的

fopen

-ed

FILE*fil;

...）

// Reads every line from `fil` (a FILE* opened elsewhere) with getline(3).
// linbuf/linsiz are passed by address so getline can manage the line buffer;
// the buffer is released with free() once the loop is done.
char* linbuf = NULL; /// or nullptr in C++
size_t linsiz = 0;
ssize_t linlen = 0;

// The loop ends as soon as getline returns a negative value.
while((linlen=getline(&linbuf, &linsiz,fil))>=0) {
  // do something useful with linbuf; but no C++ exceptions
  // (an exception leaving this loop would skip the free() below)
}
free(linbuf); linsiz=0;

我想这可能适用于（或很容易适应）C++。但是，请注意 C++ 异常：它们不应该穿过 while 循环向外传播（或者您应该确保有适当的析构函数或

catch

正在执行

free(linbuf);

）。

此外，

getline

也可能会失败（例如，如果它调用失败的

malloc

），您可能需要明智地处理该失败。

Answer 3

是的，有一种更快的方法来读取行和创建字符串。

查询文件大小，然后加载到缓冲区中。然后迭代缓冲区，用 null 替换换行符，并将指针存储到下一行。

如果您的平台有一个将文件加载到内存中的调用（很可能），那么速度会快很多。

如何在 C++ 中快速安全地从文本文件中读取极长的行？

问题描述投票：0回答：3

3个回答

最新问题

如何在 C++ 中快速安全地从文本文件中读取极长的行？

问题描述 投票：0回答：3

3个回答

最新问题

问题描述投票：0回答：3