我正在尝试读取以下格式的 CSV 文件:
imdb_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,director,writer,actors,language,country,awards,imdb_rating,imdb_votes
这是 csv
0,tt0147800,10 Things I Hate About You,"A pretty,popular teenager can't go out on a date until her ill-tempered older sister does.",movie,PG-13,1999,31 Mar 1999,"November 12, 2019",97 min,"Comedy, Drama, Romance",Gil Junger,"Karen McCullah, Kirsten Smith","Heath Ledger, Julia Stiles, Joseph Gordon-Levitt, Larisa Oleynik","English, French",USA,2 wins & 13 nominations.,7.3,"283,945"
2,tt0115433,101 Dalmatians,"An evil high-fashion designer plots to steal Dalmatian puppies in order to make an extravagant fur coat, but instead creates an extravagant mess.",movie,G,1996,27 Nov 1996,"November 12, 2019",103 min,"Adventure, Comedy, Crime, Family",Stephen Herek,"Dodie Smith (novel), John Hughes (screenplay)","Glenn Close, Jeff Daniels, Joely Richardson,Joan Plowright","English, Spanish","USA, UK",Nominated for 1 Golden Globe. Another 3 wins &9 nominations.,5.7,"97,785"
并将每一列放入以下结构中:
/*
 * Record type the question wants each CSV row stored in.
 * Members appear in the same order as the column names of the CSV header.
 * MAX is not defined in this excerpt.
 */
typedef struct{
char imdb_id[12];
char title[50];
char plot[MAX];
char type[15];
char rated[10];
int year;
char released_at[50];
char added_at[MAX];
char runtime[50];
char genre[MAX];
char director[50];
char writer[MAX];
char actors[MAX];
char language[MAX];
char country[20];
char awards[50];
float imdb_rating;
char imdb_votes[MAX];
} order;
我尝试了以下代码,但第二行国家/地区没有显示。
// Question's attempt: parse every record with a single fscanf() format string.
// `file`, `or` and `line` are declared outside this excerpt.
FILE *f=fopen(file,"r");
if ( f == NULL)
{
printf("Empty\n");
return 1;
}
// NOTE(review): feof() only becomes true after a read has already hit
// end-of-file, and `read` is never checked below, so `line` is incremented
// even when fscanf() matched nothing.
while (!feof(f))
{
// Most bare fields use %N[^,] (stop at the next comma); quoted fields use
// \"%2047[^\"]\" (stop at the closing quote). Each position is hard-wired as
// either bare or quoted, so a column that is bare in one row and quoted in
// another (country: USA vs "USA, UK") cannot match this single layout.
int read = fscanf(f, "%11s,%49[^,],\"%2047[^\"]\",%14[^,],%9[^,],%i,%49[^,],\"%2047[^\"]\",%49[^,],\"%2047[^\"]\",%49[^,],\"%2047[^\"]\",\"%2047[^\"]\",\"%2047[^\"]\",%19[^,],%49[^,],%f,\"%2047[^\"]\"\n",
or[line].imdb_id,
or[line].title,
or[line].plot,
or[line].type,
or[line].rated,
& or[line].year,
or[line].released_at,
or[line].added_at,
or[line].runtime,
or[line].genre,
or[line].director,
or[line].writer,
or[line].actors,
or[line].language,
or[line].country,
or[line].awards,
& or[line].imdb_rating,
or[line].imdb_votes);
line++;
}
fclose(f);
我认为问题在于这个语法 -> %19[^,]。
但我不知道该怎么做,因为在第一行 CSV 中,country 只是 USA,而第二行是 "USA, UK",它带有引号,这就打乱了整个解析。
我只想将此 csv 文件读入数组内的结构,以便我可以迭代每一行。
如果可能,首选现有的 csv 库。虽然下面的代码似乎有效,但我仅使用提供的输入对其进行了轻微测试。 csv 有很多变体。
读取一行并解析它。如果您知道最大行大小,那么
fgets()
是一个不错的选择,否则使用 getline()
可以根据需要自动调整行缓冲区的大小。
虽然您可以使用
sscanf()
来解析输出,但您必须至少有条件地处理文本字段(首先查看它是否被引用,然后尝试不带引号)。如果您想支持带引号的字段中双引号的转义,则必须切换到其他内容(即如下所示的更通用的解析器)。
我跳过了两个标题行。
可以考虑读取标题行,并验证它是否符合预期格式,或者据此动态确定字段的顺序。如果您采用动态方式,那么用字段数组来表示会比结构体更自然(尽管您也可以使用
offsetof()
和元数据来索引结构体)。例如:
// Describes one CSV column when the layout is derived from the header line.
struct field {
char *name; // from header
enum { STRING, INT, FLOAT } type; // selects how the column's data is parsed
size_t len; // if type is a string
};
// Holds one parsed cell; which member is meaningful depends on the type of
// the column the cell belongs to.
union value {
char *str_value;
int int_value;
float float_value;
};
您通过读取标头创建一个
field[cols]
,并编写一个小解释器,将类型映射到函数来解析数据(或使用函数指针而不是类型)。然后有一个 union value[cols]
数组来保存一行数据。
下面的解析器会在遇到第一个错误时放弃。其他做法是跳过无效行,或者打印已经成功解析的内容;在后一种情况下,您需要在使用之前先初始化
order o
。
#define _POSIX_C_SOURCE 200112L
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX 256
/*
 * One parsed CSV record. Members appear in the same order as the columns
 * named in the CSV header; the numeric index that starts each data row is
 * not stored. Text members are fixed-size buffers, so longer input is
 * truncated by the parser.
 */
typedef struct{
char imdb_id[12];
char title[50];
char plot[MAX];
char type[15];
char rated[10];
int year;
char released_at[50];
char added_at[MAX];
char runtime[50];
char genre[MAX];
char director[50];
char writer[MAX];
char actors[MAX];
char language[MAX];
char country[20];
char awards[50]; // the sample run truncates an awards value longer than 49 chars
float imdb_rating;
char imdb_votes[MAX]; // kept as text: sample values such as "283,945" contain a comma
} order;
/*
 * Consume a single separator character.
 *
 * l:   current parse position, or NULL if an earlier step already failed.
 * sep: the character expected at *l.
 *
 * Returns the position just after the separator, or NULL when l is NULL or
 * the character at l is not sep. Passing NULL through lets calls be chained
 * with one check at the end.
 */
const char *field_sep(const char *l, char sep) {
	if (l != NULL && *l == sep) {
		return l + 1;
	}
	return NULL;
}
/*
 * Consume the record terminator character sep at l.
 *
 * Same contract as field_sep(): returns the position after the terminator,
 * or NULL when l is NULL or does not start with sep.
 */
const char *record_term(const char *l, char sep) {
	const char *after = field_sep(l, sep);
	return after;
}
/*
 * Parse a floating-point number at l with strtof() and store it in *f.
 *
 * l: current parse position, or NULL if an earlier step already failed.
 * f: receives the converted value (written even when the call fails).
 *
 * Returns the position just after the number. Returns NULL when l is NULL,
 * when no number could be converted, or when the result is infinite in
 * either direction (which is what strtof() yields on overflow); the last two
 * cases also print a diagnostic to stderr.
 */
const char *float_field(const char *l, float *f) {
	if(!l) return NULL;
	char *endptr;
	*f = strtof(l, &endptr);
	if(l == endptr) {
		fprintf(stderr, "float not found %.32s\n", l);
		return NULL;
	}
	// isinf() covers both +HUGE_VALF and -HUGE_VALF.
	if(isinf(*f)) {
		fprintf(stderr, "float out of range %.32s\n", l);
		return NULL;
	}
	return endptr;
}
/*
 * Parse a base-10 integer at l with strtol() and store it in *i.
 *
 * l: current parse position, or NULL if an earlier step already failed.
 * i: receives the converted value; left untouched on failure.
 *
 * Returns the position just after the number. Returns NULL when l is NULL,
 * when no digits could be converted, or when the value does not fit in an
 * int; the last two cases also print a diagnostic to stderr.
 */
const char *int_field(const char *l, int *i) {
	if(!l) return NULL;
	char *endptr;
	// strtol() reports overflow only through errno, so clear it first. This
	// matters on platforms where long and int have the same width and the
	// INT_MIN/INT_MAX comparison alone can never fire.
	errno = 0;
	long tmp = strtol(l, &endptr, 10);
	if(l == endptr) {
		fprintf(stderr, "int not found %.32s\n", l);
		return NULL;
	}
	if(errno == ERANGE || tmp < INT_MIN || tmp > INT_MAX) {
		fprintf(stderr, "int out of range %.32s\n", l);
		return NULL;
	}
	*i = (int) tmp;
	return endptr;
}
/*
 * Copy one text field starting at l into field.
 *
 * l:     current parse position, or NULL if an earlier step already failed.
 * n:     size of field in bytes, including the terminating NUL; must be > 0.
 * field: destination buffer; always NUL-terminated once l is non-NULL.
 *
 * A field that starts with '"' runs up to the next '"' (escaped quotes inside
 * a quoted field are not supported). Any other field runs up to the next ','
 * or, for the last field of a record, up to the newline or end of string.
 * Content longer than n - 1 bytes is truncated with a diagnostic on stderr.
 *
 * Returns the position just after the field (after the closing quote for a
 * quoted field), so the caller can match the separator or terminator next.
 * Returns NULL when l is NULL or a closing quote is missing.
 */
const char *text_field(const char *l, size_t n, char field[n]) {
	if(!l) return NULL;
	assert(n > 0);
	// Never leave the destination uninitialized, even for empty input.
	field[0] = '\0';
	if(!*l) return l;

	const int quoted = (*l == '"');
	const char *end;
	if(quoted) {
		l++;
		end = strchr(l, '"');
		if(!end) {
			fprintf(stderr, "end quote missing %.32s\n", l);
			return NULL;
		}
	} else {
		// Stop at the separator, the record terminator or the NUL; never
		// step past the end of the string.
		end = l + strcspn(l, ",\n");
	}

	size_t len = (size_t) (end - l);
	if(len > n - 1) {
		fprintf(stderr, "truncating %.*s\n", (int) (n - 1), l);
		len = n - 1;
	}
	memcpy(field, l, len);
	field[len] = '\0';
	return end + quoted;
}
int main() {
const char *path = "input.csv";
FILE *f = fopen(path, "r");
if(!f) {
perror(path);
return 1;
}
char *lineptr = NULL;
size_t n;
int rv = 0;
for(size_t i = 0;; i++) {
int rv = getline(&lineptr, &n, f);
if(rv == -1)
break;
if(i < 2) continue; // ignore header
order o;
const char *next = lineptr;
next = int_field(next, &(int) {0}); // ignore
next = field_sep(next, ',');
next = text_field(next, 12, o.imdb_id);
next = field_sep(next, ',');
next = text_field(next, 50, o.title);
next = field_sep(next, ',');
next = text_field(next, MAX, o.plot);
next = field_sep(next, ',');
next = text_field(next, 15, o.type);
next = field_sep(next, ',');
next = text_field(next, 10, o.rated);
next = field_sep(next, ',');
next = int_field(next, &o.year);
next = field_sep(next, ',');
next = text_field(next, 50, o.released_at);
next = field_sep(next, ',');
next = text_field(next, MAX, o.added_at);
next = field_sep(next, ',');
next = text_field(next, 50, o.runtime);
next = field_sep(next, ',');
next = text_field(next, MAX, o.genre);
next = field_sep(next, ',');
next = text_field(next, 50, o.director);
next = field_sep(next, ',');
next = text_field(next, MAX, o.writer);
next = field_sep(next, ',');
next = text_field(next, MAX, o.actors);
next = field_sep(next, ',');
next = text_field(next, MAX, o.language);
next = field_sep(next, ',');
next = text_field(next, 20, o.country);
next = field_sep(next, ',');
next = text_field(next, 50, o.awards);
next = field_sep(next, ',');
next = float_field(next, &o.imdb_rating);
next = field_sep(next, ',');
next = text_field(next, MAX, o.imdb_votes);
next = record_term(next, '\n');
if(!next) {
fprintf(stderr, "failed to parse line %.32s", lineptr);
rv |= 1;
goto out;
}
printf(
"imdb_id: %s\n"
"title: %s\n"
"plot: %s\n"
"type: %s\n"
"rated: %s\n"
"year: %d\n"
"released_at: %s\n"
"added_at: %s\n"
"runtime: %s\n"
"genre: %s\n"
"director: %s\n"
"writer: %s\n"
"actors: %s\n"
"language: %s\n"
"country: %s\n"
"awards: %s\n"
"imdb_rating: %f\n"
"imdb_votes: %s\n"
"\n",
o.imdb_id,
o.title,
o.plot,
o.type,
o.rated,
o.year,
o.released_at,
o.added_at,
o.runtime,
o.genre,
o.director,
o.writer,
o.actors,
o.language,
o.country,
o.awards,
o.imdb_rating,
o.imdb_votes
);
}
out:
free(lineptr);
fclose(f);
return rv;
}
示例运行的标准输出:
imdb_id: tt0147800
title: 10 Things I Hate About You
plot: A pretty,popular teenager can't go out on a date until her ill-tempered older sister does.
type: movie
rated: PG-13
year: 1999
released_at: 31 Mar 1999
added_at: November 12, 2019
runtime: 97 min
genre: Comedy, Drama, Romance
director: Gil Junger
writer: Karen McCullah, Kirsten Smith
actors: Heath Ledger, Julia Stiles, Joseph Gordon-Levitt, Larisa Oleynik
language: English, French
country: USA
awards: 2 wins & 13 nominations.
imdb_rating: 7.300000
imdb_votes: 283,945
imdb_id: tt0115433
title: 101 Dalmatians
plot: An evil high-fashion designer plots to steal Dalmatian puppies in order to make an extravagant fur coat, but instead creates an extravagant mess.
type: movie
rated: G
year: 1996
released_at: 27 Nov 1996
added_at: November 12, 2019
runtime: 103 min
genre: Adventure, Comedy, Crime, Family
director: Stephen Herek
writer: Dodie Smith (novel), John Hughes (screenplay)
actors: Glenn Close, Jeff Daniels, Joely Richardson,Joan Plowright
language: English, Spanish
country: USA, UK
awards: Nominated for 1 Golden Globe. Another 3 wins &9 n
imdb_rating: 5.700000
imdb_votes: 97,785
stderr 显示第二条记录的 awards 字段被截断:
truncating Nominated for 1 Golden Globe. Another 3 wins &9 n