如何将 CSV 文件读入结构并将其放入数组中? [重复]

问题描述 投票:0回答:1

我正在尝试读取以下格式的 CSV 文件:

imdb_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,director,writer,actors,language,country,awards,imdb_rating,imdb_votes

这是 csv


0,tt0147800,10 Things I Hate About You,"A pretty,popular teenager can't go out on a date until her ill-tempered older sister does.",movie,PG-13,1999,31 Mar 1999,"November 12, 2019",97 min,"Comedy, Drama, Romance",Gil Junger,"Karen McCullah, Kirsten Smith","Heath Ledger, Julia Stiles, Joseph Gordon-Levitt, Larisa Oleynik","English, French",USA,2 wins & 13 nominations.,7.3,"283,945"
2,tt0115433,101 Dalmatians,"An evil high-fashion designer plots to steal Dalmatian puppies in order to make an extravagant fur coat, but instead creates an extravagant mess.",movie,G,1996,27 Nov 1996,"November 12, 2019",103 min,"Adventure, Comedy, Crime, Family",Stephen Herek,"Dodie Smith (novel), John Hughes (screenplay)","Glenn Close, Jeff Daniels, Joely Richardson,Joan Plowright","English, Spanish","USA, UK",Nominated for 1 Golden Globe. Another 3 wins &9 nominations.,5.7,"97,785"

并将每一列放入以下结构中:

/*
 * One movie record read from the CSV file.
 *
 * The members follow the column names of the CSV header row
 * (imdb_id, title, plot, ..., imdb_votes), in the same order.
 * MAX is not defined in this snippet; it has to be defined before this
 * typedef is compiled.
 */
typedef struct{

    char imdb_id[12];
    char title[50];
    char plot[MAX];
    char type[15];
    char rated[10];
    int year;               /* numeric column */
    char released_at[50];
    char added_at[MAX];
    char runtime[50];
    char genre[MAX];
    char director[50];
    char writer[MAX];
    char actors[MAX];
    char language[MAX];
    char country[20];
    char awards[50];
    float imdb_rating;      /* numeric column */
    char imdb_votes[MAX];   /* kept as text: the sample values look like "283,945" */


} order;

我尝试了以下代码,但对于第二行,国家/地区没有显示。

/*
 * Fragment from inside a function: `file`, `or` and `line` are declared
 * outside of this snippet. It tries to parse every CSV record with a single
 * fscanf() call and store the columns in or[line].
 */
FILE *f=fopen(file,"r");

    /* fopen() failed: report it (the message says "Empty") and bail out. */
    if ( f == NULL)
    {
        printf("Empty\n");
        return 1;
    }


    /*
     * NOTE(review): feof() only becomes true after a read has already hit
     * end-of-file, so this condition is tested before the read that would
     * detect EOF.
     */
    while (!feof(f))
    {
        /*
         * One conversion per struct member. Things to be aware of:
         *  - "%11s" stops at whitespace, not at a comma.
         *  - "%N[^,]" reads up to the next comma, so a field such as
         *    "USA, UK" (quoted, with a comma inside) is split at that comma;
         *    the country column uses %19[^,].
         *  - \"%2047[^\"]\" only matches fields that are wrapped in quotes.
         *  - The sample data rows start with a numeric index column that has
         *    no conversion in this format string.
         * The return value is stored in `read` but never checked.
         */
        int read = fscanf(f, "%11s,%49[^,],\"%2047[^\"]\",%14[^,],%9[^,],%i,%49[^,],\"%2047[^\"]\",%49[^,],\"%2047[^\"]\",%49[^,],\"%2047[^\"]\",\"%2047[^\"]\",\"%2047[^\"]\",%19[^,],%49[^,],%f,\"%2047[^\"]\"\n",
                           or[line].imdb_id,
                           or[line].title,
                           or[line].plot,
                           or[line].type,
                           or[line].rated,
                           & or[line].year,
                           or[line].released_at,
                           or[line].added_at,
                           or[line].runtime,
                           or[line].genre,
                           or[line].director,
                           or[line].writer,
                           or[line].actors,
                           or[line].language,
                           or[line].country,
                           or[line].awards,
                           & or[line].imdb_rating,
                           or[line].imdb_votes);

        /* Advances to the next array slot regardless of how many fields were converted. */
        line++;
    }

    fclose(f);

我认为问题在于这种语法 -> %19[^,]。

但我不知道该怎么做,因为在第一个 CSV 行中,国家/地区只是 USA(不带引号),但在第二行中,该字段是 "USA, UK",它带有引号并且内部包含逗号,这使后面的所有字段都发生了错位。

我只想将此 CSV 文件读入数组内的结构,以便我可以迭代每一行。

arrays c csv scanf
1个回答
1
投票

如果可能,首选现有的 csv 库。虽然下面的代码似乎有效,但我仅使用提供的输入对其进行了轻微测试。 csv 有很多变体。

读取一行并解析它。如果您知道最大行大小,那么

fgets()
是一个不错的选择,否则使用
getline()
可以根据需要自动调整行缓冲区的大小。

虽然您可以使用

sscanf()
来解析读入的行,但您必须至少有条件地处理文本字段(先检查它是否带引号,如果没有,再按不带引号的方式解析)。如果您想支持在带引号的字段中转义双引号,则必须改用其他方案(例如下面所示的更通用的解析器)。

我跳过了两个标题行。

可以考虑读取标题行,并验证它是否符合预期格式,或者根据它动态确定字段的顺序。如果您采用动态方案,那么用字段数组来表示会比结构体更自然(尽管您可以使用

offsetof()
和元数据来索引结构体)。例如:

/* Describes one CSV column: its name plus how its values are to be interpreted. */
struct field {
   char *name; // from header
   enum { STRING, INT, FLOAT } type; // selects how a value of this column is parsed
   size_t len; // if type is a string
};

/* Holds one parsed cell; only one member is valid at a time (one per `type` in struct field). */
union value {
   char *str_value;
   int int_value;
   float float_value;
};

您通过读取标头创建一个

field[cols]
,并编写一个小解释器,将类型映射到函数来解析数据(或使用函数指针而不是类型)。然后有一个
union value[cols]
数组来保存一行数据。

下面的解析器将在出现第一个错误时放弃。其他选项是跳过无效行,或者打印您能够解析的任何内容;在后一种情况下,您需要在使用之前先初始化变量

order o

#define _POSIX_C_SOURCE 200809L /* getline() was added in POSIX.1-2008 */
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX 256

/* One movie record; members follow the CSV columns in order. */
typedef struct {
    char imdb_id[12];
    char title[50];
    char plot[MAX];
    char type[15];
    char rated[10];
    int year;
    char released_at[50];
    char added_at[MAX];
    char runtime[50];
    char genre[MAX];
    char director[50];
    char writer[MAX];
    char actors[MAX];
    char language[MAX];
    char country[20];
    char awards[50];
    float imdb_rating;
    char imdb_votes[MAX];
} order;

/*
 * Every parser below takes the current position in the line and returns the
 * position just after what it consumed, or NULL on failure. A NULL input is
 * passed straight through, so calls can be chained and checked once at the
 * end.
 */

/* Consumes exactly one separator character `sep`. */
const char *field_sep(const char *l, char sep)
{
    if (!l || *l != sep)
        return NULL;
    return l + 1;
}

/*
 * Consumes the record terminator `sep`. The end of the string is accepted as
 * well, so that a final line without a trailing newline still parses.
 */
const char *record_term(const char *l, char sep)
{
    if (!l)
        return NULL;
    if (*l == '\0')
        return l;
    return field_sep(l, sep);
}

/* Parses a floating point field into *f; *f is only written on success. */
const char *float_field(const char *l, float *f)
{
    if (!l)
        return NULL;

    char *endptr;
    errno = 0;
    float value = strtof(l, &endptr);
    if (l == endptr) {
        fprintf(stderr, "float not found %.32s\n", l);
        return NULL;
    }
    /* strtof() reports overflow as +/-HUGE_VALF together with ERANGE. */
    if (value == HUGE_VALF || value == -HUGE_VALF) {
        fprintf(stderr, "float out of range %.32s\n", l);
        return NULL;
    }
    *f = value;
    return endptr;
}

/* Parses a base-10 integer field into *i; *i is only written on success. */
const char *int_field(const char *l, int *i)
{
    if (!l)
        return NULL;

    char *endptr;
    errno = 0;
    long tmp = strtol(l, &endptr, 10);
    if (l == endptr) {
        fprintf(stderr, "int not found %.32s\n", l);
        return NULL;
    }
    /* ERANGE covers platforms where long is no wider than int. */
    if (errno == ERANGE || tmp < INT_MIN || tmp > INT_MAX) {
        fprintf(stderr, "int out of range %.32s\n", l);
        return NULL;
    }
    *i = (int) tmp;
    return endptr;
}

/*
 * Copies one text field into `field` (capacity n, including the terminator).
 *
 * A field that starts with '"' runs to the next '"' (escaped quotes are not
 * supported); any other field runs to the next ',' or to the end of the
 * line. Text that does not fit is truncated and a note is written to stderr.
 * `field` is always NUL-terminated when a non-NULL value is returned.
 */
const char *text_field(const char *l, size_t n, char field[n])
{
    if (!l)
        return NULL;
    assert(n > 0);

    field[0] = '\0'; /* never hand back an unset buffer */
    if (!*l)
        return l;

    const int quoted = (*l == '"');
    const char *end;
    if (quoted) {
        l++;
        end = strchr(l, '"');
        if (!end) {
            fprintf(stderr, "end quote missing %.32s\n", l);
            return NULL;
        }
    } else {
        /* Stop before the newline so the caller can still see it. */
        end = l + strcspn(l, ",\n");
    }

    size_t len = (size_t) (end - l);
    if (len > n - 1) {
        fprintf(stderr, "truncating %.*s\n", (int) (n - 1), l);
        len = n - 1;
    }
    memcpy(field, l, len);
    field[len] = '\0';
    return end + quoted; /* step over the closing quote, if any */
}

/*
 * Reads input.csv, skips the first two lines, and prints every record.
 * Stops at the first line that fails to parse and exits with status 1.
 */
int main(void)
{
    const char *path = "input.csv";
    FILE *f = fopen(path, "r");
    if (!f) {
        perror(path);
        return 1;
    }

    char *lineptr = NULL;
    size_t cap = 0;
    int rv = 0;
    for (size_t i = 0;; i++) {
        if (getline(&lineptr, &cap, f) == -1)
            break;
        if (i < 2)
            continue; // ignore header

        order o = {0};
        const char *next = lineptr;
        next = int_field(next, &(int) {0}); // ignore leading index column
        next = field_sep(next, ',');
        next = text_field(next, 12, o.imdb_id);
        next = field_sep(next, ',');
        next = text_field(next, 50, o.title);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.plot);
        next = field_sep(next, ',');
        next = text_field(next, 15, o.type);
        next = field_sep(next, ',');
        next = text_field(next, 10, o.rated);
        next = field_sep(next, ',');
        next = int_field(next, &o.year);
        next = field_sep(next, ',');
        next = text_field(next, 50, o.released_at);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.added_at);
        next = field_sep(next, ',');
        next = text_field(next, 50, o.runtime);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.genre);
        next = field_sep(next, ',');
        next = text_field(next, 50, o.director);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.writer);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.actors);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.language);
        next = field_sep(next, ',');
        next = text_field(next, 20, o.country);
        next = field_sep(next, ',');
        next = text_field(next, 50, o.awards);
        next = field_sep(next, ',');
        next = float_field(next, &o.imdb_rating);
        next = field_sep(next, ',');
        next = text_field(next, MAX, o.imdb_votes);
        next = record_term(next, '\n');
        if (!next) {
            fprintf(stderr, "failed to parse line %.32s", lineptr);
            rv = 1;
            break;
        }

        printf(
            "imdb_id: %s\n"
            "title: %s\n"
            "plot: %s\n"
            "type: %s\n"
            "rated: %s\n"
            "year: %d\n"
            "released_at: %s\n"
            "added_at: %s\n"
            "runtime: %s\n"
            "genre: %s\n"
            "director: %s\n"
            "writer: %s\n"
            "actors: %s\n"
            "language: %s\n"
            "country: %s\n"
            "awards: %s\n"
            "imdb_rating: %f\n"
            "imdb_votes: %s\n"
            "\n",
            o.imdb_id,
            o.title,
            o.plot,
            o.type,
            o.rated,
            o.year,
            o.released_at,
            o.added_at,
            o.runtime,
            o.genre,
            o.director,
            o.writer,
            o.actors,
            o.language,
            o.country,
            o.awards,
            o.imdb_rating,
            o.imdb_votes
        );
    }

    /* getline() returns -1 for both EOF and read errors; tell them apart. */
    if (ferror(f)) {
        perror(path);
        rv = 1;
    }

    free(lineptr);
    fclose(f);
    return rv;
}
示例运行的标准输出:

imdb_id: tt0147800 title: 10 Things I Hate About You plot: A pretty,popular teenager can't go out on a date until her ill-tempered older sister does. type: movie rated: PG-13 year: 1999 released_at: 31 Mar 1999 added_at: November 12, 2019 runtime: 97 min genre: Comedy, Drama, Romance director: Gil Junger writer: Karen McCullah, Kirsten Smith actors: Heath Ledger, Julia Stiles, Joseph Gordon-Levitt, Larisa Oleynik language: English, French country: USA awards: 2 wins & 13 nominations. imdb_rating: 7.300000 imdb_votes: 283,945 imdb_id: tt0115433 title: 101 Dalmatians plot: An evil high-fashion designer plots to steal Dalmatian puppies in order to make an extravagant fur coat, but instead creates an extravagant mess. type: movie rated: G year: 1996 released_at: 27 Nov 1996 added_at: November 12, 2019 runtime: 103 min genre: Adventure, Comedy, Crime, Family director: Stephen Herek writer: Dodie Smith (novel), John Hughes (screenplay) actors: Glenn Close, Jeff Daniels, Joely Richardson,Joan Plowright language: English, Spanish country: USA, UK awards: Nominated for 1 Golden Globe. Another 3 wins &9 n imdb_rating: 5.700000 imdb_votes: 97,785
stderr 告诉您其中一项奖项记录正在被截断:

truncating Nominated for 1 Golden Globe. Another 3 wins &9 n
    
© www.soinside.com 2019 - 2024. All rights reserved.