将 csv 中的字符串读取到 c 中的结构中

问题描述 投票:0回答:1

我想读取具有此列名称的此类 csv 文件。

imdb_id,movie_id,title,plot,type,rated,year,released_at,added_at,runtime,genre,director,writer,actors,language,country,awards,imdb_rating,imdb_votes

这是行。

0,tt0147800,10 Things I Hate About You,"A pretty, popular teenager can't go out on a date until her ill-tempered older sister does.",movie,PG-13,1999,31 Mar 1999,"November 12, 2019",97 min,"Comedy, Drama, Romance",Gil Junger,"Karen McCullah, Kirsten Smith","Heath Ledger, Julia Stiles, Joseph Gordon-Levitt, Larisa Oleynik","English, French",USA,2 wins & 13 nominations.,7.3,"283,945"
2,tt0115433,101 Dalmatians,"An evil high-fashion designer plots to steal Dalmatian puppies in order to make an extravagant fur coat, but instead creates an extravagant mess.",movie,G,1996,27 Nov 1996,"November 12, 2019",103 min,"Adventure, Comedy, Crime, Family",Stephen Herek,"Dodie Smith (novel), John Hughes (screenplay)","Glenn Close, Jeff Daniels, Joely Richardson, Joan Plowright","English, Spanish","USA, UK",Nominated for 1 Golden Globe. Another 3 wins & 9 nominations.,5.7,"97,785"

这是我用来读取该文件的全部代码。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <string.h>

/* Capacity (number of elements) of the global `or` table declared below. */
#define MAX 2048

/* Forward declaration of the local strdup() defined at the end of this file. */
char *strdup(const char *str);

typedef struct{

    int imdb_id;
    char *movie_id;
    char *title;
    char *plot;
    char *type;
    char *rated;
    int year;
    char *released_at;
    char *added_at;
    char *runtime;
    char *genre;
    char *director;
    char *writer;
    char *coma;
    char *actors;
    char *coma2;
    char *language;
    char *coma3;
    char *country;
    char *awards;
    float imdb_rating;
    char *imdb_votes;


} order;

/* Global table of parsed records; read_txt() appends to it. */
order or[MAX];

/* Number of entries of `or` currently in use. */
int leni;

/* Parse the CSV file named `file` into `or`; returns 0 on success, 1 on failure. */
int read_txt(char *file);

/* NOTE(review): declared but no definition is visible in this file. */
void getData(char *buff);

/*
 * Entry point: expects the path of the CSV file as the first argument.
 *
 * Returns 1 when the argument is missing, otherwise whatever read_txt()
 * reports, so a failed read is visible in the process exit status.
 */
int main(int argc,char *argv[]){

    if (argc < 2)
    {
        /* Diagnostics belong on stderr so they do not mix with the data output. */
        fprintf(stderr, "No hay suficiente argumentos\n");
        return 1;
    }

    return read_txt(argv[1]);

}

int read_txt(char *file){


    FILE *f=fopen(file,"r");

    if ( f == NULL)
    {
        printf("El archivo esta vacio\n");
        return 1;
    }

    order* o;
    char buff[1024];
    char* delimiter = ",\"";
    while (fgets(buff, MAX, f) != NULL)
    {
        o = (order *)malloc(sizeof(order));

        o->imdb_id = atoi(strtok(buff, ",\""));
        o->movie_id = strdup(strtok(NULL, delimiter));
        o->title = strdup(strtok(NULL, delimiter));
        o->plot = strdup(strtok(NULL, "\""));
        o->type = strdup(strtok(NULL, delimiter));
        o->rated = strdup(strtok(NULL, delimiter));
        o->year = atoi(strtok(NULL, delimiter));
        o->released_at=strdup(strtok(NULL, delimiter));
        o->added_at=strdup(strtok(NULL, "\""));
        o->runtime=strdup(strtok(NULL, delimiter));
        o->genre=strdup(strtok(NULL, "\""));
        o->director=strdup(strtok(NULL, delimiter));
        o->writer=strdup(strtok(NULL, "\""));
        o->coma=strdup(strtok(NULL, "\""));
        o->actors=strdup(strtok(NULL, "\""));
        o->coma2=strdup(strtok(NULL, "\""));
        o->language=strdup(strtok(NULL, "\""));
        o->country=strdup(strtok(NULL, delimiter));

        or[leni++] = *o;
     }
     for ( int l = 0;l<leni;l++)
        {
            printf("%s\n", or[l].country);
        }

     fclose(f);

     return 0;

}


/*
 * Return a heap-allocated copy of the NUL-terminated string `str`, or NULL
 * when the allocation fails.  The caller releases the copy with free().
 */
char *strdup(const char *str)
{
    const size_t bytes = strlen(str) + 1;   /* text plus the terminator */
    char *copy = malloc(bytes);

    if (copy == NULL)
        return NULL;

    /* Source and fresh allocation cannot overlap. */
    memcpy(copy, str, bytes);
    return copy;
}

我的问题是我需要一种方法来正确读取 country(国家/地区)这一列。

当我打印这一列时,这是我的输出。

USA
USA

本专栏的预期输出是

USA
USA, UK

问题出在分隔符部分,但我不知道如何在 strtok 中放置一个分隔符来改变两行。

第一行中 country 列的值是 USA,但第二行中该列的值是 "USA, UK"。

引号就是问题所在。

c csv struct output
1个回答
0
投票
  1. 不要在数据结构

    order
    中存储分隔符(即
    coma
    这类成员)。

  2. 避免全局变量。它使您的程序更难测试。

  3. fgets()
    不是读取 csv 记录的理想选择,因为转义字段(即双引号中的字段)可能包含嵌入的
    \n
    字符。

  4. 如果您输入的行包含超过 MAX 个字符,则

    fgets()
    将返回截断的记录。也许检查一下?

  5. 在逗号分隔文件 (CSV) 中,字段之间的分隔符是

    ,
    而不是
    ,"

  6. 正如您所发现的,转义字段可能包含“,”。这意味着您根本无法使用

    strtok()
    。相反,对于转义字段,您需要找到与之匹配的结束双引号(跳过任何连续的两个双引号,它们表示一个被转义的引号),其后必须紧跟字段分隔符 ',' 或记录终止符 "\r\n";对于最后一条记录,该终止符是可选的。

  7. MAX
    对于符号常量来说并不是一个好名字,因为它不传达任何含义。它本应表示最大订单数,但您又错误地把它当作最大行长度使用,而实际的
    buff
    变量被硬编码为 1024。这将导致任何长度超过 1024 的行出现缓冲区溢出。要么引入另一个符号常量,例如
    MAX_LINE
    ,要么(也许更好)在
    fgets()
    调用中改用
    sizeof buff
    。

  8. 不要从

    malloc()
    投射 void 指针,因为它可能会隐藏问题。

  9. 您需要在

    #include <string.h>
    之前写上
    #define _POSIX_C_SOURCE 200809L
    (或更新的版本值),才能得到
    strdup()
    的声明
    。也不要实施
    strdup()
    。如果无法在作业中使用标准函数,请使用不同的名称,例如,
    my_strdup()

  10. (不固定)使用

    strtol()
    代替
    atoi()
    ,因为后者不允许错误检查。

  11. (未修复)

    strtol()
    允许通过检查第二个参数中返回的内容来进行错误检查。

我在下面基于 RFC 4180 编写了一些严格的 csv 解析器。它已经过最低限度的测试。对您来说最有趣的部分可能是函数

parse_field()
。它使用
realloc()
根据需要调整大小,而不是固定数组。

CSV 文件不允许在任何地方出现 '\0',因此您可以在

read_file()
中检查该内容,然后仅返回常规字符串,而不是字节数组及其大小。
read_file()
是通用函数,
parse_field()
是所有 CSV 特定处理发生的地方。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Number of bytes read_file() asks fread() for on each iteration. */
#define BUFF_SIZE 1024

/*
 * One movie record; members follow the 19 CSV columns in order.
 * The char* members hold heap-allocated strings that main() releases with
 * free(); the numeric members are converted from the column text.
 */
typedef struct order{
    int imdb_id;
    char *movie_id;
    char *title;
    char *plot;
    char *type;
    char *rated;
    int year;
    char *released_at;
    char *added_at;
    char *runtime;
    char *genre;
    char *director;
    char *writer;
    char *actors;
    char *language;
    char *country;
    char *awards;
    float imdb_rating;
    char *imdb_votes;   /* kept as text: the value contains a thousands separator */
} order;

/*
 * Read the whole of `file` into a newly allocated buffer.
 *
 * On success returns 0, stores the buffer in *buff and the number of bytes
 * read in *buff_size; the buffer additionally carries a terminating '\0' at
 * index *buff_size (not counted in the size).  The caller frees *buff.
 *
 * On failure returns 1 with *buff == NULL and *buff_size == 0 after printing
 * a diagnostic with perror().
 */
int read_file(const char *file, char **buff, size_t *buff_size) {
    *buff = NULL;
    *buff_size = 0;
    /* Binary mode: the callers match "\r\n" literally, so no newline
     * translation may happen while reading. */
    FILE *f = fopen(file, "rb");
    if(!f) {
        perror(file);
        goto err;
    }
    for(;;) {
        /* Room for one more chunk plus the terminator written below. */
        char *tmp = realloc(*buff, *buff_size + BUFF_SIZE + 1);
        if(!tmp) {
            perror("realloc");
            goto err;
        }
        *buff = tmp;
        size_t n = fread(*buff + *buff_size, 1, BUFF_SIZE, f);
        *buff_size += n;
        if(n < BUFF_SIZE) break;
    }
    /* A short read means end of file or an error; tell them apart. */
    if(ferror(f)) {
        perror(file);
        goto err;
    }
    (*buff)[*buff_size] = '\0';
    /* Give back the unused tail; keeping the larger block is harmless. */
    char *tmp = realloc(*buff, *buff_size + 1);
    if(tmp) *buff = tmp;
    fclose(f);
    return 0;
err:
    free(*buff);
    *buff = NULL;
    *buff_size = 0;
    if(f) fclose(f);
    return 1;
}

/* True for the bytes RFC 4180 calls TEXTDATA: 0x20-0x21, 0x23-0x2B, 0x2D-0x7E. */
static int is_textdata(char c) {
    return (c >= 0x20 && c <= 0x21) ||
           (c >= 0x23 && c <= 0x2b) ||
           (c >= 0x2d && c <= 0x7e);
}

/*
 * Parse one CSV field at the start of `csv` (which holds `csv_len` bytes and
 * need not be NUL-terminated).
 *
 * Quoted field: starts with '"' and ends at the first '"' that is not the
 * first half of a doubled quote.  Every other byte between the quotes is
 * taken as data; each "" pair yields a single '"'.
 * Unquoted field: the longest run of TEXTDATA bytes; it ends at the first
 * other byte or at the end of the input.
 *
 * On success stores a malloc'ed, NUL-terminated copy of the field value in
 * *field (the caller frees it) and returns the number of input bytes
 * consumed, including the surrounding quotes of a quoted field.
 *
 * Returns 0, leaving *field unallocated, when the input is empty, a quoted
 * field has no closing quote, an unquoted field is empty, or malloc fails.
 * Because 0 is the failure value, an empty unquoted field cannot be reported
 * as a success; an empty quoted field ("") can.
 */
size_t parse_field(const char *csv, size_t csv_len, char **field) {
    if(!csv_len) return 0;

    if(*csv == '"') {
        size_t escaped = 0;     /* number of "" pairs seen so far */
        for(size_t offset = 1; offset < csv_len; offset++) {
            if(csv[offset] != '"') continue;

            /* Look ahead only when a next byte really exists. */
            if(offset + 1 < csv_len && csv[offset + 1] == '"') {
                escaped++;
                offset++;       /* skip the second quote of the pair */
                continue;
            }

            /* csv[offset] is the closing quote. */
            size_t len = offset - 1 - escaped;
            char *out = malloc(len + 1);
            if(!out) {
                fprintf(stderr, "malloc failed\n");
                return 0;
            }
            size_t i = 0;
            for(size_t j = 1; j < offset; j++) {
                out[i++] = csv[j];
                /* Inside the quotes every '"' starts a pair: drop its twin. */
                if(csv[j] == '"') j++;
            }
            out[len] = '\0';
            *field = out;
            return offset + 1;
        }
        return 0;               /* no closing quote */
    }

    size_t end = 0;
    while(end < csv_len && is_textdata(csv[end])) end++;
    if(!end) return 0;

    char *out = malloc(end + 1);
    if(!out) {
        fprintf(stderr, "malloc failed\n");
        return 0;
    }
    memcpy(out, csv, end);
    out[end] = '\0';
    *field = out;
    return end;
}

/*
 * Check whether the input `s` (of `s_len` bytes) begins with the byte
 * sequence `expect` (of `expect_len` bytes).
 *
 * Returns `expect_len` when it does, 0 when it does not or when the input is
 * shorter than the expected sequence.
 */
size_t parse_str(const char *s, size_t s_len, const char *expect, size_t expect_len) {
    /* Not enough input left to hold the expected token. */
    if(expect_len > s_len) {
        return 0;
    }
    if(memcmp(s, expect, expect_len) != 0) {
        return 0;
    }
    return expect_len;
}

/*
 * Parse one CSV record: one or more fields separated by ','.
 *
 * Parsing starts at `csv` (holding `csv_len` bytes) and stops after the last
 * field that is not followed by a comma; the record terminator itself is not
 * consumed.
 *
 * On success returns the number of bytes consumed, stores a malloc'ed array
 * of malloc'ed field strings in *record and the field count in *record_len.
 * The caller frees every string and the array.
 *
 * On failure returns 0 with *record == NULL and *record_len == 0, so the
 * caller can run its normal cleanup loop without touching a NULL array.
 */
size_t parse_record(const char *csv, size_t csv_len, char ***record, size_t *record_len) {
    *record = NULL;
    *record_len = 0;
    size_t offset = 0;
    for(;;) {
        char *field = NULL;
        size_t offset2 = parse_field(csv + offset, csv_len - offset, &field);
        if(!offset2) {
            /* Release anything parse_field may have allocated before failing. */
            free(field);
            goto err;
        }
        offset += offset2;
        /* The array elements are char *, i.e. the type of **record. */
        char **tmp = realloc(*record, sizeof **record * (*record_len + 1));
        if(!tmp) {
            perror("realloc");
            free(field);
            goto err;
        }
        *record = tmp;
        (*record)[(*record_len)++] = field;

        offset2 = parse_str(csv + offset, csv_len - offset,  ",", 1);
        if(!offset2) break;

        offset += offset2;
    }
    return offset;
err:
    for(size_t i = 0; i < *record_len; i++)
        free((*record)[i]);
    free(*record);
    *record = NULL;
    /* Keep the count consistent with the (now absent) array. */
    *record_len = 0;
    return 0;
}

/*
 * Entry point: load the CSV file named by argv[1], parse the header and every
 * record into an array of `order`, and print the country of each record.
 *
 * The header and every record must have exactly 19 fields, and records are
 * separated by "\r\n"; the terminator after the last record is optional.
 *
 * Returns EXIT_SUCCESS when the whole file was parsed, EXIT_FAILURE on any
 * read, parse or allocation error, and 1 when the file argument is missing.
 */
int main(int argc,char *argv[]) {
    enum { FIELD_COUNT = 19 };

    if (argc < 2) {
        fprintf(stderr, "No hay suficiente argumentos\n");
        return 1;
    }

    int rv = EXIT_FAILURE;
    char *csv = NULL;
    char **header = NULL;
    size_t header_len = 0;
    order *orders = NULL;
    size_t orders_len = 0;

    size_t csv_len;
    if(read_file(argv[1], &csv, &csv_len)) goto out;

    size_t offset = parse_record(csv, csv_len, &header, &header_len);
    if(!offset) {
        fprintf(stderr, "could not parse header\n");
        goto out;
    }

    if(header_len != FIELD_COUNT) {
        fprintf(stderr, "header should have 19 fields\n");
        goto out;
    }

    size_t offset2 = parse_str(csv + offset, csv_len - offset, "\r\n", 2);
    if(!offset2) {
        fprintf(stderr, "could not parse crnl after header\n");
        goto out;
    }
    offset += offset2;

    while(offset < csv_len) {
        char **record = NULL;
        size_t record_len = 0;
        offset2 = parse_record(csv + offset, csv_len - offset, &record, &record_len);
        if(!offset2) {
            /* Nothing was handed over on failure, so skip the record cleanup. */
            fprintf(stderr, "could not parse record\n");
            goto out;
        }
        offset += offset2;

        if(record_len != FIELD_COUNT) {
            fprintf(stderr, "record should have 19 fields\n");
            goto err;
        }

        order *tmp = realloc(orders, sizeof *orders * (orders_len + 1));
        if(!tmp) {
            fprintf(stderr, "realloc\n");
            goto err;
        }
        orders = tmp;
        /* The string fields change owner: from `record` to the new order. */
        orders[orders_len++] = (order) {
            .imdb_id     = atoi(record[0]),
            .movie_id    = record[1],
            .title       = record[2],
            .plot        = record[3],
            .type        = record[4],
            .rated       = record[5],
            .year        = atoi(record[6]),
            .released_at = record[7],
            .added_at    = record[8],
            .runtime     = record[9],
            .genre       = record[10],
            .director    = record[11],
            .writer      = record[12],
            .actors      = record[13],
            .language    = record[14],
            .country     = record[15],
            .awards      = record[16],
            .imdb_rating = strtof(record[17], NULL),
            .imdb_votes  = record[18]
        };
        /* Only the numeric columns were converted, so their text is dropped here. */
        free(record[0]);
        free(record[6]);
        free(record[17]);
        free(record);

        /* The terminator is optional after the very last record. */
        if(offset == csv_len) break;

        offset2 = parse_str(csv + offset, csv_len - offset, "\r\n", 2);
        if(!offset2) {
            fprintf(stderr, "could not parse crnl after record\n");
            goto out;
        }
        offset += offset2;

        continue;
err:
        for(size_t i = 0; i < record_len; i++)
            free(record[i]);
        free(record);
        goto out;
    }

    for(size_t i = 0; i < orders_len; i++)
        printf("%s\n", orders[i].country);

    rv = EXIT_SUCCESS;
out:
    if(orders) {
        for(size_t i = 0; i < orders_len; i++) {
            free(orders[i].movie_id);
            free(orders[i].title);
            free(orders[i].plot);
            free(orders[i].type);
            free(orders[i].rated);
            free(orders[i].released_at);
            free(orders[i].added_at);
            free(orders[i].runtime);
            free(orders[i].genre);
            free(orders[i].director);
            free(orders[i].writer);
            free(orders[i].actors);
            free(orders[i].language);
            free(orders[i].country);
            free(orders[i].awards);
            free(orders[i].imdb_votes);
        }
        free(orders);
    }
    if(header) {
        for(size_t i = 0; i < header_len; i++)
            free(header[i]);
        free(header);
    }
    free(csv);
    return rv;
}

运行示例:

USA
USA, UK
© www.soinside.com 2019 - 2024. All rights reserved.