我正在尝试使用 DocumentSentimentDataset 和 DocumentSentimentDataLoader 创建简单的训练、验证和测试数据,但出现了以下错误:
/usr/local/lib/python3.8/dist-packages/pandas/_libs/parsers.pyx in pandas._libs.parsers.raise_parser_error()
ParserError: Error tokenizing data. C error: Expected 6 fields in line 25, saw 7
这是我的代码:
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader
# Tokenizer, config and weights are all fetched from the same checkpoint.
_CHECKPOINT = 'indobenchmark/indobert-base-p1'

# Load tokenizer and config; the classification head is sized from the
# label count declared on the dataset class.
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
config = BertConfig.from_pretrained(_CHECKPOINT)
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the model with the adjusted config.
model = BertForSequenceClassification.from_pretrained(_CHECKPOINT, config=config)
###
# common functions
###
def set_seed(seed: int) -> None:
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU and CUDA) with the same value.

    Args:
        seed: Value passed unchanged to each generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current one;
    # it is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
def count_param(module, trainable: bool = False) -> int:
    """Return the number of scalar parameters held by ``module``.

    Args:
        module: Object exposing ``parameters()`` whose items provide
            ``numel()`` and ``requires_grad``.
        trainable: When true, count only parameters whose
            ``requires_grad`` flag is set; otherwise count all of them.

    Returns:
        Sum of ``numel()`` over the selected parameters.
    """
    return sum(
        param.numel()
        for param in module.parameters()
        if not trainable or param.requires_grad
    )
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of
            mappings that each contain an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when the
        optimizer has no parameter groups.
    """
    # Only the first group is consulted; later groups are ignored.
    return next((group['lr'] for group in optimizer.param_groups), None)
def metrics_to_string(metric_dict) -> str:
    """Format a metrics mapping as a single space-separated string.

    Each entry is rendered as ``name:value`` with the value rounded to two
    decimal places, in the mapping's iteration order.

    Args:
        metric_dict: Mapping from metric name to a numeric value.

    Returns:
        The formatted entries joined by single spaces; an empty string for
        an empty mapping.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )
# Locations of the three SmSA document-sentiment splits.
train_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

# Keyword arguments shared by every dataset.
# NOTE(review): on_bad_lines is a pandas.read_csv option; confirm that
# DocumentSentimentDataset actually forwards it to read_csv -- if the
# constructor just swallows extra kwargs, it has no effect on the ParserError.
_dataset_kwargs = dict(lowercase=True, on_bad_lines='skip')
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, **_dataset_kwargs)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, **_dataset_kwargs)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, **_dataset_kwargs)

# Keyword arguments shared by every loader; only the training split is shuffled.
_loader_kwargs = dict(max_seq_len=512, batch_size=32, num_workers=16)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, **_loader_kwargs, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, **_loader_kwargs, shuffle=False)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, **_loader_kwargs, shuffle=False)
我该如何解决这个问题?