我用 Python 编写了一个程序,想用 BERT 模型完成一个 NLP 任务。我有一个数据集和下面的代码,但是在 Colab 中运行该程序时,遇到了下面的错误:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import hazm
from hazm import Lemmatizer
from cleantext import clean
import nltk
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import os
import re
import json
import copy
import collections
from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features
import tensorflow as tf
# --- Training hyper-parameters and output location ---------------------------
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16
EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0
MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'
# Create the directory that will hold OUTPUT_PATH; exist_ok makes re-runs safe.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# --- Load the dataset ---------------------------------------------------------
# Colab-only helper: opens the browser upload dialog.
from google.colab import files
uploaded = files.upload()
# The CSV is read without a header row, so columns are addressed by integer.
data = pd.read_csv('dataset1_org.csv', header=None)
data.drop(0, inplace=True, axis=1)
data
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: a chained in-place update is not guaranteed to reach `data`
# once pandas Copy-on-Write is active.
data[1] = data[1].replace('ترس', 'ترس ')
data[1].unique()

# --- Stop words -----------------------------------------------------------------
# Explicit encoding so the read does not depend on the platform default.
# NOTE(review): assumes stopwords.txt is UTF-8 encoded - confirm.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = stopwords_file.readlines()
nltk_stopwords = [str(line).replace('\n', '') for line in stopwords]

# --- Word counts per row --------------------------------------------------------
# NOTE(review): counts are taken from column 1 here, while the cleaning step
# later in this file reads column 2 - confirm which column holds the text.
data['comment_len_by_words'] = data[1].apply(lambda t: len(hazm.word_tokenize(t)))
data
min_max_len = data["comment_len_by_words"].min(), data["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):
    """Print the share of rows whose ``col`` value lies in ``(greater_than, less_than]``.

    Args:
        data: DataFrame-like object; ``data[col].values`` must yield the lengths.
        less_than: Inclusive upper bound of the accepted range.
        greater_than: Exclusive lower bound of the accepted range.
        col: Name of the column holding the per-row lengths.

    Returns:
        None. The percentage is written to stdout with two decimals.
    """
    lengths = data[col].values
    total = len(lengths)
    in_range = sum(1 for length in lengths if greater_than < length <= less_than)
    # An empty column would otherwise raise ZeroDivisionError; report 0% instead.
    data_glt_rate = (in_range / total) * 100 if total else 0.0
    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')
# Report which share of the comments has a word count inside (3, 256].
data_gl_than(data, 256, 3)

# Word-count window: a row is kept only when minlim < count <= maxlim.
minlim, maxlim = 3, 256


def _length_or_none(len_t):
    """Return ``len_t`` when it lies in (minlim, maxlim], otherwise None."""
    if minlim < len_t <= maxlim:
        return len_t
    return None


data['comment_len_by_words'] = data['comment_len_by_words'].apply(_length_or_none)
# Out-of-window rows now hold a missing value: drop them and renumber the index.
data = data.dropna(subset=['comment_len_by_words']).reset_index(drop=True)
# Non-greedy match of anything between '<' and the next '>'. DOTALL lets a tag
# that is broken across lines (e.g. "<a\nhref=...>") be matched as well.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag removed.

    Args:
        raw_html: Text that may contain HTML/XML-style tags.

    Returns:
        The text with all tag spans deleted; the text between tags is kept.
    """
    return _HTML_TAG_RE.sub('', raw_html)
# Characters removed by cleaning(): emoji, pictographs, directional/invisible
# marks and similar symbols. Compiled once at import time instead of per call.
# NOTE(review): the \U000024C2-\U0001F251 and \U00010000-\U0010ffff ranges are
# very broad and already cover several of the narrower ranges listed here;
# they are kept exactly as in the original pattern.
_WEIRD_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    # U+200C is deliberately not listed (it was commented out in the original
    # pattern) - presumably because Persian text relies on it; confirm.
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN = re.compile(r"\s+")

# Shared hazm normalizer, created on first use so it is not rebuilt per row.
_NORMALIZER = None


def _get_normalizer():
    """Return the shared ``hazm.Normalizer``, creating it on first use."""
    global _NORMALIZER
    if _NORMALIZER is None:
        _NORMALIZER = hazm.Normalizer()
    return _NORMALIZER


def cleaning(text):
    """Normalize one raw comment string and return the cleaned text.

    Steps, in order: strip surrounding whitespace, run ``clean`` (lower-casing,
    removal of URLs / e-mails / phone numbers / currency symbols, digits
    replaced by "0"), remove HTML tags via ``cleanhtml``, apply the hazm
    normalizer, delete emoji and other symbols, remove ``#`` characters and
    collapse every whitespace run into a single space.

    ``clean`` must come from the ``clean-text`` distribution (with a hyphen).
    The similarly named ``cleantext`` distribution installs a ``clean()`` that
    does not accept ``fix_unicode`` and the other keywords used here, so the
    call fails with ``TypeError``.

    Args:
        text: The raw comment as a string.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If the imported ``clean`` does not accept the keyword
            arguments passed below (wrong package installed).
    """
    text = text.strip()

    # regular cleaning
    text = clean(
        text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    text = _get_normalizer().normalize(text)

    # removing weird patterns (emoji, pictographs, invisible marks)
    text = _WEIRD_PATTERN.sub('', text)

    # removing hashtags' '#' sign and extra spaces
    text = text.replace("#", "")
    text = _WHITESPACE_RUN.sub(" ", text)

    return text
data
# Clean the raw text of every row.
# NOTE(review): this reads column 2, whereas the traceback quoted later in this
# file shows `data[1]` on the same statement - confirm the text column.
data['cleaned_comment'] = data[2].apply(cleaning)

# Recompute the word count on the cleaned text.
data['cleaned_comment_len_by_words'] = data['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))

# Keep only rows with minlim < word count <= maxlim. Out-of-range counts must
# become None so that dropna() below removes them; the previous lambda returned
# len_t in both branches, which made the filter a no-op.
data['cleaned_comment_len_by_words'] = data['cleaned_comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
data = data.dropna(subset=['cleaned_comment_len_by_words'])
data = data.reset_index(drop=True)

data.head()
TypeError                                 Traceback (most recent call last)
<ipython-input> in <module>()
    130     return text
    131 data
--> 132 data['cleaned_comment'] = data[1].apply(cleaning)
    133
    134 # calculate the length of comments based on their words

4 frames

<ipython-input> in cleaning(text)
     82
     83     # regular cleaning
---> 84     text = clean(text,fix_unicode=True,to_ascii=False,lower=True,no_line_breaks=True,no_urls=True,no_emails=True,no_phone_numbers=True,
     85     no_numbers=False,no_digits=False,no_currency_symbols=True,no_punct=False,replace_with_url="",
     86     replace_with_email="",

TypeError: clean() got an unexpected keyword argument 'fix_unicode'
请帮我解决错误
数据集:
1,2
پрсг,@pejvak313 巴巴 巴克森 托巴尼 莫莫德莫尼
پ״,@mamadporii ཀག་ར་ར་རག་རརག་རག་。
,@ly69204411
پрсг,@kimnacri
پש״,@raziehtaheryan
哎呀,@kimiyawssa 巴克利 巴克利 巴克巴巴德 巴克巴克#巴克巴克 巴克巴克#巴克巴克
پש״,@amoyepesarshoja @bahar_sh_y ེགགག་གག་རསག་རག་
特瑞斯,@kimiyaws 🤦🏻u200d♀️ 科拉鲁纳
您没有说明这个模块是如何安装的,不过在网上搜索一下就会发现,有好几个名字类似 cleantext 的 Python 包。我猜实际情况是:您找到了 clean-text 这个包(您的代码正是按照它的接口编写的),并用 pip 之类的工具进行了安装;但您没有注意到,拼写为 cleantext(不带连字符)的包实际上是另一个不同的包。在 cleantext 这个包中,函数 clean 的定义为:
def clean(text: str, # pylint: disable=too-many-arguments, too-many-branches
clean_all: bool = True,
extra_spaces: bool = False,
stemming: bool = False,
stopwords: bool = False,
lowercase: bool = False,
numbers: bool = False,
punct: bool = False,
reg: str = '',
reg_replace: str = '',
stp_lang: str = 'english') -> str:
其中并没有您所期望的那些参数。当 Python 处理您的函数调用时,会发现 fix_unicode 这个参数不存在,于是不再继续执行,而是针对这个参数抛出错误。
要解决此问题,您应该安装名为 clean-text(带连字符)的软件包。由于这两个软件包使用相同的导入名(cleantext),我建议您先卸载 cleantext(不带连字符)软件包。例如在 Colab 中先执行 `!pip uninstall -y cleantext`,再执行 `!pip install clean-text`,然后重启运行时。