How to fix the Python error "clean() got an unexpected keyword argument 'fix_unicode'"


I wrote a program in Python in which I want to do NLP with BERT. I have a dataset and the code below, but when I run the program in Colab I get the error shown after the code.

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle
from bs4 import BeautifulSoup
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import hazm
from hazm import Lemmatizer
from cleantext import clean
import nltk
import plotly.graph_objects as go
from tqdm.notebook import tqdm
import os
import re
import json
import copy
import collections

from transformers import BertConfig, BertTokenizer
from transformers import TFBertModel, TFBertForSequenceClassification
from transformers import glue_convert_examples_to_features

import tensorflow as tf
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 3
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'

os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
from google.colab import files
uploaded = files.upload()
data = pd.read_csv('dataset1_org.csv',header=None)
data.drop(0,inplace=True,axis=1)
data
data[1].replace('ترس','ترس ',inplace=True)
data[1].unique()
with open('stopwords.txt') as stopwords_file:
   stopwords = stopwords_file.readlines()
nltk_stopwords = [str(line).replace('\n', '') for line in stopwords]
data['comment_len_by_words'] = data[1].apply(lambda t: len(hazm.word_tokenize(t)))
data
min_max_len = data["comment_len_by_words"].min(), data["comment_len_by_words"].max()
print(f'Min: {min_max_len[0]} \tMax: {min_max_len[1]}')
def data_gl_than(data, less_than=100.0, greater_than=0.0, col='comment_len_by_words'):
    data_length = data[col].values

    data_glt = sum([1 for length in data_length if greater_than < length <= less_than])

    data_glt_rate = (data_glt / len(data_length)) * 100

    print(f'Texts with word length of greater than {greater_than} and less than {less_than} includes {data_glt_rate:.2f}% of the whole!')
data_gl_than(data, 256, 3)

# remove comments with the length of fewer than three words
minlim, maxlim = 3, 256

data['comment_len_by_words'] = data['comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
data = data.dropna(subset=['comment_len_by_words'])
data = data.reset_index(drop=True)

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

def cleaning(text):
    text = text.strip()
    
    # regular cleaning
    text = clean(text,fix_unicode=True,to_ascii=False,lower=True,no_line_breaks=True,no_urls=True,no_emails=True,no_phone_numbers=True,
        no_numbers=False,no_digits=False,no_currency_symbols=True,no_punct=False,replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # cleaning htmls
    text = cleanhtml(text)
    
    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    # removing weird patterns
    weird_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u'\U00010000-\U0010ffff'
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    
    text = weird_pattern.sub(r'', text)
    
    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    text = re.sub("\s+", " ", text)
    
    return text
    data
data['cleaned_comment'] = data[2].apply(cleaning)

# calculate the length of comments based on their words
data['cleaned_comment_len_by_words'] = data['cleaned_comment'].apply(lambda t: len(hazm.word_tokenize(t)))

# remove comments with the length of fewer than three words
data['cleaned_comment_len_by_words'] = data['cleaned_comment_len_by_words'].apply(lambda len_t: len_t if minlim < len_t <= maxlim else None)
data = data.dropna(subset=['cleaned_comment_len_by_words'])
data = data.reset_index(drop=True)
data.head()

TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>()
    130     return text
    131     data
--> 132 data['cleaned_comment'] = data[1].apply(cleaning)
    133 
    134 # # calculate the length of comments based on their words

4 frames

<ipython-input-...> in cleaning(text)
     82 
     83     # regular cleaning
---> 84     text = clean(text,fix_unicode=True,to_ascii=False,lower=True,no_line_breaks=True,no_urls=True,no_emails=True,no_phone_numbers=True,
     85         no_numbers=False,no_digits=False,no_currency_symbols=True,no_punct=False,replace_with_url="",
     86         replace_with_email="",

TypeError: clean() got an unexpected keyword argument 'fix_unicode'

Please help me fix this error.

Dataset sample: a two-column CSV where the first column is an emotion label such as ترس and the second is a Persian comment (Twitter-style replies containing @mentions such as @pejvak313 and @kimiyaws, hashtags, and emoji like 🤦🏻‍♀️).


Since I can't leave a comment to ask for more information, I had to figure some things out on my own, although I am confident my answer will solve the problem.
python nlp runtime-error bert-language-model transformer-model
1 Answer

You didn't say where you installed the module from, but a quick search shows that there are several Python packages named cleantext. I believe what happened is that you found this package and installed it with a tool such as pip, without realizing that the package spelled cleantext is not actually the one you want. In that package the function clean is defined as

def clean(text: str,  # pylint: disable=too-many-arguments, too-many-branches
          clean_all: bool = True, extra_spaces: bool = False, stemming: bool = False,
          stopwords: bool = False, lowercase: bool = False, numbers: bool = False,
          punct: bool = False, reg: str = '', reg_replace: str = '',
          stp_lang: str = 'english') -> str:
which has none of the parameters you are passing. When Python resolves your call, it sees that fix_unicode is not an accepted keyword and, instead of executing the function, raises an error about that specific argument.
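The failure mode is the same for any Python function; here is a minimal, made-up example (the function below is hypothetical, not from either package) just to show where the error comes from:

def shout(text, uppercase=True):
    # this function only accepts the keyword 'uppercase'
    return text.upper() if uppercase else text

shout("hello", fix_unicode=True)
# TypeError: shout() got an unexpected keyword argument 'fix_unicode'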

To fix the issue, you should instead install the package named clean-text (with a hyphen).
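In a Colab notebook that is a single cell; the name on PyPI really does contain the hyphen, and as far as I know the optional clean-text[gpl] extra is only needed if you want to_ascii=True backed by unidecode:

!pip install clean-text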

Since both packages are imported under the same name, I also recommend uninstalling the cleantext (no hyphen) package so the wrong clean() cannot shadow the one you want.
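Roughly, the cleanup plus a quick sanity check would look like this (after uninstalling you may need to restart the Colab runtime so the stale module is dropped; the sample string is arbitrary):

!pip uninstall -y cleantext

# after restarting the runtime, the call that previously failed should accept fix_unicode
from cleantext import clean
print(clean("Héllo\u200d   wörld!", fix_unicode=True, to_ascii=False, lower=True, no_line_breaks=True))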
