
问题描述 投票:0回答:1



“在数十万高中摔跤手中,只有一小部分人知道赢得州冠军是什么感觉。{{Elided}} 就是这个比例的一部分。里士满的少年通过赢得……加入了这个群体。Premium 内容仅供订阅者使用。请在此处登录以访问内容,或前往此处购买订阅。”


import re
import string
import nltk
from nltk.corpus import stopwords

# function to detect paywall-related text
def detect_paywall(text):
    """Return True when *text* mentions any known paywall keyword.

    Each keyword is looked up as a whole word or phrase (word boundaries on
    both sides) and the comparison ignores case. An input with no keyword,
    including the empty string, yields False.
    """
    paywall_terms = ("login", "subscription", "purchase a subscription", "subscribers")
    return any(
        re.search(r'\b{}\b'.format(term), text, flags=re.IGNORECASE)
        for term in paywall_terms
    )

# function for text preprocessing
def preprocess_text(text):
    """Clean *text*, taking one of two paths depending on paywall content.

    * If ``detect_paywall(text)`` is true, the text is split into sentences,
      every sentence that itself triggers ``detect_paywall`` is discarded,
      and the survivors are joined with single spaces and stripped. The
      function returns at that point: no lowercasing, punctuation removal
      or stopword filtering is applied on this path.
    * Otherwise the text is word-tokenized, lowercased, stripped of
      punctuation characters, and reduced to purely alphabetic tokens that
      are not English stopwords, joined with single spaces.
    """
    if detect_paywall(text):
        kept_sentences = []
        for candidate in nltk.sent_tokenize(text):
            if detect_paywall(candidate):
                continue  # sentence carries paywall wording; drop it
            kept_sentences.append(candidate)
        return ' '.join(kept_sentences).strip()

    # No paywall wording: normalise token by token.
    raw_tokens = nltk.word_tokenize(text)
    punctuation_remover = str.maketrans('', '', string.punctuation)
    english_stopwords = set(stopwords.words('english'))

    normalised = []
    for raw in raw_tokens:
        bare = raw.lower().translate(punctuation_remover)
        # Keep only alphabetic, non-stopword tokens.
        if bare.isalpha() and bare not in english_stopwords:
            normalised.append(bare)
    return ' '.join(normalised)



在数十万高中摔跤运动员中,只有一小部分人知道赢得州冠军是什么感觉。 {{Elided}} 是该百分比的一部分。里士满的这名高中生通过赢得……加入了该群体。优质内容仅向订阅者开放。请登录此处访问内容或前往此处购买订阅。

python pandas nlp nltk


import re

# Keywords whose presence marks a phrase as paywall boilerplate.
paywall_keywords = ["login", "subscription", "purchase a subscription", "subscribers"]

# Sample article excerpt that ends with paywall boilerplate.
text = "Of the hundreds of thousands of high school wrestlers, only a small percentage know what it’s like to win a state title. {{Elided}} is part of that percentage. The Richmond junior joined that group by winning… Premium Content is available to subscribers only. Please login here to access content or go here to purchase a subscription."

# One alternative per keyword, each prefixed with '.*' so that match()
# (which anchors at the start of the phrase) finds the keyword anywhere in it.
alternatives = [f'.*{keyword}' for keyword in paywall_keywords]
patt = re.compile('|'.join(alternatives))

'.*login|.*subscription|.*purchase a subscription|.*subscribers'


# Break the text into phrases at every full stop (the stops themselves are dropped).
phrases = text.split('.')

['Of the hundreds of thousands of high school wrestlers, only a small percentage know what it’s like to win a state title',
 ' {{Elided}} is part of that percentage',
 ' The Richmond junior joined that group by winning… Premium Content is available to subscribers only',
 ' Please login here to access content or go here to purchase a subscription',
 '']


# Phrases in which the paywall pattern matches.
found = [phrase for phrase in phrases if patt.match(phrase)]

[' The Richmond junior joined that group by winning… Premium Content is available to subscribers only',
 ' Please login here to access content or go here to purchase a subscription']


# Re-assemble the text from the phrases that were not flagged as paywall content.
'.'.join(phrase for phrase in phrases if phrase not in found)

'Of the hundreds of thousands of high school wrestlers, only a small percentage know what it’s like to win a state title. {{Elided}} is part of that percentage.'
© www.soinside.com 2019 - 2024. All rights reserved.