我有一个 Python 字符串,想从中删除其中包含的 base64 字符串。我读过 base64 的规范,也在 Stack Overflow 上搜索过,但似乎找不到一种干净的方法来删除它们。我尝试过一些取巧的正则表达式,结果反而把字符串弄得更糟,例如单词
problem
被变成了 lem
:
def remove_base64_strings(text: str, min_length: int = 24) -> str:
    """
    Remove Base64-encoded tokens from a string.

    A token is removed only if it is a whole run of Base64 characters
    (never a fragment of a longer word), is made of complete 4-character
    groups with valid ``=`` padding, and is at least ``min_length``
    characters long.

    Args:
        text: The input text.
        min_length: Minimum token length (padding included) for a token to
            be treated as Base64. Shorter tokens are kept because ordinary
            4- or 8-letter words are also well-formed Base64.

    Returns:
        The text with every matching token replaced by an empty string.
        Whitespace around a removed token is left untouched.
    """
    # Imported locally so the function does not depend on a module-level
    # import being present above it.
    import re

    b64_char = "[A-Za-z0-9+/]"
    boundary = "[A-Za-z0-9+/=]"
    base64_pattern = (
        # Do not start in the middle of a Base64-looking run; without this
        # guard "prob" is cut out of "problem".
        rf"(?<!{boundary})"
        # Zero or more full 4-character groups ...
        rf"(?:{b64_char}{{4}})*"
        # ... followed by one mandatory final group (plain or padded), so
        # the pattern can never match the empty string.
        rf"(?:{b64_char}{{4}}|{b64_char}{{3}}=|{b64_char}{{2}}==)"
        # Do not stop in the middle of a run or before stray padding.
        rf"(?!{boundary})"
    )

    def _drop_if_long_enough(match):
        token = match.group(0)
        return "" if len(token) >= min_length else token

    return re.sub(base64_pattern, _drop_if_long_enough, text)
或
import re
# Fully anchored pattern: because of ^ and $, it only matches when the whole
# of the searched string is one well-formed Base64 value, so it cannot locate
# a Base64 token embedded inside a longer sentence.
base64_regex = r'^([A-Za-z0-9+/]{4})*([A-Za-z0-9+/]{4}|[A-Za-z0-9+/]{3}=|[A-Za-z0-9+/]{2}==)$'
# The pattern has two capturing groups, so findall returns a list of
# (group 1, group 2) tuples rather than the full matched strings.
# NOTE(review): `text` is not defined in this snippet; it must already exist
# in the surrounding scope.
base64_strings = re.findall(base64_regex, text)
有没有一种方法可以删除 Base64 字符串?
我正在考虑用空格分割单词。然后找到一个与上述模式匹配且长度为 12 个字符的字符串,因为 base64 字符串看起来像随机长字符串,我想确定删除它们。
我试过这个:
def remove_base64_words(text: str, threshold_length: int = 24) -> str:
    """
    Remove whitespace-separated words that are suspected Base64 strings.

    A word is dropped when all of the following hold:

    * it is at least ``threshold_length`` characters long,
    * its length is a multiple of 4,
    * it is well-formed Base64 (valid alphabet and padding), and
    * it is not found in the NLTK English word list.

    Args:
        text: The input text.
        threshold_length: Minimum length of a word for it to be considered
            a Base64 encoded string.

    Returns:
        The remaining words joined by single spaces. Because the text is
        split on whitespace and re-joined, runs of whitespace in the input
        are collapsed and leading/trailing whitespace is dropped.
    """
    kept_words = [
        word
        for word in text.split()
        if not _is_suspected_base64(word, threshold_length)
    ]
    return ' '.join(kept_words)


def _is_suspected_base64(word, threshold_length):
    """Return True if *word* meets every criterion for removal."""
    if len(word) < threshold_length or len(word) % 4 != 0:
        return False
    # Long tokens such as URLs or file paths are not Base64 and must survive.
    if not _is_well_formed_base64(word):
        return False
    # A token containing digits, '+', '/' or '=' is assumed not to be a
    # dictionary entry, so the (expensive) word list is only consulted for
    # purely alphabetic tokens.
    if not word.isalpha():
        return True
    return word.lower() not in _english_words()


def _is_well_formed_base64(word):
    """Return True if *word* decodes under strict Base64 validation."""
    import base64

    try:
        base64.b64decode(word, validate=True)
    except ValueError:  # binascii.Error is a subclass of ValueError
        return False
    return True


# Lazily-built cache of the English word list; filled by _english_words().
_ENGLISH_WORDS = None


def _english_words():
    """Return the NLTK English word list as a set, loading it at most once."""
    global _ENGLISH_WORDS
    if _ENGLISH_WORDS is None:
        import nltk
        from nltk.corpus import words

        try:
            vocabulary = words.words()
        except LookupError:
            # Corpus not installed yet: download it once instead of calling
            # nltk.download on every invocation.
            nltk.download('words')
            vocabulary = words.words()
        _ENGLISH_WORDS = frozenset(vocabulary)
    return _ENGLISH_WORDS
单元测试:
# base64
# Unit tests
# Each entry pairs an input sentence with the output the author expects.
test_sentences = [
    ("This is a test with no base64", "This is a test with no base64"),
    ("Base64 example: TWFuIGlzIGRpc3Rpbmd1aXNoZWQ=", "Base64 example: "),
    ("Short== but not base64", "Short== but not base64"),
    ("ValidBase64== but too short", "ValidBase64== but too short"),
    ("Mixed example with TWFuIGlzIGRpc3Rpbmd1aXNoZWQ= base64", "Mixed example with base64"),
]

# Run every case through remove_base64_words and print the actual result next
# to the expected one for manual comparison; nothing is asserted here.
for case in test_sentences:
    input_sentence, expected_output = case
    our_output: str = remove_base64_words(input_sentence)
    print(f'Trying to remove Base64: {input_sentence=} --> {our_output=} {expected_output=}')
# print(f'Trying to remove Base64: {input_sentence=} {expected_output=}')
可以使用 base64.b64decode
来验证匹配到的字符串是否为有效的 Base64 编码。您还应该添加否定环视(negative lookaround)模式,以确保匹配项的前后没有紧邻其他有效的 base64 字符:
import re
from base64 import b64decode
# Character class for one symbol of the standard Base64 alphabet (no padding).
b64_char = '[A-Za-z0-9+/]'
# Pattern for one complete Base64 token, read left to right:
#   (?<!c)        the match must not be preceded by a Base64 character
#   (?=c)         the match must begin with a Base64 character, which rules
#                 out empty matches
#   (?:c{4})*     any number of full 4-character groups
#   (?:c{3}=|c{2}==)?   an optional final group padded with '=' or '=='
#   (?!c)         the match must not be followed by a Base64 character
b64_pattern = re.compile(rf'(?<!{b64_char})(?={b64_char})(?:{b64_char}{{4}})*(?:{b64_char}{{3}}=|{b64_char}{{2}}==)?(?!{b64_char})')
def remove_b64(s):
    """Return '' when *s* decodes as Base64, otherwise return *s* unchanged."""
    decodable = True
    try:
        # Only success or failure of the decode matters; the bytes are discarded.
        b64decode(s.encode())
    except ValueError:
        decodable = False
    return '' if decodable else s
def remove_base64_strings(text: str, min_length: int = 24) -> str:
    """
    Remove Base64 tokens from *text*.

    Every match of ``b64_pattern`` that is at least ``min_length``
    characters long is passed to ``remove_b64``, which returns an empty
    string for decodable input. Shorter matches are left in place, because
    any ordinary word whose length is a multiple of 4 (for example "This"
    or "test") is also well-formed Base64 and would otherwise be deleted.

    Args:
        text: The input text.
        min_length: Minimum length of a matched token for it to be removed.

    Returns:
        The text with the qualifying Base64 tokens removed.
    """
    def _replace(match):
        candidate = match[0]
        if len(candidate) < min_length:
            return candidate
        return remove_b64(candidate)

    return b64_pattern.sub(_replace, text)
这样:
# Demo: the 24-character Base64 token right after the comma is removed.
print(remove_base64_strings('Hello,ZGF0YSB0byBiZSBlbmNvZGVk World!'))
输出:
Hello, World!