我已经阅读并做了一些研究,我可以在从文件对话框加载列表时删除重复项,但是如何在抓取/解析时自动删除重复项。下面是我的代码,哈希标签是我尝试过的一些字典和列表函数。感谢您的帮助。
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import time
import itertools
from bs4 import BeautifulSoup
# URL LIST:
url = [
'URL0',
'URL1',
]
#OPTIONS/REQUESTS:
options = Options()
options.add_argument("--disable-extensions")
options.add_argument('--disable-blink-features=AutomationControlled')
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36')
#options.add_argument("--headerless")
options.page_load_strategy = 'normal'
requests = webdriver.Chrome(options=options)
requests.implicitly_wait(10)
wait = WebDriverWait(requests, 10)
#USE SPIKE = [] FOR APPEND
#spike = []
#spike: list[str, int, set] = []
#ele: dict[set] = {}
spike: dict[set] = []
#REQUESTS/SLEEP FOR LOOP:
for url in url:
#ele = ele.text
requests.get(url)
time.sleep(10)
#FOR ELEMENTS WITHIN XPATH:
try:
for ele in requests.find_elements(By.XPATH, '(//pre["text()"])[position() < 10]'):
try:
spike.append(ele.text.splitlines(keepends=False))
time.sleep(5)
for i in range(len(spike)):
print('\n'.join(spike[i]))
except:
requests.close()
else:
requests.get(url)
except:
requests.close()
else:
requests.quit()
#SAVE ELEMENTS WITHIN XPATH:
with open("text.txt", "wt", encoding="utf-8") as file:
for i in range(len(spike)):
print("\n".join(spike[i]), file=file)
file.close()
我最终只是将此代码添加到我一直在开发的 GUI 中,并使用以下代码删除重复项:
def contain(key):
item = key in listbox.get(0, END)
return item
for key in spike[i]:
if contain(key) == True:
pass
else:
listbox.insert(END,key)