我有一个包含若干表格的 docx 文件,需要根据其中的表格创建一个表格(数据框)。我希望保留表格中文本的确切格式,想知道这是否可以做到。
我已经从 docx 文件的表格中提取出了文本,但不确定在创建数据框时能否保留这些格式。请帮忙看看这是否可行;如果不可行,我会改用其他算法来解决这个问题。
这是我目前使用的代码:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from docx import Document
# --- Browser start-up --------------------------------------------------------
# Path of the chromedriver executable handed to Selenium's Service wrapper.
CHROMEDRIVER_PATH = "C:\\Users\\ritam\\Downloads\\chromedriver_win32\\chromedriver.exe"
# Seconds to pause after the first page load.
# NOTE(review): presumably this leaves time for a manual login — confirm.
FIRST_PAGE_PAUSE_SECONDS = 30

options = webdriver.ChromeOptions()
options.add_argument("--enable-javascript")
ser = Service(CHROMEDRIVER_PATH)
driver = webdriver.Chrome(service=ser, options=options)

# Load the first page in a maximised window, then pause.
driver.get("link to the webiste")
driver.maximize_window()
time.sleep(FIRST_PAGE_PAUSE_SECONDS)

# Open a new browser tab and load the page in it.
driver.switch_to.new_window('tab')
link = "link to the webiste"
driver.get(link)
# --- Read the slide descriptions from the Word document ----------------------
doc = Document("C:\\Users\\ritam\\Downloads\\23NICSE06MAT01OC101V01L2_Special Class_Tutor Handout.docx")

# Work on the second table of the document (index 1).
tables = doc.tables[1]

# The first row of the table supplies the column headers.
row = tables.rows[0]
col_data = [cell.text for cell in row.cells]

# NOTE(review): row1 and data are not read anywhere in the visible part of the
# script; they are kept in case code outside this section relies on them.
row1 = tables.rows[1:]
data = {}

# Section labels and the ANSI-bold-wrapped text that replaces each of them
# (ESC[1m switches bold on, ESC[0m resets).
LABEL_REPLACEMENTS = (
    ("Narration:", "\033[1m Narration:\033[0m"),
    ("Connect:", "\033[1m Connect:\033[0m"),
    ("Tutor points:", "\033[1m Tutor points:\033[0m"),
)


def decorate_description(text):
    """Return *text* with section labels and angle brackets wrapped in ANSI codes.

    Each label in LABEL_REPLACEMENTS is replaced by its bold-wrapped form.
    Every "<" then gets an italic-on code (ESC[3m) in front of it and every
    ">" gets a reset code (ESC[0m) after it. ``str.replace`` is a no-op when
    the substring is absent, so no membership checks are needed.

    NOTE(review): these are terminal escape sequences. Nothing in this script
    shows that the web editor the text is later typed into interprets them —
    confirm that this is the intended way to carry the formatting across.
    """
    for label, replacement in LABEL_REPLACEMENTS:
        text = text.replace(label, replacement)
    text = text.replace("<", "\x1B[3m < ")
    text = text.replace(">", "> \x1B[0m")
    return text


# Collect the slide id (first column) and the decorated description (third
# column) of every row between the header row and the last row.
slide_arr = []
slide_desc = []
for ro in tables.rows[1:-1]:
    cells = ro.cells
    slide_arr.append(int(cells[0].text))
    # Each paragraph of the cell is followed by a newline, the last one included.
    ftext = "".join(para.text + "\n" for para in cells[2].paragraphs)
    slide_desc.append(decorate_description(ftext))

# Show the first description as a quick visual check; skipped when the table
# has no data rows so that an empty table does not raise IndexError.
if slide_desc:
    print(slide_desc[0])

# Map slide id -> description.
# NOTE(review): rows[1:-1] above already skips the table's last row, and the
# [:-1] slices here drop one more entry — confirm that leaving out the last
# two table rows is intended.
description_data = dict(zip(slide_arr[:-1], slide_desc[:-1]))
# --- Type the descriptions into the matching slides --------------------------
# Click the primary button to enter edit mode. WebDriverWait.until returns the
# located element, so it is clicked directly instead of being looked up again.
time.sleep(5)
wait = WebDriverWait(driver, 20)
edit_button = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.neo_cl_Button.Button--primary"))
)
edit_button.click()
time.sleep(5)

# Every later wait uses a 10 second timeout.
wait = WebDriverWait(driver, 10)
scroller = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.SlideList_scroll_wrapper"))
)
slides = scroller.find_elements(By.CSS_SELECTOR, "div.droppableList__slide_drag_item")

for slide in slides:
    slide_num = slide.find_element(By.CSS_SELECTOR, "div.slide__slide_number")
    # Read the number once, before the page is changed by the clicks below,
    # and look it up directly instead of scanning every dictionary entry.
    slide_number = int(slide_num.text)
    if slide_number not in description_data:
        continue

    # Bring the slide into view and select it.
    driver.execute_script("arguments[0].scrollIntoView();", slide)
    slide.click()
    time.sleep(5)

    # Click the editor container.
    contents = wait.until(EC.presence_of_element_located((By.ID, "cke_1_contents")))
    contents.click()

    # The editable document lives inside an iframe: switch into it, replace
    # the body text, then switch back to the top-level page.
    editor_frame = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "iframe.cke_wysiwyg_frame"))
    )
    driver.switch_to.frame(editor_frame)
    editor_body = wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    editor_body.clear()
    editor_body.send_keys(description_data[slide_number])
    driver.switch_to.default_content()

# Runs once after all slides have been processed.
# NOTE(review): this clicks the same primary-button selector as the edit
# button above; presumably it saves the changes — confirm, and confirm that
# it should run once here rather than after every slide.
save_button = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.neo_cl_Button.Button--primary"))
)
save_button.click()
time.sleep(10)