从 DOCX 文件中以精确格式抓取文本并创建数据框

问题描述 投票:0回答:0

我有一个包含若干表格的 docx 文件,需要根据其中的表格创建一个数据框。同时,我希望保留表格中文本的原有格式(例如粗体、斜体)。所以想知道这是否可行。

我已经从 doc 文件的表格中提取出了文本,但不确定在创建数据框时能否保留这些格式。如果可以,请告诉我该怎么做;否则我会改用其他方法来解决这个问题。

这是我正在使用的代码:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time
from docx import Document

# Configure and launch Chrome through an explicitly located chromedriver binary.
options = webdriver.ChromeOptions()
options.add_argument("--enable-javascript")
ser = Service("C:\\Users\\ritam\\Downloads\\chromedriver_win32\\chromedriver.exe")
driver = webdriver.Chrome(service=ser, options=options)

# Open the site in the first tab, maximise the window, then pause for 30 seconds.
link = "link to the webiste"
driver.get(link)
driver.maximize_window()
time.sleep(30)

# Load the same address again in a newly opened tab; the later steps run there.
driver.switch_to.new_window('tab')
driver.get(link)
# Load the handout document; the data of interest is in its second table.
doc = Document("C:\\Users\\ritam\\Downloads\\23NICSE06MAT01OC101V01L2_Special Class_Tutor Handout.docx")
tables = doc.tables[1]

# The first row of the table supplies the column headers.
col_data = [cell.text for cell in tables.rows[0].cells]

# Section labels that get wrapped in ANSI "bold" (ESC[1m ... ESC[0m) sequences.
# NOTE(review): ANSI escapes only render in a terminal; typed into a web
# editor they will presumably appear as raw characters - confirm intent.
bold_labels = ("Narration:", "Connect:", "Tutor points:")

slide_arr = []   # slide ids, read from the first column
slide_desc = []  # marked-up description text, read from the third column

# Walk the body rows once (header row and final row excluded), fetching each
# row's cells a single time and building the id and description together.
for ro in tables.rows[1:-1]:
    cells = ro.cells
    slide_arr.append(int(cells[0].text))

    # Every paragraph of the cell is kept, each terminated by a newline.
    ftext = "".join(para.text + "\n" for para in cells[2].paragraphs)

    # str.replace is a no-op when the label is absent, so no membership test
    # is needed beforehand.
    for label in bold_labels:
        ftext = ftext.replace(label, "\033[1m " + label + "\033[0m")

    # Angle-bracketed spans are wrapped in ANSI "italic" (ESC[3m ... ESC[0m).
    ftext = ftext.replace("<", "\x1B[3m < ")
    ftext = ftext.replace(">", "> \x1B[0m")
    slide_desc.append(ftext)

print(slide_desc[0])

# Map slide id -> description.
# NOTE(review): the last table row was already excluded by rows[1:-1]; the
# [:-1] slices here drop one more entry. Kept as in the original - confirm
# that skipping the final two rows is intended.
description_data = dict(zip(slide_arr[:-1], slide_desc[:-1]))
# Give the page a moment to settle, then click the edit button.
time.sleep(5)
wait = WebDriverWait(driver, 20)
# WebDriverWait.until returns the located element, so it can be used directly
# instead of looking the element up a second time.
wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.neo_cl_Button.Button--primary"))
).click()
time.sleep(5)

# Collect every slide entry from the scrollable slide list.
wait = WebDriverWait(driver, 10)
scroller = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.SlideList_scroll_wrapper"))
)
slides = scroller.find_elements(By.CSS_SELECTOR, "div.droppableList__slide_drag_item")

for slide in slides:
    slide_num = slide.find_element(By.CSS_SELECTOR, "div.slide__slide_number")
    # Read the number once and use a direct dict lookup rather than scanning
    # every key of description_data for a match.
    slide_id = int(slide_num.text)
    if slide_id not in description_data:
        continue

    # Bring the slide into view and select it so its editor is shown.
    driver.execute_script("arguments[0].scrollIntoView();", slide)
    slide.click()
    time.sleep(5)

    # Focus the editor container.
    contents = wait.until(EC.presence_of_element_located((By.ID, "cke_1_contents")))
    contents.click()

    # The editable area is inside an iframe: switch into it, replace the body
    # text with this slide's description, then switch back to the main page.
    editor_frame = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "iframe.cke_wysiwyg_frame"))
    )
    driver.switch_to.frame(editor_frame)
    editor_body = wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    editor_body.clear()
    editor_body.send_keys(description_data[slide_id])
    driver.switch_to.default_content()

# Click the primary button again once all slides are processed
# (presumably this saves the changes - TODO confirm).
wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "div.neo_cl_Button.Button--primary"))
).click()
time.sleep(10)
python-3.x pandas dataframe python-docx
© www.soinside.com 2019 - 2024. All rights reserved.