如何在 Python 中将 Word DOCX 转换为嵌入所有图像的 HTML

Question

我尝试了多种将 Word 文档（DOCX）转换为 HTML 文件的方案，但这些方案都会额外生成一个文件夹，用来存放 HTML 所引用的图像。有没有办法避免生成这个额外的文件夹，同时仍能在 HTML 文件中显示所有图像？我只需要一个 HTML 文件，因为我必须把它复制到另一个位置，不能附带任何额外的文件夹。

下面是 Word 文档的截图。它包含一个包含 2 列和 3 行的表格以及一些图像：

提前谢谢你

Answer 1

要在 Python 中将 Word DOCX 转换为嵌入所有图像的 HTML，您可以使用 python-docx、lxml 和 base64 库。

例子：

from docx import Document
from lxml import etree
from base64 import b64encode

# Convert a DOCX file into one self-contained HTML file: every embedded
# picture is written inline as a base64 ``data:`` URI, so no separate image
# folder is produced.
#
# NOTE(review): only the paragraphs exposed by ``doc.paragraphs`` are
# converted; paragraphs nested inside table cells are presumably not part of
# that list and would need ``doc.tables`` to be walked as well -- confirm.

# Clark-notation names used to locate pictures inside a run's XML.
BLIP_TAG = '{http://schemas.openxmlformats.org/drawingml/2006/main}blip'
EMBED_ATTR = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

doc = Document('path/document.docx')

# Build the HTML skeleton: <html><head>...</head><body>...</body></html>
html_element = etree.Element('html')

head_element = etree.SubElement(html_element, 'head')
# Declare the encoding so browsers decode non-ASCII text correctly.
meta_element = etree.SubElement(head_element, 'meta')
meta_element.set('charset', 'utf-8')
title_element = etree.SubElement(head_element, 'title')
title_element.text = 'Document Title'

body_element = etree.SubElement(html_element, 'body')

for para in doc.paragraphs:
    para_element = etree.SubElement(body_element, 'p')

    # The paragraph text is the concatenation of the text of all its runs.
    para_element.text = ''.join(run.text for run in para.runs)

    # Pictures live *inside* a run (w:r > w:drawing > ... > a:blip), so search
    # each run's subtree for blips instead of comparing the run's own tag.
    for run in para.runs:
        for blip in run.element.iter(BLIP_TAG):
            rel_id = blip.get(EMBED_ATTR)
            if rel_id is None:
                # Linked (not embedded) picture: there are no bytes to inline.
                continue

            image_part = doc.part.related_parts[rel_id]

            # Keep only the last path segment, e.g. "image1.png".
            image_filename = str(image_part.partname).rsplit('/', 1)[-1]

            # Use the part's own MIME type rather than assuming PNG.
            encoded = b64encode(image_part.blob).decode('ascii')

            img_element = etree.SubElement(para_element, 'img')
            img_element.set('src', f"data:{image_part.content_type};base64,{encoded}")
            img_element.set('alt', image_filename)

# Serialize with HTML rules (no self-closing <p/>) and write as UTF-8 so the
# result does not depend on the platform's default encoding.
with open('path/document.html', 'w', encoding='utf-8') as html_file:
    html_file.write(etree.tostring(html_element, pretty_print=True, method='html', encoding='unicode'))

Answer 2

这有帮助吗？

import os
import shutil
import base64
import xml.etree.ElementTree as ET
from docx import Document
import pypandoc

def convert_docx_to_html(docx_file_path):
    """Convert a DOCX file to a single HTML file with all images embedded.

    The conversion is delegated to pandoc, which is asked to inline every
    resource (images included) as ``data:`` URIs. No intermediate image
    folder is created, so nothing next to the input file is touched or
    deleted.

    Args:
        docx_file_path: Path to the input ``.docx`` file.

    Returns:
        The path of the generated HTML file: the input path with its
        extension replaced by ``.html``.
    """
    output_file_path = os.path.splitext(docx_file_path)[0] + '.html'

    # --standalone emits a complete HTML document (not a fragment), and
    # --embed-resources inlines images as data URIs.
    # NOTE: pandoc releases older than 2.19 do not know --embed-resources;
    # there the equivalent single flag is --self-contained.
    pypandoc.convert_file(
        docx_file_path,
        'html',
        outputfile=output_file_path,
        extra_args=['--standalone', '--embed-resources'],
    )

    return output_file_path

如何在嵌入所有图像的 Python 中将 Word DOCX 转换为 HTML

问题描述投票：0回答：2

2个回答

最新问题

如何在嵌入所有图像的 Python 中将 Word DOCX 转换为 HTML

问题描述 投票：0回答：2

2个回答

最新问题

问题描述投票：0回答：2