我尝试了多种将 Word 文档(DOCX)转换为 HTML 文件的方案,但这些方案都会额外生成一个文件夹,用来存放 HTML 所引用的图像。有没有办法避免生成这个额外的文件夹,同时仍然能在 HTML 文件中显示所有图像?我只需要 1 个文件(HTML),因为我必须把它复制到另一个位置,且不能附带任何额外的文件夹。
下面是 Word 文档的截图。它包含一个包含 2 列和 3 行的表格以及一些图像:
提前谢谢你
要在 Python 中将 Word DOCX 转换为嵌入所有图像的 HTML,您可以使用 python-docx、lxml 和 base64 库。
例子:
from docx import Document
from lxml import etree
from base64 import b64encode
# Convert a DOCX file into a single HTML file whose images are embedded as
# base64 "data:" URIs, so no side folder of image files is needed.
doc = Document('path/document.docx')

# Clark-notation names of the DrawingML <a:blip> element and of its r:embed
# attribute; r:embed holds the relationship id of the embedded image part.
BLIP_TAG = '{http://schemas.openxmlformats.org/drawingml/2006/main}blip'
EMBED_ATTR = '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed'

# create an empty HTML element
html_element = etree.Element('html')
# create the head element
head_element = etree.SubElement(html_element, 'head')
# declare the encoding the file is written with below
meta_element = etree.SubElement(head_element, 'meta')
meta_element.set('charset', 'utf-8')
title_element = etree.SubElement(head_element, 'title')
title_element.text = 'Document Title'
# create the body element
body_element = etree.SubElement(html_element, 'body')

# loop through all paragraphs in the document
# NOTE: doc.paragraphs only yields body-level paragraphs; paragraphs that
# live inside table cells are reached through doc.tables and are not
# converted by this script.
for para in doc.paragraphs:
    # create a new paragraph element
    para_element = etree.SubElement(body_element, 'p')
    # loop through all runs in the paragraph
    for run in para.runs:
        if run.text:
            if len(para_element):
                # An <img> was already emitted: text that follows it must go
                # into that element's tail to keep text and images in order.
                previous = para_element[-1]
                previous.tail = (previous.tail or '') + run.text
            else:
                para_element.text = (para_element.text or '') + run.text
        # Pictures are nested inside the run (w:r > w:drawing > ... > a:blip),
        # so search the run's descendants rather than testing the run's tag.
        for blip in run.element.iter(BLIP_TAG):
            rel_id = blip.get(EMBED_ATTR)
            if rel_id is None:
                # no r:embed attribute: nothing stored in the package to inline
                continue
            image_part = doc.part.related_parts[rel_id]
            # partname looks like '/word/media/image1.png'; keep the last segment
            image_filename = str(image_part.partname).rsplit('/', 1)[-1]
            encoded = b64encode(image_part.blob).decode('ascii')
            # create an image element carrying the picture as a data URI,
            # labelled with the part's own content type instead of assuming PNG
            img_element = etree.SubElement(para_element, 'img')
            img_element.set('src', f"data:{image_part.content_type};base64,{encoded}")
            img_element.set('alt', image_filename)

# create the HTML file; method='html' serialises void/empty elements the way
# browsers expect (<img ...> and <p></p> rather than XML-style <img/> and <p/>)
with open('path/document.html', 'w', encoding='utf-8') as html_file:
    html_file.write(etree.tostring(html_element, pretty_print=True, method='html',
                                   encoding='unicode', doctype='<!DOCTYPE html>'))
这有帮助吗?
import os
import shutil
import base64
import xml.etree.ElementTree as ET
from docx import Document
import pypandoc
def convert_docx_to_html(docx_file_path):
    """Convert a DOCX file to one HTML file with its images embedded.

    pandoc (through pypandoc) writes the HTML next to the source document and
    extracts the document's media into a private temporary directory. Every
    ``<img src="...">`` that refers to one of the extracted files is then
    rewritten to a base64 ``data:`` URI, and the temporary directory is
    removed, so the HTML file is the only artefact left on disk.

    Args:
        docx_file_path: Path of the ``.docx`` file to convert.

    Returns:
        Path of the generated HTML file (same location and base name as the
        input, with an ``.html`` extension).

    Raises:
        OSError: If the generated HTML or an extracted image cannot be read
            or written.
        Whatever ``pypandoc.convert_file`` raises when the conversion fails.
    """
    import mimetypes
    import re
    import tempfile
    from urllib.parse import unquote

    output_file_path = os.path.splitext(docx_file_path)[0] + '.html'
    # Matches the src attribute of an <img> tag; pandoc writes attributes
    # with double quotes. Groups: prefix, src value, closing quote.
    img_src_pattern = re.compile(r'(<img\b[^>]*?\ssrc=")([^"]*)(")')

    # A throwaway directory avoids deleting a pre-existing "images" folder
    # next to the document, and is cleaned up even if a step below fails.
    with tempfile.TemporaryDirectory() as media_dir:
        pypandoc.convert_file(docx_file_path, 'html', outputfile=output_file_path,
                              extra_args=['--extract-media', media_dir])

        # Index the extracted files by name by walking the directory, so the
        # lookup does not depend on which subfolder pandoc put them in.
        extracted = {}
        for folder, _subdirs, file_names in os.walk(media_dir):
            for file_name in file_names:
                extracted[file_name] = os.path.join(folder, file_name)

        def to_data_uri(match):
            """Return the matched src attribute rewritten as a data URI."""
            src = match.group(2)
            if src.startswith('data:') or '://' in src:
                # already inlined, or a remote resource: leave untouched
                return match.group(0)
            file_name = re.split(r'[\\/]', unquote(src))[-1]
            image_path = extracted.get(file_name)
            if image_path is None:
                # not one of the extracted files: leave the tag as it is
                return match.group(0)
            with open(image_path, 'rb') as image_file:
                payload = base64.b64encode(image_file.read()).decode('ascii')
            # Derive the MIME type from the file extension instead of
            # assuming PNG (documents commonly contain JPEG/GIF as well).
            mime_type = mimetypes.guess_type(image_path)[0] or 'application/octet-stream'
            return f'{match.group(1)}data:{mime_type};base64,{payload}{match.group(3)}'

        with open(output_file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
        # Plain text substitution: pandoc's default output is an HTML
        # fragment, which an XML parser would reject.
        html_content = img_src_pattern.sub(to_data_uri, html_content)

    with open(output_file_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    return output_file_path