如何在C#中通过OpenXML从Word(.docx)文件中提取OLE文件

问题描述 投票:0回答:1

我想使用OpenXML从“.docx”文件中提取“OLE包”。我不知道该怎么做,在官方示例中也找不到任何示例。请帮助我。

这是我的尝试:1. 我用“MS Office 2016”创建了一个名为“Test.docx”的docx文件,并在其中插入了一个“.zip”文件。我用“Open XML SDK 2.5 Productivity Tool”打开“Test.docx”查看,发现了这个(Figure 1),但是从该工具反射生成的代码中,我没有得到任何有关如何提取此zip文件的信息。

2. 然后,我尝试使用C#和SharpCompress.dll提取此“.zip”文件,下面是代码:

/// <summary>
/// Console program that writes the raw content of every embedded object part
/// of a Word document to a numbered file on the desktop.
/// </summary>
class Program
{
    /// <summary>Entry point: runs the extraction against a fixed test document.</summary>
    static void Main(string[] args)
    {
        string filepath = @"C:\Users\宇宙无敌帅小伙\Desktop\test.docx";

        OleFileTest(filepath);
    }

    /// <summary>
    /// Copies the bytes of each <c>EmbeddedObjectPart</c> in the main document part
    /// to <c>{index}.zip</c> on the desktop. Any exception is reported on the console.
    /// </summary>
    /// <param name="filepath">Path of the .docx file to read.</param>
    public static void OleFileTest(string filepath)
    {
        try
        {
            // The document is only read, so it is opened non-editable.
            using (WordprocessingDocument docx = WordprocessingDocument.Open(filepath, false))
            {
                int cnt = 0;
                foreach (EmbeddedObjectPart item in docx.MainDocumentPart.EmbeddedObjectParts)
                {
                    // Copy the part's content into the output file. Previously a freshly
                    // allocated (all-zero) buffer was written without ever reading the part.
                    // NOTE(review): the bytes are written exactly as stored in the part; for an
                    // OLE package this is presumably a structured storage container that wraps
                    // the .zip rather than the .zip itself - confirm before treating it as a zip.
                    using (System.IO.Stream partStream = item.GetStream(FileMode.Open, FileAccess.Read))
                    using (FileStream fs = File.Open($"C:\\Users\\宇宙无敌帅小伙\\Desktop\\{cnt}.zip", FileMode.Create, FileAccess.ReadWrite, FileShare.ReadWrite))
                    {
                        partStream.CopyTo(fs);
                    }
                    cnt++;
                }
            }
        }
        catch (Exception e)
        {
            Console.WriteLine(e.Message);
        }
    }
}

但是我无法打开提取出来的“.zip”文件。有人可以帮我吗?非常感谢!

openxml openxml-sdk
1个回答
0
投票

问题在于,您从EmbeddedObjectPart中提取的二进制文件并不是您的ZIP文件。它是一个结构化存储文件,其中包含了您的ZIP文件。

以下单元测试演示了如何提取通过Microsoft Word作为OLE对象嵌入到Word文档(例如"Resources\\ZipContainer.docx")中的ZIP文件(例如ZipContents.zip)。请注意Ole10Native.ExtractFile()方法的用法,该方法从嵌入在Word文档中的结构化存储文件(例如oleObject1.bin)中提取ZIP文件。

using System.IO;
using CodeSnippets.Windows;
using DocumentFormat.OpenXml.Packaging;
using Xunit;

namespace CodeSnippets.Tests.OpenXml.Wordprocessing
{
    /// <summary>
    /// Tests that files embedded as OLE objects in a Word document can be
    /// extracted from the document's embedded object parts.
    /// </summary>
    public class EmbeddedObjectPartTests
    {
        /// <summary>
        /// Saves the part as a temporary structured storage file in the destination
        /// folder, extracts the embedded file from it, and removes the temporary file.
        /// </summary>
        /// <param name="part">The embedded object part to extract from.</param>
        /// <param name="destinationFolderPath">Folder that receives the extracted file.</param>
        private static void ExtractFile(EmbeddedObjectPart part, string destinationFolderPath)
        {
            // Determine the file name and destination path of the binary,
            // structured storage file.
            string binaryFileName = Path.GetFileName(part.Uri.ToString());
            string binaryFilePath = Path.Combine(destinationFolderPath, binaryFileName);

            // Ensure the destination directory exists.
            Directory.CreateDirectory(destinationFolderPath);

            try
            {
                // Copy part contents to structured storage file.
                using (Stream partStream = part.GetStream())
                using (FileStream fileStream = File.Create(binaryFilePath))
                {
                    partStream.CopyTo(fileStream);
                }

                // Extract the embedded file from the structured storage file.
                Ole10Native.ExtractFile(binaryFilePath, destinationFolderPath);
            }
            finally
            {
                // Remove the structured storage file even when copying or extraction
                // fails, so no temporary file is left behind in the destination folder.
                File.Delete(binaryFilePath);
            }
        }

        /// <summary>
        /// Extracts every embedded object of the sample document and checks that
        /// the expected ZIP file exists afterwards.
        /// </summary>
        [Fact]
        public void CanExtractEmbeddedZipFile()
        {
            const string documentPath = "Resources\\ZipContainer.docx";
            const string destinationFolderPath = "Output";
            string destinationFilePath = Path.Combine(destinationFolderPath, "ZipContents.zip");

            // Remove the output of a previous run; otherwise the final assertion
            // would succeed even if nothing is extracted this time.
            if (File.Exists(destinationFilePath))
            {
                File.Delete(destinationFilePath);
            }

            using WordprocessingDocument wordDocument =
                WordprocessingDocument.Open(documentPath, false);

            // Extract all embedded objects.
            foreach (EmbeddedObjectPart part in wordDocument.MainDocumentPart.EmbeddedObjectParts)
            {
                ExtractFile(part, destinationFolderPath);
            }

            Assert.True(File.Exists(destinationFilePath));
        }
    }
}

以下是Ole10Native类的主要代码,该类曾由Microsoft发布,但现在已经很难找到:

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Runtime.InteropServices.ComTypes;
using System.Text.RegularExpressions;

namespace CodeSnippets.Windows
{
    /// <summary>
    /// Extracts embedded files from the "Ole10Native" stream of a structured
    /// storage file (for example an oleObject*.bin part taken from a Word document).
    /// </summary>
    public class Ole10Native
    {
        // Upper bound on the number of bytes read for each null-terminated string.
        private const int MaxStringLength = 260;

        // dwOrigin values passed to IStream.Seek.
        private const int SeekFromStart = 0;
        private const int SeekFromCurrent = 1;

        /// <summary>
        /// Opens the structured storage file and writes every file found in an
        /// "Ole10Native" stream into the destination folder.
        /// </summary>
        /// <param name="sourceFilePath">Path of the structured storage file.</param>
        /// <param name="destinationFolder">Existing folder that receives the extracted file(s).</param>
        /// <exception cref="IOException">The storage could not be opened.</exception>
        public static void ExtractFile(string sourceFilePath, string destinationFolder)
        {
            StgOpenStorage(sourceFilePath, null, STGM.READWRITE | STGM.SHARE_EXCLUSIVE, IntPtr.Zero, 0, out IStorage iStorage);
            if (iStorage == null)
            {
                throw new IOException("Could not open structured storage file: " + sourceFilePath);
            }

            try
            {
                ProcessPackage(iStorage, destinationFolder);
            }
            finally
            {
                // Release the storage even when extraction fails; it was opened
                // with exclusive sharing.
                Marshal.ReleaseComObject(iStorage);
            }
        }

        /// <summary>
        /// Enumerates the elements of the storage and extracts the embedded file from
        /// each element whose name contains "Ole10Native".
        /// </summary>
        private static void ProcessPackage(IStorage pStg, string destinationFolder)
        {
            pStg.EnumElements(0, IntPtr.Zero, 0, out IEnumSTATSTG pEnumStatStg);
            try
            {
                var ss = new STATSTG[1];
                uint numReturned;

                // Loop through the STATSTG structures in the storage.
                do
                {
                    // Retrieve the next STATSTG structure.
                    pEnumStatStg.Next(1, ss, out numReturned);

                    // The "Ole10Native" stream contains the actual embedded object.
                    if (numReturned != 0 && ss[0].pwcsName.Contains("Ole10Native"))
                    {
                        pStg.OpenStream(ss[0].pwcsName, IntPtr.Zero, (uint) STGM.READ | (uint) STGM.SHARE_EXCLUSIVE, 0,
                            out IStream pStream);
                        try
                        {
                            ExtractEmbeddedFile(pStream, destinationFolder);
                        }
                        finally
                        {
                            // The stream was previously never released.
                            Marshal.ReleaseComObject(pStream);
                        }
                    }
                } while (numReturned > 0);
            }
            finally
            {
                Marshal.ReleaseComObject(pEnumStatStg);
            }
        }

        /// <summary>
        /// Parses one "Ole10Native" stream and writes the embedded file it carries
        /// into the destination folder, using the file name stored in the stream.
        /// </summary>
        private static void ExtractEmbeddedFile(IStream pStream, string destinationFolder)
        {
            // File name starts from 7th byte.
            pStream.Seek(6, SeekFromStart, IntPtr.Zero);
            string embeddedName = ReadNullTerminatedString(pStream);

            // Next part is the source path of the embedded object; it is read only
            // to advance past it.
            ReadNullTerminatedString(pStream);

            // Unknown 4 bytes.
            pStream.Seek(4, SeekFromCurrent, IntPtr.Zero);

            // Next 4 bytes give the length of the temporary file path
            // (Office uses a temporary location to copy the files before inserting
            // them into the document). Skip that path.
            uint temporaryPathLength = ReadUInt32LittleEndian(pStream);
            pStream.Seek(temporaryPathLength, SeekFromCurrent, IntPtr.Zero);

            // Next 4 bytes give the size of the actual data.
            uint dataLength = ReadUInt32LittleEndian(pStream);
            if (dataLength > int.MaxValue)
            {
                throw new InvalidDataException("Embedded object is too large: " + dataLength + " bytes.");
            }

            // Read the actual file content.
            var data = new byte[dataLength];
            pStream.Read(data, (int) dataLength, IntPtr.Zero);

            // Create (or overwrite) the file; the file handle is closed even on failure.
            File.WriteAllBytes(Path.Combine(destinationFolder, GetFileName(embeddedName)), data);
        }

        /// <summary>
        /// Reads bytes up to and including a zero terminator, or at most
        /// <see cref="MaxStringLength"/> bytes, and returns them as a string.
        /// Each byte is widened directly to a char, so no code page decoding is done.
        /// </summary>
        private static string ReadNullTerminatedString(IStream pStream)
        {
            var characters = new char[MaxStringLength];
            var single = new byte[1];
            int length;

            for (length = 0; length < MaxStringLength; length++)
            {
                // Reset first so a read that delivers nothing ends the string.
                single[0] = 0;
                pStream.Read(single, 1, IntPtr.Zero);
                if (single[0] == 0)
                {
                    break;
                }

                characters[length] = (char) single[0];
            }

            return new string(characters, 0, length);
        }

        /// <summary>
        /// Reads four bytes and combines them as a little-endian unsigned integer.
        /// The bytes are widened to uint before shifting so that a high byte of
        /// 0x80 or above does not sign-extend.
        /// </summary>
        private static uint ReadUInt32LittleEndian(IStream pStream)
        {
            var buffer = new byte[4];
            pStream.Read(buffer, 4, IntPtr.Zero);

            return buffer[0]
                   | ((uint) buffer[1] << 8)
                   | ((uint) buffer[2] << 16)
                   | ((uint) buffer[3] << 24);
        }

        /// <summary>
        /// Returns the bare file name of the given path: everything up to the last
        /// backslash is removed, then any remaining directory part is dropped.
        /// </summary>
        private static string GetFileName(string filePath)
        {
            string withoutBackslashPath = Regex.Replace(filePath, @"^.*[\\]", "");

            // The name comes from the document, so also strip other directory
            // separators (e.g. "../") to keep the output inside the destination folder.
            return Path.GetFileName(withoutBackslashPath);
        }
    }
}

您可以在我的Ole10Native GitHub存储库中找到完整的源代码(包括CodeSnippets类)。

© www.soinside.com 2019 - 2024. All rights reserved.