以编程方式捕获HTML文件中的样式并使用PowerShell转换为Word文档

问题描述 投票:0回答:1

我正在编写一个脚本,用于提取一个两年一度的会议上30多场演讲的HTML文件以供打印。已选择Word作为打印工具。这些演讲包含粗体,斜体,大纲,标题和其他与样式相关的元素。我需要在无需人工干预的情况下将所有这些样式保留到Word文档中。我尝试了两种方法来解决这个问题。

  1. 针对每个谈话并挑选包含相关信息(标题,演讲者,段落等)的元素。这很漂亮,但风格缺失。有没有办法用这种方法保留样式信息? 下面的完整代码示例显示了所有协同工作: $primaryURL = "https://www.domain.tld/folder?lang=eng" $baseURL = $primaryURL.Split("`?")[0] $talkArray = @() $content = (Invoke-WebRequest -Uri $primaryURL).Content $links = (Invoke-WebRequest -Uri $primaryURL).Links $talkLinks = $links | Where {$_.outerHTML -like "*/folder/$year/$month*"} | Select -expand href $talkInfo = $links | Where {$_.outerHTML -like "*/folder/$year/$month*"} | Select -expand outerText $domainName = $baseUrl.Split("/")[2] Foreach ($talk in $talkInfo) { $title = ($talk.Split("`n`r") -replace "`#.*", "$([char]0)" -replace "#.*" -replace "$([char]"0", "#" -replace "^\s*" -replace "\s*$")" | ? { $_; })[1] $speaker = ($talk.Split("`n`r") -replace "`#.*", "$([char]0)" -replace "#.*" -replace "$([char]"0", "#" -replace "^\s*" -replace "\s*$")" | ? { $_; })[2] $link = "https://" + $domainName + $talkLinks[$count] $obj = New-Object PSObject $obj | Add-Member -MemberType NoteProperty -Name "Title" -Value $title $obj | Add-Member -MemberType NoteProperty -Name "Speaker" -Value $speaker $obj | Add-Member -MemberType NoteProperty -Name "url" -Value $link $talkArray += $obj $count++ } $wordDocPath = "U:\" + $month + " " + $year + " Conference.docx" $word = New-Object -ComObject word.application $word.Visible = $false $doc = $word.documents.add() $doc.Styles["No Spacing"].ParagraphFormat.SpaceAfter = 0 $doc.Styles["No Spacing"].ParagraphFormat.SpaceBefore = 0 $margin = 36 # 0.5 inches $doc.PageSetup.LeftMargin = $margin $doc.PageSetup.RightMargin = $margin $doc.PageSetup.TopMargin = $margin $doc.PageSetup.BottomMargin = $margin $selection = $word.Selection $outputPath = $wordDocPath $doc.SaveAs($outputPath) $doc.Close() $word.Quit() $word = $null $objWord = New-Object -comobject Word.Application $objWord.Visible = $false $objDoc = $objWord.Documents.Open($wordDocPath) $objSelection = $objWord.Selection $trash = $objSelection.EndKey(6, 0) 
$objSelection.PageSetup.TextColumns.SetCount(2) ForEach ($talk in $talkArray) { $talkTitle = $talk.Title $a, $p, $results, $wrap, $n = $null $talkContent = Invoke-WebRequest -Uri $talk.url $speakerPic = "https:" + ($talkContent.images | Where {$_.alt -like "$talkTitle"}).src $picName = (($talkContent.images | Where {$_.alt -like "$talkTitle"}).src).Split("/")[-1] $kicker = ($talkContent.ParsedHTML.GetElementByID('kicker1')).innerHTML $count = 3 $line = 1 # - Download the picture $picPath = $tempDir + "\" + $picName Download-File -url $speakerPic -file $picPath # - Write the header information $text = $talkTitle $objSelection.Style="Heading 1" $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText($text) $objSelection.TypeParagraph() $text = $talk.Speaker + "`r`n" $objSelection.Font.Bold = 1 $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText("by $text") $objSelection.Font.Bold = 0 $objSelection.TypeParagraph() $objShape = $objDoc.Shapes $wrap = [Microsoft.Office.Interop.Word.WdWrapType]::wdWrapSquare # WdWrapTopBottom $objShape.AddPicture($picPath) | Out-Null $objShape.Range(1).WrapFormat.Type = $wrap $objSelection.Endkey() $objSelection.Font.Bold = 1 $objSelection.Font.Italic = 1 $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText($kicker) $objSelection.TypeParagraph() $objSelection.Font.Bold = 0 $objSelection.Font.Italic = 0 # - Collect the paragraphs $paragraphs = @() do { [string]$p = ($talkContent.ParsedHTML.getElementById("p$count")).InnerText if ($p) { $obj = New-Object PSObject $obj | Add-Member -MemberType NoteProperty -Name "p" -Value $p $obj | Add-Member -MemberType NoteProperty -Name "line" -Value $line $paragraphs += $obj $line = $line + 1 } Else { } $count = $count + 1 } while ($p) # - Write the paragraphs $paragraphs | % { $results = [regex]::Split($_.p,'(?<![\d\s])(?<![\(\s])(?<![\:\s])(?<![\-\s])(\d{1,2})(?!\d)') $line = $_.line $objSelection.Style="Normal" $objSelection.Font.SuperScript = 0 
$objSelection.Font.Size = 9 $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText("$line. ") foreach ($item in $results) { $text = $item if ($text -match '(?<![\d\s])(?<![\(\s])(?<![\:\s])(?<![\-\s])(\d{1,2})(?!\d)') { $objSelection.Font.SuperScript = 1 $objSelection.TypeText("$text") $objSelection.Font.SuperScript = 0 } Else { if ($text -ne " ") { #$objSelection.Style="Normal" $objSelection.Font.SuperScript = 0 $objSelection.Font.Size = 9 $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText("$text") } } } $objSelection.TypeParagraph() } $count = 1 # - Work on the notes $notes = @() $line = 1 do { [string]$n = ($talkContent.ParsedHTML.getElementById("note$count")).InnerText if ($n) { $obj = New-Object PSObject $obj | Add-Member -MemberType NoteProperty -Name "n" -Value $n -Force $obj | Add-Member -MemberType NoteProperty -Name "line" -Value $line -Force $notes += $obj $line = $line + 1 } Else { } $count = $count + 1 } while ($n) $text = "`vNOTES:" $objSelection.TypeText("$text") $objSelection.TypeParagraph() $notes | % { $note = $_.n $line = $_.line $text = $note $objSelection.Font.Size = 9 $objSelection.Font.Name = "Times New Roman" $objSelection.TypeText("$line. $text") $objSelection.TypeText("`r`n") } $objSelection.InsertNewPage() } $objDoc.saveas($wordDocPath) $objDoc.close() $objWord.quit() $objWord = $null

来自html的示例段落(带有样式元素):

    <p id="p1">This is an example with <em>italic</em> text and <strong>bold</strong> text.</p>
  2. 转储每个HTML文件,清理后再转换为Word。这样可以保留样式,但要去除所有无用内容(顶部的网页标题,链接和菜单,底部页脚的链接和导航按钮)变得非常麻烦,我几乎要放弃了。也许有一种更简单的方法?代码示例如下: # Code snipped. This version performs similar functions to collect information on each talk, creates the temp folder and the word doc. Then: [ref]$SaveFormat = "microsoft.office.interop.word.WdSaveFormat" -as [type] $word = New-Object -ComObject word.application $word.visible = $false $document = $word.documents.open($htmlPath) $selection = $word.Selection $paras = $document.Paragraphs foreach ($para in $paras) { $text = $para.Range.Text If ($text -like "$title*") { break } Else { $para.Range.Select() $selection.Cut() } } $document.saveas([ref] $wordDocPath, [ref]$SaveFormat::wdFormatDocumentDefault) $document.close() $word.Quit() $word = $null

这并不是剥离多余元素的完整代码,它只是先清除演讲标题元素之前的所有内容。在此之后还有其他行必须删除,另外如前所述,演讲段落之后的若干元素也需要删除。完成这些之后,我还需要处理Word中的每个段落,使其字体,字号,分栏,标题等与最终成品一致。

问题:

  1. 你会采用哪种方法?
  2. 如果采用方法1,如何将段落标签中的样式元素带入Word文档?
  3. 如果采用方法2,是否有更好的方法来处理文档,以便我可以精确定位到我要找的元素?

提前致谢。

html powershell ms-word
1个回答
0
投票

我用这种方法。

  1. 抓取你想要的部分并保存为html_file
  2. 使用Range.InsertFile(html_file)插入该文件,这样会保留html中的所有样式。
© www.soinside.com 2019 - 2024. All rights reserved.