我正在编写一个脚本,用于提取两年一度的会议上 30 多篇演讲的 HTML 文件以供打印。已选定 Word 作为打印工具。演讲稿中包含粗体、斜体、大纲、标题以及其他与样式相关的元素。我需要在无需人工干预的情况下,将所有这些样式保留到 Word 文档中。我尝试了两种方法来解决这个问题。
# Build $talkArray: one object (Title, Speaker, url) per talk linked from the
# conference index page.
# NOTE(review): $year and $month are not set in this section; they are assumed
# to be defined earlier in the script - confirm.
$primaryURL = "https://www.domain.tld/folder?lang=eng"
$baseURL = $primaryURL.Split("`?")[0]
$domainName = $baseURL.Split("/")[2]

# Fetch the index page once and reuse the response for both the raw HTML and
# the parsed link collection (the page was previously downloaded twice).
$indexPage = Invoke-WebRequest -Uri $primaryURL
$content = $indexPage.Content
$links = $indexPage.Links

# Filter the links a single time so every href stays paired with its own link
# text instead of relying on two parallel arrays and a running index.
$matchingLinks = @($links | Where-Object { $_.outerHTML -like "*/folder/$year/$month*" })
$talkLinks = @($matchingLinks | Select-Object -ExpandProperty href)
$talkInfo = @($matchingLinks | Select-Object -ExpandProperty outerText)

$talkArray = @(
    foreach ($anchor in $matchingLinks)
    {
        # Split the link text into lines, protect/strip '#' sections, trim each
        # line and drop the empty ones. The NUL character is only a temporary
        # placeholder and is turned back into '#' before trimming.
        $fields = @(
            ([string]$anchor.outerText).Split([char[]]"`n`r") -replace "`#.*", "$([char]0)" -replace "#.*" -replace "$([char]0)", "#" -replace "^\s*" -replace "\s*$" |
                Where-Object { $_ }
        )

        # NOTE(review): the second and third non-empty lines are taken as title
        # and speaker, as in the original indexing - confirm against the page.
        [PSCustomObject]@{
            Title   = $fields[1]
            Speaker = $fields[2]
            url     = "https://" + $domainName + $anchor.href
        }
    }
)
# Create an empty document with half-inch margins and save it to $wordDocPath.
# NOTE(review): $month and $year are assumed to be defined earlier - confirm.
$wordDocPath = "U:\" + $month + " " + $year + " Conference.docx"
$outputPath = $wordDocPath

$word = New-Object -ComObject word.application
$doc = $null
try
{
    $word.Visible = $false
    $doc = $word.documents.add()

    # Remove the extra spacing around "No Spacing" paragraphs.
    $doc.Styles["No Spacing"].ParagraphFormat.SpaceAfter = 0
    $doc.Styles["No Spacing"].ParagraphFormat.SpaceBefore = 0

    $margin = 36 # 0.5 inches
    $doc.PageSetup.LeftMargin = $margin
    $doc.PageSetup.RightMargin = $margin
    $doc.PageSetup.TopMargin = $margin
    $doc.PageSetup.BottomMargin = $margin

    $doc.SaveAs($outputPath)
    $doc.Close()
}
finally
{
    # Always shut Word down, even when one of the calls above throws, so no
    # hidden WINWORD process is left behind. Setting the variable to $null is
    # not enough: the COM references have to be released explicitly.
    $word.Quit()
    if ($null -ne $doc)
    {
        [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($doc)
    }
    [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($word)
    $doc = $null
    $word = $null
}
# Reopen the saved document in a hidden Word instance, move the cursor to the
# end of the document and switch the page layout to two text columns.
$wdStory = 6   # WdUnits: the whole story (document body)
$wdMove = 0    # WdMovementType: move the cursor without extending the selection

$objWord = New-Object -comobject Word.Application
$objWord.Visible = $false
$objDoc = $objWord.Documents.Open($wordDocPath)
$objSelection = $objWord.Selection

# EndKey returns a value; capture it so it does not end up in the output.
$trash = $objSelection.EndKey($wdStory, $wdMove)
$objSelection.PageSetup.TextColumns.SetCount(2)
# Render every talk into the open Word document: heading, speaker, picture,
# kicker, numbered body paragraphs (footnote markers in superscript) and notes,
# followed by a page break.
# NOTE(review): $tempDir and Download-File are not defined in this section and
# the [Microsoft.Office.Interop.Word.WdWrapType] enum needs the Word interop
# assembly to be loaded - all three are assumed to come from elsewhere in the
# script; confirm.

# One- or two-digit footnote markers that are not preceded by a digit,
# whitespace, '(', ':' or '-' and are not followed by another digit.
$footnoteRefPattern = '(?<![\d\s])(?<![\(\s])(?<![\:\s])(?<![\-\s])(\d{1,2})(?!\d)'
$bodyFont = "Times New Roman"

ForEach ($talk in $talkArray)
{
    $talkTitle = $talk.Title
    $talkContent = Invoke-WebRequest -Uri $talk.url
    $html = $talkContent.ParsedHTML
    $kicker = ($html.GetElementByID('kicker1')).innerHTML

    # - Download the picture
    # The speaker image is the one whose alt text matches the talk title. Take
    # at most one match and tolerate talks without a picture instead of
    # failing on a null .src.
    $picPath = $null
    $speakerImage = $talkContent.images |
        Where-Object { $_.alt -like "$talkTitle" } |
        Select-Object -First 1
    if ($speakerImage -and $speakerImage.src)
    {
        # NOTE(review): prefixing "https:" presumes protocol-relative src
        # values ("//host/path") - confirm against the page markup.
        $speakerPic = "https:" + $speakerImage.src
        $picName = $speakerImage.src.Split("/")[-1]
        $picPath = $tempDir + "\" + $picName
        Download-File -url $speakerPic -file $picPath
    }
    else
    {
        Write-Warning "No speaker picture found for '$talkTitle'; continuing without one."
    }

    # - Write the header information
    $objSelection.Style = "Heading 1"
    $objSelection.Font.Name = $bodyFont
    $objSelection.TypeText($talkTitle)
    $objSelection.TypeParagraph()

    $text = $talk.Speaker + "`r`n"
    $objSelection.Font.Bold = 1
    $objSelection.Font.Name = $bodyFont
    $objSelection.TypeText("by $text")
    $objSelection.Font.Bold = 0
    $objSelection.TypeParagraph()

    # - Insert the picture
    if ($picPath)
    {
        $wrap = [Microsoft.Office.Interop.Word.WdWrapType]::wdWrapSquare # WdWrapTopBottom
        $objShape = $objDoc.Shapes
        # Configure the shape returned by AddPicture. Shapes.Range(1) would
        # always address the first shape in the document, so from the second
        # talk onwards the new picture never received its wrap setting.
        $picture = $objShape.AddPicture($picPath)
        $picture.WrapFormat.Type = $wrap
    }
    # EndKey returns a value; discard it so it does not pollute the output.
    $null = $objSelection.EndKey()

    # - Write the kicker in bold italics
    $objSelection.Font.Bold = 1
    $objSelection.Font.Italic = 1
    $objSelection.Font.Name = $bodyFont
    $objSelection.TypeText($kicker)
    $objSelection.TypeParagraph()
    $objSelection.Font.Bold = 0
    $objSelection.Font.Italic = 0

    # - Collect the paragraphs
    # Elements p3, p4, ... are read until the first id that is missing or empty.
    $paragraphs = @()
    $count = 3
    $line = 1
    do
    {
        [string]$p = ($html.getElementById("p$count")).InnerText
        if ($p)
        {
            $paragraphs += [PSCustomObject]@{ p = $p; line = $line }
            $line = $line + 1
        }
        $count = $count + 1
    } while ($p)

    # - Write the paragraphs
    foreach ($paragraph in $paragraphs)
    {
        # Splitting on a pattern with a capture group keeps the markers in the
        # result, so plain text and footnote markers alternate.
        $results = [regex]::Split($paragraph.p, $footnoteRefPattern)

        $objSelection.Style = "Normal"
        $objSelection.Font.SuperScript = 0
        $objSelection.Font.Size = 9
        $objSelection.Font.Name = $bodyFont
        $objSelection.TypeText("$($paragraph.line). ")

        foreach ($item in $results)
        {
            if ($item -match $footnoteRefPattern)
            {
                # Footnote marker: superscript for this run only.
                $objSelection.Font.SuperScript = 1
                $objSelection.TypeText("$item")
                $objSelection.Font.SuperScript = 0
            }
            elseif ($item -ne " ")
            {
                $objSelection.Font.SuperScript = 0
                $objSelection.Font.Size = 9
                $objSelection.Font.Name = $bodyFont
                $objSelection.TypeText("$item")
            }
        }
        $objSelection.TypeParagraph()
    }

    # - Work on the notes
    # Elements note1, note2, ... are read until the first id that is missing or empty.
    $notes = @()
    $count = 1
    $line = 1
    do
    {
        [string]$n = ($html.getElementById("note$count")).InnerText
        if ($n)
        {
            $notes += [PSCustomObject]@{ n = $n; line = $line }
            $line = $line + 1
        }
        $count = $count + 1
    } while ($n)

    # The leading vertical tab is typed ahead of the notes heading.
    $objSelection.TypeText("`vNOTES:")
    $objSelection.TypeParagraph()
    foreach ($note in $notes)
    {
        $objSelection.Font.Size = 9
        $objSelection.Font.Name = $bodyFont
        $objSelection.TypeText("$($note.line). $($note.n)")
        $objSelection.TypeText("`r`n")
    }

    # Each talk ends with a page break.
    $objSelection.InsertNewPage()
}
# Save the finished document, close it and shut Word down.
$objDoc.saveas($wordDocPath)
$objDoc.close()
$objWord.quit()

# Setting a variable to $null does not release the underlying COM object; a
# hidden WINWORD process can stay alive until every reference is released.
foreach ($comRef in @($objSelection, $objDoc, $objWord))
{
    if ($null -ne $comRef)
    {
        [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($comRef)
    }
}
$objSelection = $null
$objDoc = $null
$objWord = $null
[System.GC]::Collect()
[System.GC]::WaitForPendingFinalizers()
来自html的示例段落(带有样式元素):
<p id="p1">This is an example with <em>italic</em> text and <strong>bold</strong> text.</p>
# Code snipped. This version performs similar functions to collect information on each talk, creates the temp folder and the word doc. Then:
# Open the saved HTML file in Word, remove everything that precedes the talk
# title and save the result as a Word document.
# NOTE(review): $htmlPath, $title and $wordDocPath are assumed to be set by the
# snipped code, and the Word interop assembly is assumed to be loaded so that
# the WdSaveFormat type can be resolved - confirm.
[ref]$SaveFormat = "microsoft.office.interop.word.WdSaveFormat" -as [type]
$word = New-Object -ComObject word.application
$document = $null
try
{
    $word.visible = $false
    $document = $word.documents.open($htmlPath)
    $selection = $word.Selection
    $paras = $document.Paragraphs

    # Locate the title paragraph first, without modifying the document.
    # Removing paragraphs while enumerating the collection can skip entries.
    $titlePara = $null
    foreach ($para in $paras)
    {
        if ($para.Range.Text -like "$title*")
        {
            $titlePara = $para
            break
        }
    }

    if ($titlePara)
    {
        # Delete the leading content in one operation. Range.Delete leaves the
        # clipboard alone, unlike selecting and cutting each paragraph.
        $docStart = $document.Content.Start
        $titleStart = $titlePara.Range.Start
        if ($titleStart -gt $docStart)
        {
            [void]$document.Range($docStart, $titleStart).Delete()
        }
    }
    else
    {
        # Leave the document untouched rather than wiping all of its content
        # when the title cannot be found.
        Write-Warning "Title '$title' not found in $htmlPath; nothing was removed."
    }

    $document.saveas([ref] $wordDocPath, [ref]$SaveFormat::wdFormatDocumentDefault)
    $document.close()
}
finally
{
    # Always shut Word down and release the COM references, even on failure.
    $word.Quit()
    if ($null -ne $document)
    {
        [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($document)
    }
    [void][System.Runtime.InteropServices.Marshal]::ReleaseComObject($word)
    $document = $null
    $word = $null
}
这并不是剔除多余元素的完整代码,它只是先清除演讲标题元素之前的所有内容。在这之后还有其他行必须删除,而且如前所述,演讲正文段落之后还有若干元素也需要删除。完成这些之后,我还需要处理 Word 中的每个段落,使其字体、字号、分栏、标题等与最终成品一致。
问题:
提前致谢。
我用的是这种方法:对 HTML 文件 html_file 调用 Range.InsertFile(html_file),这样可以保留 HTML 中的所有样式。