我从一个在线词典中抓取了一些HTML,想把它转换为XML,以便最终生成用于构建BK树的单词列表。这个在线词典记录了单词的不同拼写,但有时是把可能出现也可能不出现的元音或词尾放在括号里来表示的,如下所示:
<td>
<span class="FORM">
<span class="HDORTH">a</span>
<span class="POS"> indef. art. </span> Also
<span class="ORTH">an</span>. Early forms: as subj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on</span>,
<span class="ORTH">o</span>; as obj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>,
<span class="ORTH">o</span>, & (chiefly masc.)
<span class="ORTH">an(n)e</span>,
<span class="ORTH">æn(n)e</span>,
<span class="ORTH">en(n)e</span>,
<span class="ORTH">en</span>; after prep.,chiefly
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>, masc. also
<span class="ORTH">anne</span>,
<span class="ORTH">æn(n)e</span>, fem. also
<span class="ORTH">anre</span>,
<span class="ORTH">are</span>,
<span class="ORTH">hare</span>,
<span class="ORTH">ore</span>; gen.
<span class="ORTH">anes</span>,
<span class="ORTH">æn(n)es</span>,
<span class="ORTH">en(n)es</span>.</span>
</td>
我编写了以下XQuery,把HTML转换为XML:去掉所有不在标签内的内容,并根据span的class属性来选择元素:
(:~
 : Maps each input node to an output node according to its class attribute:
 :   class="HDORTH" -> <headword>
 :   class="POS"    -> <part_of_speech>
 :   class="ORTH"   -> <variant>
 : The new element receives the text-node children of the input node.
 : Any node with a different (or no) class attribute is returned unchanged.
 :
 : @param $nodes the nodes to convert
 : @return one node per input node, in the same order
 :)
declare function local:node-change($nodes as node()*) as node()* {
    for $node in $nodes
    let $class := $node/@class
    let $content := $node/text()
    (: Name of the replacement element, or the empty sequence when the
       class is not one of the three recognised values. :)
    let $tag :=
        if ($class = "HDORTH") then "headword"
        else if ($class = "POS") then "part_of_speech"
        else if ($class = "ORTH") then "variant"
        else ()
    return
        if (exists($tag))
        then element { $tag } { $content }
        else $node
} ;
<list>
{
    (: $collection is expected to be bound outside this snippet; the
       "?select=*.xml" suffix is appended before it is passed to collection().
       NOTE(review): the select parameter looks like processor-specific
       collection URI syntax (e.g. Saxon) - confirm for your processor. :)
    let $uri := concat($collection, '?select=*.xml')
    for $doc in collection($uri)
    (: Entry number: take the last path segment of the document URI, keep the
       part before the first ".", drop its first three characters and convert
       the remainder to a number. :)
    let $ref := number(substring(substring-before(tokenize(document-uri($doc), "/")[last()], "."), 4))
    let $converted := local:node-change($doc/td/span/*)
    (: disabled post-processing step: local:stripleftparen($converted) :)
    order by $ref
    return
        <entry ref="{$ref}">{$converted}</entry>
}
</list>
这将返回以下XML:
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>o</variant>
<variant>an(n)e</variant>
<variant>æn(n)e</variant>
<variant>en(n)e</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on(e</variant>
<variant>anne</variant>
<variant>æn(n)e</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>æn(n)es</variant>
<variant>en(n)es</variant>
</entry>
我现在需要做的是克隆那些带括号的节点,这样我就可以修改克隆出来的副本,得到下面的结果,但我不知道该怎么做。
<entry ref="3">
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>
</entry>
我知道需要用substring、substring-before或substring-after来实际修改节点,但我卡住的地方是克隆这一步本身。
copy在for/return循环中不起作用,而我在网上找到的资料要么是讲复制节点的,要么是讲去除重复数据的(去重我之后也需要做,但我想先得到完全符合预期的结果,然后再去重)。如何复制节点、修改副本并输出结果,从而得到我想要的内容?
我不太清楚具体的规则是什么。但在我看来,你应该可以在node-change函数里一次完成所有操作。
我认为你可以按下面的思路来做:
(:~
 : Sketch of a node-change function that emits more than one output node
 : per input node.
 :
 : For each input node the first result is:
 :   class="HDORTH"              -> <headword> with the node's text
 :   class="POS"                 -> <part_of_speech> with the node's text
 :   class="ORTH" containing ")" -> <variant> with both parentheses removed
 :   any other class="ORTH"      -> <variant> with the node's text
 :   anything else               -> the node itself, unchanged
 : The second result is a placeholder element that still has to be replaced
 : by the logic for the additional spelling.
 :
 : @param $nodes the nodes to convert
 : @return two nodes per input node: the converted node and the placeholder
 :)
declare function local:node-change($nodes as node()*) as node()* {
    for $span in $nodes
    let $varient1 :=
        if ($span/@class = "HDORTH") then <headword>{$span/text()}</headword>
        else if ($span/@class = "POS") then <part_of_speech>{$span/text()}</part_of_speech>
        else if ($span/@class = "ORTH" and contains($span/text(),')')) then <variant>{translate($span/text(),'()','')}</variant>
        (: ORTH spellings without a closing parenthesis must still become
           <variant>; without this branch they would be returned as the
           raw <span>. :)
        else if ($span/@class = "ORTH") then <variant>{$span/text()}</variant>
        else $span
    (: Implement some if here to get the other variant if needed :)
    let $varient2 := <varient/>
    return
        ($varient1, $varient2)
} ;
如果有三种变体(我不确定),只需按同样的模式处理。当然,任何一个附加变体的else分支都可以返回一个空元素(例如 else <empty />),最后再把结果中的所有<empty />删掉即可。
大概可以这样写(规则是我猜的):
xquery version "3.0";
(:~
 : Converts dictionary spans to output elements and expands optional letters
 : written in parentheses. Every input node yields exactly three nodes;
 : unused slots are filled with <empty/> so the caller can filter them out.
 :
 :   class="HDORTH"          -> (<headword>, <empty/>, <empty/>)
 :   class="POS"             -> (<part_of_speech>, <empty/>, <empty/>)
 :   class="ORTH", "(" only  -> (text before "(", text with "(" removed, <empty/>)
 :   class="ORTH", "(" + ")" -> (text before "(" joined with text after ")",
 :                               <empty/>, text with both parentheses removed)
 :   class="ORTH", no parens -> (<variant> with the text, <empty/>, <empty/>)
 :   anything else           -> (the node itself, <empty/>, <empty/>)
 :
 : @param $nodes the nodes to convert
 : @return three nodes per input node, in input order
 :)
declare function local:node-change($nodes as node()*) as node()* {
    for $node in $nodes
    let $class := $node/@class
    let $text := $node/text()
    return
        if ($class = "HDORTH") then
            (<headword>{$text}</headword>, <empty/>, <empty/>)
        else if ($class = "POS") then
            (<part_of_speech>{$text}</part_of_speech>, <empty/>, <empty/>)
        else if ($class = "ORTH") then (
            let $open := contains($text, '(')
            let $close := contains($text, ')')
            return
                if ($open and not($close)) then
                    (: e.g. "on(e": short form first, then the long form :)
                    (<variant>{substring-before($text, '(')}</variant>,
                     <variant>{translate($text, '(', '')}</variant>,
                     <empty/>)
                else if ($open and $close) then
                    (: e.g. "an(n)e": short form first, long form in the third slot :)
                    (<variant>{concat(substring-before($text, '('), substring-after($text, ')'))}</variant>,
                     <empty/>,
                     <variant>{translate($text, '()', '')}</variant>)
                else if ($close) then
                    (: ")" without "(" is not covered by any rule: pass the node through :)
                    ($node, <empty/>, <empty/>)
                else
                    (<variant>{$text}</variant>, <empty/>, <empty/>)
        )
        else
            ($node, <empty/>, <empty/>)
} ;
(: Inline copy of one dictionary cell, used as test input.
   The ampersand in the running text is written as &amp; because a bare "&"
   is not allowed inside a direct element constructor. :)
let $cell := <td>
<span class="FORM">
<span class="HDORTH">a</span>
<span class="POS"> indef. art. </span> Also
<span class="ORTH">an</span>. Early forms: as subj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on</span>,
<span class="ORTH">o</span>; as obj.,
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>,
<span class="ORTH">o</span>, &amp; (chiefly masc.)
<span class="ORTH">an(n)e</span>,
<span class="ORTH">æn(n)e</span>,
<span class="ORTH">en(n)e</span>,
<span class="ORTH">en</span>; after prep.,chiefly
<span class="ORTH">ane</span>,
<span class="ORTH">on(e</span>, masc. also
<span class="ORTH">anne</span>,
<span class="ORTH">æn(n)e</span>, fem. also
<span class="ORTH">anre</span>,
<span class="ORTH">are</span>,
<span class="ORTH">hare</span>,
<span class="ORTH">ore</span>; gen.
<span class="ORTH">anes</span>,
<span class="ORTH">æn(n)es</span>,
<span class="ORTH">en(n)es</span>.</span>
</td>
(: Only the element children of the outer FORM span are converted; the
   running text between them is not selected. :)
let $s := $cell/span/*
let $c := local:node-change($s)
return
(: Drop the <empty/> fillers produced by local:node-change. :)
$c[not(local-name()='empty')]
返回:
<headword>a</headword>
<part_of_speech> indef. art. </part_of_speech>
<variant>an</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>o</variant>
<variant>ane</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>ene</variant>
<variant>enne</variant>
<variant>en</variant>
<variant>ane</variant>
<variant>on</variant>
<variant>one</variant>
<variant>anne</variant>
<variant>æne</variant>
<variant>ænne</variant>
<variant>anre</variant>
<variant>are</variant>
<variant>hare</variant>
<variant>ore</variant>
<variant>anes</variant>
<variant>ænes</variant>
<variant>ænnes</variant>
<variant>enes</variant>
<variant>ennes</variant>