我正在尝试使用 BeautifulSoup 和 ElementTree 解析 XML,但这会导致我的电脑崩溃,因为 XML 文件大小为 15,98,040 KB(约 1.5 GB)。是否可以解析这么大的文件?或者是代码有什么问题吗?
XML 文件示例
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE raml SYSTEM 'raml20.dtd'>
<raml version="2.0" xmlns="raml20.xsd">
<cmData type="actual">
<header>
<log dateTime="2023-12-20T16:11:15.000+05:30" action="created" appInfo="ActualExporter">InternalValues are used</log>
</header>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-1" id="14823863">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277215</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">16</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-131" id="14823714">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277240</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">18</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">0</p>
</managedObject>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-132" id="14823717">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277227</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">18</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">0</p>
</managedObject>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-17" id="14823804">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277552</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">16</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">0</p>
</managedObject>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-173" id="14823810">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277234</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">1</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">0</p>
</managedObject>
<managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-180" id="45703377">
<p name="cellIndividualSsbRsrpOffset">24</p>
<p name="cellIndividualSsbRsrqOffset">24</p>
<p name="gNbId">277572</p>
<p name="gNbIdLength">22</p>
<list name="gnbPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="handoverAllowedSA">1</p>
<p name="handoverAllowedSARedCap">0</p>
<p name="isRplCellForPSCell">0</p>
<p name="lcrId">2</p>
<p name="nrDcAllowed">0</p>
<p name="removeNotAllowed">0</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-91" id="14823899">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">28495</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">49</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-92" id="14823900">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">22523</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">22</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-93" id="14823901">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">8612</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">49</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-94" id="14823903">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">26480</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">36</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-95" id="14823887">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">7948</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">16</p>
<p name="removeNotAllowed">1</p>
</managedObject>
<managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-96" id="14823888">
<p name="cellIndividualOffset">24</p>
<p name="eNodeBId">32</p>
<list name="ecgiPlmn">
<item>
<p name="mcc">405</p>
<p name="mnc">854</p>
<p name="mncLength">3</p>
</item>
</list>
<p name="eutranCellBlacklisted">0</p>
<p name="lcrId">50</p>
<p name="removeNotAllowed">1</p>
</managedObject>
</cmData>
</raml>
预期输出表1
NRREL
MRBTS NRBTS NRCELL NRREL id cellIndividualSsbRsrpOffset cellIndividualSsbRsrqOffset gNbId gNbIdLength gnbPlmn Item-gnbPlmn-mcc Item-gnbPlmn-mnc Item-gnbPlmn-mncLength handoverAllowedSA handoverAllowedSARedCap isRplCellForPSCell lcrId nrDcAllowed removeNotAllowed
277215 277215 0 1 14823863 24 24 277215 22 List 405 854 3 1 0 0 16 0 1
277215 277215 0 131 14823714 24 24 277240 22 List 405 854 3 1 0 0 18 0 0
277215 277215 0 132 14823717 24 24 277227 22 List 405 854 3 1 0 0 18 0 0
277215 277215 0 17 14823804 24 24 277552 22 List 405 854 3 1 0 0 16 0 0
277215 277215 0 173 14823810 24 24 277234 22 List 405 854 3 1 0 0 1 0 0
277215 277215 0 180 45703377 24 24 277572 22 List 405 854 3 1 0 0 2 0 0
预期输出表2
NRRELE
MRBTS NRBTS NRCELL NRRELE id cellIndividualOffset eNodeBId ecgiPlmn Item-ecgiPlmn-mcc Item-ecgiPlmn-mnc Item-ecgiPlmn-mncLength eutranCellBlacklisted lcrId removeNotAllowed
277215 277215 2 91 14823899 24 28495 List 405 854 3 0 49 1
277215 277215 2 92 14823900 24 22523 List 405 854 3 0 22 1
277215 277215 2 93 14823901 24 8612 List 405 854 3 0 49 1
277215 277215 2 94 14823903 24 26480 List 405 854 3 0 36 1
277215 277215 2 95 14823887 24 7948 List 405 854 3 0 16 1
277215 277215 2 96 14823888 24 32 List 405 854 3 0 50 1
代码1
# Extract the XML data.
#
# The document is streamed with ET.iterparse() rather than loaded with
# ET.parse(): parse() materialises the whole tree in memory, which is what
# exhausts RAM on a multi-gigabyte export. Each <managedObject> is turned into
# a flat row as soon as its end tag is seen and is then detached from its
# parent so the partially built tree never grows.
import re
import xml.etree.ElementTree as ET

import pandas as pd

out = {}  # managedObject class name -> list of row dicts
open_elements = []  # elements whose end tag has not been seen yet
for event, elem in ET.iterparse("test.xml", events=("start", "end")):
    if event == "start":
        open_elements.append(elem)
        continue
    open_elements.pop()  # this is `elem`; its parent (if any) is now on top
    # Tags arrive as "{namespace}local"; compare on the local name only.
    if elem.tag.split("}")[-1] != "managedObject":
        continue
    mo = elem
    tmp = dict(
        # "A-1/B-2/..." -> [("A", "1"), ("B", "2"), ...]; the first pair is dropped.
        re.findall(r"([^/]+?)-([^/]+)", mo.attrib["distName"])[1:],
        id=mo.attrib["id"],
        # Every named descendant becomes a column: <p> contributes its text,
        # any other named element (e.g. <list>) contributes its tag name.
        **{ele.attrib["name"]: ele.text if ((tn := ele.tag.split("}")[-1]) == "p")
           else tn for ele in mo.findall(".//*") if ele.attrib.get("name")},
    )
    out.setdefault(mo.attrib["class"], []).append(tmp)
    # Drop the processed subtree so memory use stays flat.
    if open_elements:
        open_elements[-1].remove(mo)

# Write one worksheet per managedObject class.
with pd.ExcelWriter("output.xlsx") as writer:
    for k, v in out.items():
        (pd.DataFrame(v).apply(pd.to_numeric, errors="ignore")
           .to_excel(writer, sheet_name=k, index=False))

# To access one of the dataframes, use dict-indexing (e.g. `dfs["NRREL"]`).
dfs = {k: pd.DataFrame(v) for k, v in out.items()}
代码2
## ALTERNATE TO ABOVE
#
# BeautifulSoup builds the entire tree in memory, so this variant is only
# suitable for files that fit comfortably in RAM.
import re

import pandas as pd
from bs4 import BeautifulSoup

# Open in binary mode and hand the file object to the parser so the encoding
# named in the XML declaration is used instead of the platform default.
with open("test.xml", "rb") as f:
    soup = BeautifulSoup(f, "xml")

out = {}  # managedObject class name -> list of row dicts
for mo in soup.select("managedObject"):
    # <p name="..."> elements contribute their text.
    fields = {t["name"]: t.get_text() for t in mo.select("p")}
    # Every other *named* descendant (e.g. <list name="...">) contributes its
    # tag name. Filtering on the attribute avoids an IndexError on objects that
    # have no such element and a KeyError on unnamed ones such as <item>.
    fields.update(
        {t["name"]: t.name for t in mo.select(":not(p)") if t.has_attr("name")}
    )
    tmp = dict(
        # "A-1/B-2/..." -> [("A", "1"), ("B", "2"), ...]; the first pair is dropped.
        re.findall(r"([^/]+?)-([^/]+)", mo["distName"])[1:],
        id=mo["id"],
        **fields,
    )
    out.setdefault(mo["class"], []).append(tmp)

# Write one worksheet per managedObject class.
with pd.ExcelWriter("output_all.xlsx") as writer:
    for k, v in out.items():
        (pd.DataFrame(v).apply(pd.to_numeric, errors="ignore")
           .to_excel(writer, sheet_name=k, index=False))

dfs = {k: pd.DataFrame(v) for k, v in out.items()}
以上两段代码是在 Timeleee 的帮助下写出的……我没有办法在这里标注他的贡献。
处理巨大的 XML 文件时,需要使用 XmlReader。我采用了您在 12 月 30 日发布的帖子中的代码,并将其修改为在 PowerShell 中配合 XmlReader 使用。
# Streams <managedObject> elements out of a large XML export with XmlReader,
# flattens each one into a row object, then prints the rows, exports them to
# CSV and shows one table per managedObject class.
using assembly System.Xml
using assembly System.Xml.Linq
$xmlFilename = 'c:\temp\test.xml'
$csvFilename = 'c:\temp\test.csv'
$settings = [System.Xml.XmlReaderSettings]::new()
# The document carries a DOCTYPE; skip it instead of trying to resolve the DTD.
$settings.DtdProcessing = [System.Xml.DtdProcessing]::Ignore
$reader = [System.Xml.XmlReader]::Create($xmlFilename, $settings)
$table = [System.Collections.ArrayList]::new()
try
{
    While(-not $reader.EOF)
    {
        # XElement.ReadFrom leaves the reader on the node after the element it
        # consumed, which may already be the next managedObject; only advance
        # when it is not.
        if($reader.Name -ne 'managedObject')
        {
            $reader.ReadToFollowing('managedObject') | out-null;
        }
        if(-not $reader.EOF)
        {
            # Only this one element is materialised in memory at a time.
            $managedObject = [System.Xml.Linq.XElement][System.Xml.Linq.XElement]::ReadFrom($reader);
            $class = $managedObject.Attribute('class').Value
            $newRow = [pscustomobject]@{
                Class = $class
            }
            # distName looks like "A-1/B-2/..."; each segment becomes a column.
            $distName = $managedObject.Attribute('distName').Value
            $distName.Split('/') | foreach { $obj = $_.Split('-'); $newRow | Add-Member -NotePropertyName $obj[0] -NotePropertyValue $obj[1] }
            $newRow | Add-Member -NotePropertyName id -NotePropertyValue $managedObject.Attribute('id').Value
            # Child elements live in the document's default namespace, so the
            # lookup name has to be qualified with the element's own namespace.
            $pName = $managedObject.Name.Namespace.GetName('p')
            $managedObject.Descendants($pName) | foreach { $name = $_.Attribute('name').Value; $value = $_.Value; $newRow | Add-Member -NotePropertyName $name -NotePropertyValue $value }
            $table.Add($newRow) | out-null
        }
    }
}
finally
{
    # Release the file handle even if parsing fails part-way through.
    $reader.Dispose()
}
$table
# NOTE(review): Export-Csv appears to take its columns from the first object,
# so rows of other classes may lose properties the first row lacks - confirm.
$table | Export-CSV -Path $csvFilename -NoTypeInformation
$classes = $table | Group-Object -Property Class
$classes | foreach { $_.Group | Format-Table }
这是结果
Class PLMN MRBTS NRBTS NRCELL NRREL id cellIndividualSsbRsrpOffset cellIndividualSsbRsrqOffset gNbId
----- ---- ----- ----- ------ ----- -- --------------------------- --------------------------- -----
NRREL PLMN 277215 277215 0 1 14823863 24 24 277215
NRREL PLMN 277215 277215 0 131 14823714 24 24 277240
NRREL PLMN 277215 277215 0 132 14823717 24 24 277227
Class PLMN MRBTS NRBTS NRCELL NRRELE id cellIndividualOffset eNodeBId mcc
----- ---- ----- ----- ------ ------ -- -------------------- -------- ---
NRRELE PLMN 277215 277215 2 95 14823887 24 7948 405
NRRELE PLMN 277215 277215 2 96 14823888 24 32 405