使用 ElementTree 和 BeautifulSoup 解析 XML 时遇到内存问题

问题描述 投票:0回答:1

我正在尝试使用 BeautifulSoup 和 ElementTree 解析 XML,但由于 XML 文件大小为 15,98,040 KB(约 1.5 GB),解析时会导致我的电脑崩溃。是否可以解析这么大的文件?还是代码本身有问题?

XML 文件示例

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE raml SYSTEM 'raml20.dtd'>
<raml version="2.0" xmlns="raml20.xsd">
  <cmData type="actual">
    <header>
      <log dateTime="2023-12-20T16:11:15.000+05:30" action="created" appInfo="ActualExporter">InternalValues are used</log>
    </header>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-1" id="14823863">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277215</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">16</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-131" id="14823714">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277240</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">18</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">0</p>
    </managedObject>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-132" id="14823717">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277227</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">18</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">0</p>
    </managedObject>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-17" id="14823804">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277552</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">16</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">0</p>
    </managedObject>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-173" id="14823810">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277234</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">1</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">0</p>
    </managedObject>
    <managedObject class="NRREL" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-0/NRREL-180" id="45703377">
      <p name="cellIndividualSsbRsrpOffset">24</p>
      <p name="cellIndividualSsbRsrqOffset">24</p>
      <p name="gNbId">277572</p>
      <p name="gNbIdLength">22</p>
      <list name="gnbPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="handoverAllowedSA">1</p>
      <p name="handoverAllowedSARedCap">0</p>
      <p name="isRplCellForPSCell">0</p>
      <p name="lcrId">2</p>
      <p name="nrDcAllowed">0</p>
      <p name="removeNotAllowed">0</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-91" id="14823899">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">28495</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">49</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-92" id="14823900">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">22523</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">22</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-93" id="14823901">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">8612</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">49</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-94" id="14823903">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">26480</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">36</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-95" id="14823887">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">7948</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">16</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
    <managedObject class="NRRELE" version="NRBTSCL23R4_2315_120" distName="PLMN-PLMN/MRBTS-277215/NRBTS-277215/NRCELL-2/NRRELE-96" id="14823888">
      <p name="cellIndividualOffset">24</p>
      <p name="eNodeBId">32</p>
      <list name="ecgiPlmn">
        <item>
          <p name="mcc">405</p>
          <p name="mnc">854</p>
          <p name="mncLength">3</p>
        </item>
      </list>
      <p name="eutranCellBlacklisted">0</p>
      <p name="lcrId">50</p>
      <p name="removeNotAllowed">1</p>
    </managedObject>
  </cmData>
</raml>

预期输出表1

NRREL

       MRBTS           NRBTS           NRCELL          NRREL           id          cellIndividualSsbRsrpOffset         cellIndividualSsbRsrqOffset         gNbId           gNbIdLength         gnbPlmn         Item-gnbPlmn-mcc        Item-gnbPlmn-mnc        Item-gnbPlmn-mncLength          handoverAllowedSA           handoverAllowedSARedCap         isRplCellForPSCell          lcrId           nrDcAllowed         removeNotAllowed
277215  277215  0   1   14823863    24  24  277215  22  List    405 854 3   1   0   0   16  0   1
277215  277215  0   131 14823714    24  24  277240  22  List    405 854 3   1   0   0   18  0   0
277215  277215  0   132 14823717    24  24  277227  22  List    405 854 3   1   0   0   18  0   0
277215  277215  0   17  14823804    24  24  277552  22  List    405 854 3   1   0   0   16  0   0
277215  277215  0   173 14823810    24  24  277234  22  List    405 854 3   1   0   0   1   0   0
277215  277215  0   180 45703377    24  24  277572  22  List    405 854 3   1   0   0   2   0   0

预期输出表2

NRRELE


       MRBTS           NRBTS           NRCELL          NRRELE          id          cellIndividualOffset        eNodeBId        ecgiPlmn        Item-ecgiPlmn-mcc           Item-ecgiPlmn-mnc           Item-ecgiPlmn-mncLength         eutranCellBlacklisted           lcrId           removeNotAllowed
277215  277215  2   91  14823899    24  28495   List    405 854 3   0   49  1
277215  277215  2   92  14823900    24  22523   List    405 854 3   0   22  1
277215  277215  2   93  14823901    24  8612    List    405 854 3   0   49  1
277215  277215  2   94  14823903    24  26480   List    405 854 3   0   36  1
277215  277215  2   95  14823887    24  7948    List    405 854 3   0   16  1
277215  277215  2   96  14823888    24  32  List    405 854 3   0   50  1

代码1

    # Extract the XML data.
#
# The input can be very large (the question mentions a file of roughly 1.5 GB),
# so the document is streamed with ET.iterparse instead of being loaded as one
# tree with ET.parse. Each <managedObject> is converted to a plain dict as soon
# as its closing tag has been read and is then detached from its parent, so
# already-processed elements are not kept alive by the partially built tree.
import xml.etree.ElementTree as ET
import pandas as pd; import re


def _local_name(tag):
    """Return the element tag without its "{namespace}" prefix."""
    return tag.split("}")[-1]


def _to_numeric_if_possible(column):
    """Convert a column to a numeric dtype, or return it unchanged on failure.

    Replaces pd.to_numeric(..., errors="ignore"), which is deprecated in
    recent pandas releases in favour of catching the exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Maps each managedObject "class" attribute to the list of row dicts of that class.
out = {}

# Stack of the elements that are currently open; after popping the element
# that just ended, the top of the stack is its parent.
open_elements = []

for event, elem in ET.iterparse("test.xml", events=("start", "end")):
    if event == "start":
        open_elements.append(elem)
        continue

    open_elements.pop()
    if _local_name(elem.tag) != "managedObject":
        continue
    mo = elem

    tmp = dict(
        # "PLMN-PLMN/MRBTS-277215/..." -> [("MRBTS", "277215"), ...]; the
        # leading PLMN pair is dropped by the [1:] slice.
        re.findall(r"([^/]+?)-([^/]+)", mo.attrib["distName"])[1:],
        id=mo.attrib["id"],
        # Every descendant carrying a "name" attribute becomes a column:
        # <p> elements contribute their text, any other element (e.g. <list>)
        # contributes its own tag name.
        **{ele.attrib["name"]: ele.text if ((tn := _local_name(ele.tag)) == "p")
           else tn for ele in mo.findall(".//*") if ele.attrib.get("name")},
    )

    out.setdefault(mo.attrib["class"], []).append(tmp)

    # Release the memory held by this element and drop it from its parent.
    # NOTE(review): assumes managedObject elements are never nested inside one
    # another (true for the sample shown) -- confirm for the real export.
    mo.clear()
    if open_elements:
        open_elements[-1].remove(mo)

# to access one of the dataframes, use dict-indexing (e.g `dfs["NRREL"]`)
dfs = {k: pd.DataFrame(v) for k, v in out.items()}

# Writing to excel: one sheet per managedObject class.
with pd.ExcelWriter("output.xlsx") as writer:
    for k, frame in dfs.items():
        (frame.apply(_to_numeric_if_possible)
              .to_excel(writer, sheet_name=k, index=False))

代码2

## ALTERNATE TO ABOVE
#
# NOTE: BeautifulSoup builds the complete document tree in memory, so this
# variant is only suitable for files that comfortably fit in RAM; for very
# large inputs a streaming parser is the better choice.
from bs4 import BeautifulSoup
import pandas as pd; import re


def _to_numeric_if_possible(column):
    """Convert a column to a numeric dtype, or return it unchanged on failure.

    Replaces pd.to_numeric(..., errors="ignore"), which is deprecated in
    recent pandas releases in favour of catching the exception explicitly.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Open in binary mode and hand the file object to the parser so the encoding
# is taken from the document itself rather than from the platform's default
# text encoding.
with open("test.xml", "rb") as f:
    soup = BeautifulSoup(f, "xml")

# Maps each managedObject "class" attribute to the list of row dicts of that class.
out = {}
for mo in soup.select("managedObject"):
    # First descendant that is not a <p> (e.g. the <list> element); None when
    # the managedObject contains only <p> children.
    first_non_p = mo.select_one(":not(p)")
    tmp = dict(
        # "PLMN-PLMN/MRBTS-277215/..." -> [("MRBTS", "277215"), ...]; the
        # leading PLMN pair is dropped by the [1:] slice.
        re.findall(r"([^/]+?)-([^/]+)", mo["distName"])[1:], id=mo["id"],
        # Each <p name="..."> becomes a column holding its text.
        **{t["name"]: t.get_text() for t in mo.select("p")},
        # The first non-<p> element becomes a column holding its tag name.
        **({first_non_p["name"]: first_non_p.name}
           if first_non_p is not None else {}),
    )

    out.setdefault(mo["class"], []).append(tmp)

# One sheet per managedObject class.
with pd.ExcelWriter("output_all.xlsx") as writer:
    for k, v in out.items():
        (pd.DataFrame(v).apply(_to_numeric_if_possible)
             .to_excel(writer, sheet_name=k, index=False))

# to access one of the dataframes, use dict-indexing (e.g `dfs["NRREL"]`)
dfs = {k: pd.DataFrame(v) for k, v in out.items()}

以上两段代码是在 Timeleee 的帮助下完成的……但我无法在这里标注他以注明出处。

python pandas xml beautifulsoup elementtree
1个回答
0
投票

当 XML 文件非常大时,需要使用 XmlReader 以流式方式读取。我采用了您在 12 月 30 日发布的帖子中的代码,并将其修改为在 PowerShell 中配合 XmlReader 使用。

# Streams a large XML file with XmlReader and turns every <managedObject>
# element into one row object. Only one <managedObject> subtree is materialised
# at a time; the collected rows are written to a CSV file and printed as one
# table per class.
using assembly System.Xml 
using assembly System.Xml.Linq 

$xmlFilename = 'c:\temp\test.xml'
$csvFilename = 'c:\temp\test.csv'

# The document declares a DOCTYPE; ignore it instead of failing on it.
$settings = [System.Xml.XmlReaderSettings]::new()
$settings.DtdProcessing = [System.Xml.DtdProcessing]::Ignore

$reader = [System.Xml.XmlReader]::Create($xmlFilename, $settings)

$table = [System.Collections.ArrayList]::new()

try
{
    While(-not $reader.EOF)
    {
        # Advance to the next <managedObject> unless the reader is already on one
        # (XElement.ReadFrom leaves the reader positioned after the element it read).
        if($reader.Name -ne 'managedObject')
        {
            $reader.ReadToFollowing('managedObject') | out-null;
        }
        if(-not $reader.EOF)
        {
            $managedObject = [System.Xml.Linq.XElement][System.Xml.Linq.XElement]::ReadFrom($reader);

            # Child elements live in the same (default) namespace as the
            # managedObject itself, so names must be qualified with it to match.
            $ns = $managedObject.Name.Namespace

            $class = $managedObject.Attribute('class').Value
            $newRow = [pscustomobject]@{
               Class = $class
            }

            # distName looks like "PLMN-PLMN/MRBTS-277215/..."; every "key-value"
            # segment becomes a column. Split on the first '-' only so that a
            # value containing a hyphen is kept whole.
            $distName = $managedObject.Attribute('distName').Value
            foreach ($segment in $distName.Split('/'))
            {
                $obj = $segment -split '-', 2
                $newRow | Add-Member -NotePropertyName $obj[0] -NotePropertyValue $obj[1]
            }

            $newRow | Add-Member -NotePropertyName id -NotePropertyValue $managedObject.Attribute('id').Value

            # Every <p name="..."> descendant becomes a column holding its text.
            foreach ($p in $managedObject.Descendants($ns.GetName('p')))
            {
                $name = $p.Attribute('name').Value
                $value = $p.Value
                $newRow | Add-Member -NotePropertyName $name -NotePropertyValue $value
            }

            $table.Add($newRow) | out-null
        }
    }
}
finally
{
    # Always release the file handle, even when parsing fails part-way.
    $reader.Dispose()
}

$table

# NOTE(review): Export-CSV takes its columns from the first object it receives,
# so properties that only exist on rows of other classes are not written.
# Consider exporting one file per class if all columns are needed.
$table | Export-CSV -Path $csvFilename -NoTypeInformation

$classes = $table | Group-Object -Property Class

$classes | foreach { $_.Group | Format-Table }

这是结果

Class PLMN MRBTS  NRBTS  NRCELL NRREL id       cellIndividualSsbRsrpOffset cellIndividualSsbRsrqOffset gNbId
----- ---- -----  -----  ------ ----- --       --------------------------- --------------------------- -----
NRREL PLMN 277215 277215 0      1     14823863 24                          24                          277215
NRREL PLMN 277215 277215 0      131   14823714 24                          24                          277240
NRREL PLMN 277215 277215 0      132   14823717 24                          24                          277227



Class  PLMN MRBTS  NRBTS  NRCELL NRRELE id       cellIndividualOffset eNodeBId mcc
-----  ---- -----  -----  ------ ------ --       -------------------- -------- ---
NRRELE PLMN 277215 277215 2      95     14823887 24                   7948     405
NRRELE PLMN 277215 277215 2      96     14823888 24                   32       405
© www.soinside.com 2019 - 2024. All rights reserved.