遍历XML文件,解析数据,并写入DataFrame和XLSX文件。

问题描述 投票:0回答:1

预先感谢您的帮助。

我有一个脚本,它遍历某个目录下的xls文件(这些文件实际上是XML文件),解析其中的XML数据,并写入XLS文件。脚本目前可以正常运行。

我想做的是

  1. 把文件写入一个新的目录,而不是同一个目录。
  2. 将每个文件写成'.xlsx'而不是'.xls'。
  3. 将每个文件的数据保留为数据框(DataFrame),这样以后如果需要,就可以把各个数据框追加到同一个Excel文件的同一张工作表中

下面是目前可以正常运行的代码。它把结果写入读取源文件的同一目录,并生成XLS文件。

import os
import pandas as pd
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and exposes the same API.
import xml.etree.ElementTree as ET
import glob

ns = {"doc": "urn:schemas-microsoft-com:office:spreadsheet"}

# In the sample report the first seven rows (indices 0-6) are the title
# block and the "Account"/"Total" header row; data starts at index 7.
FIRST_DATA_ROW = 7


def getvalueofnode(node):
    """Return the text of *node*, or None when the node is missing."""
    return node.text if node is not None else None


def main(root):
    """Build a DataFrame of account rows from a parsed workbook.

    Args:
        root: Root element of the parsed SpreadsheetML document.

    Returns:
        A DataFrame with the columns 'Account' and 'Total', one record per
        ``Row`` element from index ``FIRST_DATA_ROW`` onwards. A cell
        without a ``Data`` child contributes None. The two columns are
        present even when no data rows are found.
    """
    data = [
        {
            'Account': getvalueofnode(row.find('doc:Cell[1]/doc:Data', ns)),
            'Total': getvalueofnode(row.find('doc:Cell[2]/doc:Data', ns)),
        }
        for i, row in enumerate(root.findall('.//doc:Row', ns))
        if i >= FIRST_DATA_ROW
    ]
    return pd.DataFrame(data, columns=['Account', 'Total'])


for filepath in glob.iglob(r'...Documents\Python\Current_Period\*.xls'):
    print(filepath)

    root = ET.parse(filepath).getroot()
    output_df = main(root)

    # NOTE: this appends a second '.xls' to the source name
    # (report.xls -> report.xls.xls) next to the source file, so the outputs
    # also match the glob pattern above on a later run.
    output_df.to_excel(filepath + '.xls', index=False)

源数据。

<?xml version="1.0" encoding="utf-16"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:html="http://www.w3.org/TR/REC-html40">
  <DocumentProperties xmlns="urn:schemas-microsoft-com:office:office">
    <Author>NetSuite Reports</Author>
    <LastAuthor>NetSuite Reports</LastAuthor>
    <Company>NetSuite</Company>
  </DocumentProperties>
  <Styles>
    <Style ss:ID="company">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="12" ss:Bold="1" />
    </Style>
    <Style ss:ID="subcompany">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="14" ss:Bold="1" />
    </Style>
    <Style ss:ID="error">
      <Alignment ss:Horizontal="Center" />
      <Interior ss:Color="#f0d0d0" ss:Pattern="Solid" />
      <Font ss:Bold="1" />
    </Style>
    <Style ss:ID="header_l">
      <Alignment ss:Horizontal="Left" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="header_r">
      <Alignment ss:Horizontal="Right" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="header_c">
      <Alignment ss:Horizontal="Center" />
      <Font ss:Size="7" ss:Bold="1" />
      <Interior ss:Color="#d0d0d0" ss:Pattern="Solid" />
    </Style>
    <Style ss:ID="scheckbox">
      <Alignment ss:Vertical="Center" ss:Horizontal="Center" />
    </Style>
    <Style ss:ID="Default" ss:Name="Normal">
      <Alignment ss:Vertical="Bottom" />
      <Borders />
      <Font ss:FontName="Arial" ss:Size="8" />
      <Interior />
      <NumberFormat />
      <Protection />
    </Style>
    <Style ss:ID="s53">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s52">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s51">
      <Alignment ss:Vertical="Center" ss:Horizontal="Right" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <NumberFormat ss:Format="&quot;€&quot;#,##0.00" />
      <Borders />
    </Style>
    <Style ss:ID="s50">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s58">
      <Alignment ss:Horizontal="Left" ss:Indent="2" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s54">
      <Alignment ss:Vertical="Center" ss:Horizontal="Right" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <NumberFormat ss:Format="&quot;€&quot;#,##0.00" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s59">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
    <Style ss:ID="s56">
      <Alignment ss:Horizontal="Left" ss:Indent="2" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s57">
      <Alignment ss:Horizontal="Left" ss:Indent="3" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="0" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s55">
      <Alignment ss:Horizontal="Left" ss:Indent="1" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders />
    </Style>
    <Style ss:ID="s60">
      <Alignment ss:Vertical="Center" ss:Horizontal="Left" />
      <Font ss:FontName="Arial" ss:Size="8" ss:Color="#000000" ss:Bold="1" ss:Italic="0" />
      <Borders>
        <Border ss:Position="Top" ss:LineStyle="Dash" ss:Weight="1" ss:Color="#cccccc" />
      </Borders>
    </Style>
  </Styles>
  <Worksheet ss:Name="TrialBalance">
    <Table>
      <Row>
        <Cell ss:StyleID="company" ss:MergeAcross="1">
          <Data ss:Type="String">Parent Company</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="company" ss:MergeAcross="1">
          <Data ss:Type="String">Company Holdings Inc. : Company A  B.V.</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String">Trial Balance</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String">End of Feb 2020</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String" />
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="subcompany" ss:MergeAcross="1">
          <Data ss:Type="String" />
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="header_l">
          <Data ss:Type="String">Account</Data>
        </Cell>
        <Cell ss:StyleID="header_r" ss:MergeDown="0" ss:Index="2">
          <Data ss:Type="String">Total</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="s50">
          <Data ss:Type="String">10000 - CASH &amp; CASH EQUIVALENTS</Data>
        </Cell>
        <Cell ss:StyleID="s51" />
      </Row>
      <Row>
        <Cell ss:StyleID="s52">
          <Data ss:Type="String">10101 - Bank - 9999 - Company A - EUR</Data>
        </Cell>
        <Cell ss:StyleID="s51">
          <Data ss:Type="Number">1234567.01</Data>
        </Cell>
      </Row>
      <Row>
        <Cell ss:StyleID="s53">
          <Data ss:Type="String">Total - 10000 - CASH &amp; CASH EQUIVALENTS</Data>
        </Cell>
        <Cell ss:Formula="SUM(R[-1]C)" ss:StyleID="s54">
          <Data ss:Type="Number">1234567.01</Data>
        </Cell>
      </Row>
    </Table>
  </Worksheet>
</Workbook>
python excel pandas xml-parsing
1个回答
0
投票

下面的代码可能会有帮助:

import os
import pandas as pd
# xml.etree.cElementTree was removed in Python 3.9; ElementTree is the
# supported module and exposes the same API.
import xml.etree.ElementTree as ET
import glob

# Clark-notation prefix for elements in the SpreadsheetML namespace.
SS_NS = '{urn:schemas-microsoft-com:office:spreadsheet}'


def iter_docs(root):
    """Yield one list of cell values per row of the first worksheet.

    Args:
        root: Root ``Workbook`` element of a parsed SpreadsheetML document.

    Yields:
        For each ``Row`` in the first worksheet's ``Table``, a list holding
        the text of every cell's ``Data`` element, with None for a cell
        that has no ``Data`` child. Nothing is yielded when the workbook
        has no worksheet or the worksheet has no table.
    """
    worksheet = root.find(SS_NS + 'Worksheet')
    table = worksheet.find(SS_NS + 'Table') if worksheet is not None else None
    if table is None:
        return

    # Select Row elements explicitly: a Table may hold other children
    # (e.g. Column) that must not become empty rows in the output.
    for row in table.iterfind(SS_NS + 'Row'):
        values = []
        for cell in row.iterfind(SS_NS + 'Cell'):
            data = cell.find(SS_NS + 'Data')
            values.append(data.text if data is not None else None)
        yield values


# One DataFrame per source file, kept so they can later be combined
# (e.g. with pd.concat) into a single sheet.
frames = []

for filepath in glob.iglob(r'...Documents\Python\Current_Period\*.xls'):
    print(filepath)

    root = ET.parse(filepath).getroot()

    doc_df = pd.DataFrame(list(iter_docs(root)))
    frames.append(doc_df)

    # Replace the '.xls' extension instead of appending to it, so that
    # report.xls becomes report.xlsx rather than report.xls.xlsx.
    base, _ = os.path.splitext(filepath)
    doc_df.to_excel(base + '.xlsx', index=False)

© www.soinside.com 2019 - 2024. All rights reserved.