我想从下载的xml文件(https://s3.amazonaws.com/irs-form-990/201542399349300614_public.xml)中提取某些数据点。
import pandas as pd
import csv
import os
from os import path
from xml.dom import minidom
from xml.etree import ElementTree
import requests
from bs4 import BeautifulSoup
#from IRS_Download import *
import sys
# For each downloaded filing, parse "<object id>.xml" located next to this
# script and print the text of every <EIN> element it contains.
for o in object_id:
    file_name = o + ".xml"
    basepath = path.dirname(__file__)
    filepath = path.abspath(path.join(basepath, file_name))
    dom = minidom.parse(filepath)
    # getElementsByTagName returns a NodeList of Element objects, not strings;
    # printing it directly only shows "<DOM Element: EIN at 0x...>".
    EmIdN = dom.getElementsByTagName('EIN')
    # The value lives in the text node(s) under each element, so join the
    # data of the text children to get the actual EIN string.
    ein_values = [
        "".join(
            child.data
            for child in element.childNodes
            if child.nodeType == child.TEXT_NODE
        )
        for element in EmIdN
    ]
    print(ein_values)
但是,这仅返回:
[<DOM Element: EIN at 0x1132eecc0>]
有人知道我哪里做错了吗?
我现在这样解决了它:
# Parse the filing and record its EIN, or the placeholder 'Null' when the
# document has no EIN element. `xml_tree` (the XML source) and `EIN` (the
# result list) are expected to be defined by the surrounding script.
# ElementTree is used under the name imported at the top of the file.
tree = ElementTree.parse(xml_tree)
root = tree.getroot()
# Debugging aid: iterate root.iter() and print each element to discover the
# namespace-qualified tag paths available in the document.

# The tag is namespace-qualified, so it must be written as "{uri}tag".
# Look it up once and reuse the result instead of searching the tree twice.
ein_node = tree.find('.//{http://www.irs.gov/efile}EIN')
if ein_node is not None:
    info = ein_node.text
else:
    info = 'Null'
EIN.append(info)