我有以下XML代码(整个代码包含成千上万的person id
,这就是我依赖iterparse()
的原因,因为文件非常大):]]
<?xml version="1.0" encoding="utf-8"?> <!DOCTYPE population SYSTEM "http://www.matsim.org/files/dtd/population_v6.dtd"> <population desc="Switzerland Baseline"> <person id="100127"> <attributes> <attribute name="age" class="java.lang.Integer" >11</attribute> <attribute name="censusId" class="java.lang.Integer" >224170</attribute> <attribute name="employed" class="java.lang.Boolean" >false</attribute> <attribute name="hasLicense" class="java.lang.String" >no</attribute> <attribute name="htsId" class="java.lang.Long" >9112520200003</attribute> <attribute name="isOutside" class="java.lang.Boolean" >false</attribute> <attribute name="isPassenger" class="java.lang.Boolean" >true</attribute> <attribute name="ptSubscription" class="java.lang.Boolean" >true</attribute> <attribute name="sex" class="java.lang.String" >m</attribute> </attributes> <plan selected="yes"> <activity type="home" link="220029" facility="home52627" x="647557.28056" y="6864961.034271" end_time="07:49:09" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> <leg mode="access_walk" dep_time="07:49:09" trav_time="00:09:38"> <route type="generic" start_link="220029" end_link="pt_StopPoint:59229" trav_time="00:09:38" distance="692.895772305751"></route> </leg> <activity type="pt interaction" link="220029" x="647557.28056" y="6864961.034271" max_dur="00:00:00" > </activity> <leg mode="pt" dep_time="07:58:47" trav_time="00:13:13"> <route type="enriched_pt" start_link="pt_StopPoint:59229" end_link="pt_StopPoint:59585" trav_time="00:13:13" distance="5488.133844246115">{"inVehicleTime":720.0,"transferTime":73.0,"accessStopIndex":1,"egressStopindex":11,"transitRouteId":"97574868-1_240825","transitLineId":"100110001:1","departureId":"97593123-1_240438_07:58:00"}</route> </leg> <activity type="pt interaction" link="220029" x="647557.28056" y="6864961.034271" max_dur="00:00:00" > </activity> <leg mode="transit_walk" dep_time="08:12:00" trav_time="00:00:32"> <route type="generic" start_link="pt_StopPoint:59585" end_link="pt_StopPoint:59627" trav_time="00:00:32" distance="39.422182688315836"></route> </leg> <activity type="pt interaction" link="pt_StopPoint:59585" x="652159.1523468373" y="6862257.098785016" max_dur="00:00:00" > </activity> <leg mode="pt" dep_time="08:12:32" trav_time="00:17:27"> <route type="enriched_pt" start_link="pt_StopPoint:59627" end_link="pt_StopPoint:59624" trav_time="00:17:27" distance="5813.159959644434">{"inVehicleTime":960.0,"transferTime":87.14818109307089,"accessStopIndex":12,"egressStopindex":25,"transitRouteId":"95327450-1_295653","transitLineId":"100110004:4","departureId":"95327497-1_295565_07:59:00"}</route> </leg> <activity type="pt interaction" link="pt_StopPoint:59585" x="652159.1523468373" y="6862257.098785016" max_dur="00:00:00" > </activity> <leg mode="egress_walk" dep_time="08:30:00" trav_time="00:11:54"> <route type="generic" start_link="pt_StopPoint:59624" end_link="178690" trav_time="00:11:54" distance="856.0619451133888"></route> </leg> <activity type="education" link="178690" facility="16842" x="651100.0" y="6858204.3" start_time="08:19:09" end_time="17:49:09" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> <leg mode="access_walk" dep_time="17:49:09" trav_time="00:04:22"> <route type="generic" start_link="178690" end_link="1185" trav_time="00:04:22" distance="313.6764640548623"></route> </leg> <activity type="pt interaction" link="178690" x="651100.0" y="6858204.3" max_dur="00:00:00" > </activity> <leg mode="pt" dep_time="17:53:31" trav_time="00:05:29"> <route type="enriched_pt" start_link="1185" end_link="413156" trav_time="00:05:29" distance="1302.0939972036185">{"inVehicleTime":300.0,"transferTime":29.0,"accessStopIndex":0,"egressStopindex":4,"transitRouteId":"95450970-1_205771","transitLineId":"100100088:88","departureId":"95450972-1_205754_17:54:00"}</route> </leg> <activity type="pt interaction" link="178690" x="651100.0" y="6858204.3" max_dur="00:00:00" > </activity> <leg mode="transit_walk" dep_time="17:59:00" trav_time="00:01:41"> <route type="generic" start_link="413156" end_link="pt_StopPoint:59547" trav_time="00:01:41" distance="122.21107200064658"></route> </leg> <activity type="pt interaction" link="413156" x="651043.1290909288" y="6859441.216973967" max_dur="00:00:00" > </activity> <leg mode="pt" dep_time="18:00:41" trav_time="00:18:18"> <route type="enriched_pt" start_link="pt_StopPoint:59547" end_link="pt_StopPoint:59244" trav_time="00:18:18" distance="7166.081827475872">{"inVehicleTime":1080.0,"transferTime":18.15743999946426,"accessStopIndex":13,"egressStopindex":27,"transitRouteId":"93653132-1_291567","transitLineId":"100110006:6","departureId":"93653147-1_291586_17:45:00"}</route> </leg> <activity type="pt interaction" link="413156" x="651043.1290909288" y="6859441.216973967" max_dur="00:00:00" > </activity> <leg mode="transit_walk" dep_time="18:19:00" trav_time="00:00:39"> <route type="generic" start_link="pt_StopPoint:59244" end_link="pt_StopPoint:59236" trav_time="00:00:39" distance="46.97102023296232"></route> </leg> <activity type="pt interaction" link="pt_StopPoint:59244" x="648272.9101174484" y="6863974.735813766" max_dur="00:00:00" > </activity> <leg mode="pt" dep_time="18:19:39" trav_time="00:03:20"> <route type="enriched_pt" start_link="pt_StopPoint:59236" end_link="pt_StopPoint:59229" trav_time="00:03:20" distance="1073.5096075636977">{"inVehicleTime":180.0,"transferTime":20.857483139203396,"accessStopIndex":16,"egressStopindex":18,"transitRouteId":"97575531-1_238697","transitLineId":"100110001:1","departureId":"97575477-1_238631_17:57:00"}</route> </leg> <activity type="pt interaction" link="pt_StopPoint:59244" x="648272.9101174484" y="6863974.735813766" max_dur="00:00:00" > </activity> <leg mode="egress_walk" dep_time="18:23:00" trav_time="00:09:38"> <route type="generic" start_link="pt_StopPoint:59229" end_link="220029" trav_time="00:09:38" distance="692.895772305751"></route> </leg> <activity type="home" link="220029" facility="home52627" x="647557.28056" y="6864961.034271" start_time="18:19:09" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> </plan> </person> <person id="100128"> <attributes> <attribute name="age" class="java.lang.Integer" >11</attribute> <attribute name="censusId" class="java.lang.Integer" >224170</attribute> <attribute name="employed" class="java.lang.Boolean" >false</attribute> <attribute name="hasLicense" class="java.lang.String" >no</attribute> <attribute name="htsId" class="java.lang.Long" >1140500200003</attribute> <attribute name="isOutside" class="java.lang.Boolean" >false</attribute> <attribute name="isPassenger" class="java.lang.Boolean" >true</attribute> <attribute name="ptSubscription" class="java.lang.Boolean" >false</attribute> <attribute name="sex" class="java.lang.String" >m</attribute> </attributes> <plan selected="yes"> <activity type="home" link="220029" facility="home52627" x="647557.28056" y="6864961.034271" end_time="07:43:26" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> <leg mode="walk" dep_time="07:43:26" trav_time="00:58:35"> <route type="generic" start_link="220029" end_link="624543" trav_time="00:58:35" distance="4218.5741465571855"></route> </leg> <activity type="education" link="624543" facility="34450" x="650799.2" y="6865103.7" start_time="07:48:26" end_time="15:33:26" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> <leg mode="walk" dep_time="15:33:26" trav_time="00:58:35"> <route type="generic" start_link="624543" end_link="220029" trav_time="00:58:35" distance="4218.5741465571855"></route> </leg> <activity type="home" link="220029" facility="home52627" x="647557.28056" y="6864961.034271" start_time="15:43:26" > <attributes> <attribute name="innerParis" class="java.lang.Boolean" >true</attribute> </attributes> </activity> </plan> </person> </population>
我想要的是一个表格,其中包含
x
活动的所有y
和type="home"
值。因此,两行应分别为X和Y,并且列的值应在活动类型为home时显示(如果可以简化编码,也可以对其进行转置)。
这是我不成功的方法。我努力将x和y都添加到输出中。这样的代码不起作用。它是代码片段的改编版本,我曾用它来提取其他信息:
import gzip import io import os import xml.etree.cElementTree as ET from collections import defaultdict import pandas as pd import numpy as np tree = ET.iterparse(gzip.open('paris_cut_entd_1pm/paris_population.xml.gz', 'r')) em_champs = defaultdict(list) for xml_event, elem in tree: attributes = elem.attrib if elem.tag == 'activity' \ and attributes['type'] == 'home' : em_champs.append(attributes['x']) elem.clear() em_champs = pd.DataFrame.from_dict(em_champs, orient='index') em_champs
非常感谢您的帮助!
我有以下XML代码(整个代码包含成千上万的人id,这就是为什么我依赖iterparse()的原因,因为文件非常大):