Python-解析半结构化文本并提取为结构化数据

问题描述 投票:0回答:1

我有如下所示的半结构化数据,需要将其中的特定部分转换为结构化数据,以备将来使用。

%MOBILE PARSED MESSAGE FILE
%PARX VERSION   : PARX 06.30.80 patch 69
%RAYN VERSION   : RAYN_9.83
%LOG FILE NAME  : C:\Final\Bbi_10-31.11-36.dng

%Somethin Proprietary and Confidential.
2019 Oct 31  04:32:55.139  [02]  0xB0B3  LTE PDCP UL Cipher Data PDU
Subscription ID = 1
Version = 1
Num Subpackets = 1
Subpacket[0]
   Subpacket ID = PDCP PDU with Ciphering (0xC3)
   Subpacket Version = 26
   Subpacket Size = 60 bytes
   SRB Ciphering Keys (hex) =  6B 6E 77 04 68 A5 30 D2 E3 68 86 0E 1D 35 8C D1
   DRB Ciphering Keys (hex) =  98 1A 2E 33 E6 9A 85 2B C1 1F A2 CC 3D 31 45 8F
   SRB Cipher Algo = LTE AES
   DRB Cipher Algo = LTE AES
   Num PDUs = 1
   --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   |                 |   |    |      |      |     |     |      |      |      |          |     |          |        |els |       |        |   |      |                        |
   |                 |cfg|    |sn    |bearer|valid|pdu  |logged|      |      |count     |     |compressed|        |mini|packet |        |   |      |                        |
   |PDCPUL CIPH DATA |idx|mode|length|id    |pdu  |size |bytes |sys_fn|sub_fn|(hex)     |sn   |pdu       |pdu type|sign|action |checksum|e  |option|log_buffer (hex)        |
   --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   |PDCPUL CIPH DATA | 4 | AM |12 bit|  3   | Yes | 62  |  4   | 245  |  1   |   0x3A   | 58  |    No    | DEFAULT|n/a |  n/a  |  n/a   |n/a| n/a  | 80 3A 45 00            |

Cipher Subpacket[0]
PDU[0] Encrypted Data:
   Unable to encrypt

2019 Oct 31  04:32:55.169  [B0]  0xB0A3  LTE PDCP DL Cipher Data PDU
Subscription ID = 1
Version = 1
Num Subpackets = 1
Subpacket[0]
   Subpacket ID = PDCP PDU with Ciphering (0xC3)
   Subpacket Version = 24
   Subpacket Size = 60 bytes
   PDCP DL Data PDU with Ciphering {
      SRB Ciphering Keys (hex) =  6B 6E 77 04 68 A5 30 D2 E3 68 86 0E 1D 35 8C D1
      DRB Ciphering Keys (hex) =  98 1A 2E 33 E6 9A 85 2B C1 1F A2 CC 3D 31 45 8F
      SRB Cipher Algo = LTE AES
      DRB Cipher Algo = LTE AES
      Num PDUs = 1
      ------------------------------------------------------------------------------------------------------------------------
      |                |   |    |      |      |     |     |      |      |      |          |    |els |                        |
      |                |cfg|    |sn    |bearer|valid|pdu  |logged|      |      |count     |    |mini|                        |
      |PDCPDL CIPH DATA|idx|mode|length|id    |pdu  |size |bytes |sys_fn|sub_fn|(hex)     |sn  |sign|log_buffer (hex)        |
      ------------------------------------------------------------------------------------------------------------------------
      |PDCPDL CIPH DATA| 4 | AM |12 bit|  3   | Yes | 62  |  4   | 248  |  0   |   0x3A   | 58 |n/a | 80 3A 2F BC            |

   }
Cipher Subpacket[0]
PDU[0] Decrypted Data:
   Unable to decrypt


2019 Oct 31  04:32:56.168  [4F]  0xB0A3  LTE PDCP DL Cipher Data PDU
Subscription ID = 1
Version = 1
Num Subpackets = 1
Subpacket[0]
   Subpacket ID = PDCP PDU with Ciphering (0xC3)
   Subpacket Version = 24
   Subpacket Size = 60 bytes
   PDCP DL Data PDU with Ciphering {
      SRB Ciphering Keys (hex) =  6B 6E 77 04 68 A5 30 D2 E3 68 86 0E 1D 35 8C D1
      DRB Ciphering Keys (hex) =  98 1A 2E 33 E6 9A 85 2B C1 1F A2 CC 3D 31 45 8F
      SRB Cipher Algo = LTE AES
      DRB Cipher Algo = LTE AES
      Num PDUs = 1
      ------------------------------------------------------------------------------------------------------------------------
      |                |   |    |      |      |     |     |      |      |      |          |    |els |                        |
      |                |cfg|    |sn    |bearer|valid|pdu  |logged|      |      |count     |    |mini|                        |
      |PDCPDL CIPH DATA|idx|mode|length|id    |pdu  |size |bytes |sys_fn|sub_fn|(hex)     |sn  |sign|log_buffer (hex)        |
      ------------------------------------------------------------------------------------------------------------------------
      |PDCPDL CIPH DATA| 4 | AM |12 bit|  3   | Yes | 62  |  4   | 348  |  0   |   0x3B   | 59 |n/a | 80 3B 86 3B            |

   }
Cipher Subpacket[0]
PDU[0] Decrypted Data:
   Unable to decrypt


%MOBILE PARSED MESSAGE FILE
%PARX VERSION   : PARX 06.30.80 patch 69
%RAYN VERSION   : RAYN_9.83
%LOG FILE NAME  : C:\Final\Abi_10-31.11-39.dng

%Somethin Proprietary and Confidential.
2019 Oct 31  04:36:04.543  [85]  0xB0B3  LTE PDCP UL Cipher Data PDU
Subscription ID = 1
Version = 1
Num Subpackets = 1
Subpacket[0]
   Subpacket ID = PDCP PDU with Ciphering (0xC3)
   Subpacket Version = 26
   Subpacket Size = 60 bytes
   SRB Ciphering Keys (hex) =  BC 61 5B 1C 05 1F 92 C6 83 F2 68 E6 00 A3 D7 DC
   DRB Ciphering Keys (hex) =  6B 25 EE 8D 1C 48 B2 3A 07 9A 9D 22 AA 77 33 76
   SRB Cipher Algo = LTE AES
   DRB Cipher Algo = LTE AES
   Num PDUs = 1
   --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   |                 |   |    |      |      |     |     |      |      |      |          |     |          |        |els |       |        |   |      |                        |
   |                 |cfg|    |sn    |bearer|valid|pdu  |logged|      |      |count     |     |compressed|        |mini|packet |        |   |      |                        |
   |PDCPUL CIPH DATA |idx|mode|length|id    |pdu  |size |bytes |sys_fn|sub_fn|(hex)     |sn   |pdu       |pdu type|sign|action |checksum|e  |option|log_buffer (hex)        |
   --------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   |PDCPUL CIPH DATA | 4 | AM |12 bit|  3   | Yes | 62  |  4   | 135  |  8   |   0xF9   | 249 |    No    | DEFAULT|n/a |  n/a  |  n/a   |n/a| n/a  | 80 F9 45 00            |

Cipher Subpacket[0]
PDU[0] Encrypted Data:
   Unable to encrypt

2019 Oct 31  04:36:04.568  [58]  0xB0A3  LTE PDCP DL Cipher Data PDU
Subscription ID = 1
Version = 1
Num Subpackets = 1
Subpacket[0]
   Subpacket ID = PDCP PDU with Ciphering (0xC3)
   Subpacket Version = 24
   Subpacket Size = 60 bytes
   PDCP DL Data PDU with Ciphering {
      SRB Ciphering Keys (hex) =  BC 61 5B 1C 05 1F 92 C6 83 F2 68 E6 00 A3 D7 DC
      DRB Ciphering Keys (hex) =  6B 25 EE 8D 1C 48 B2 3A 07 9A 9D 22 AA 77 33 76
      SRB Cipher Algo = LTE AES
      DRB Cipher Algo = LTE AES
      Num PDUs = 1
      ------------------------------------------------------------------------------------------------------------------------
      |                |   |    |      |      |     |     |      |      |      |          |    |els |                        |
      |                |cfg|    |sn    |bearer|valid|pdu  |logged|      |      |count     |    |mini|                        |
      |PDCPDL CIPH DATA|idx|mode|length|id    |pdu  |size |bytes |sys_fn|sub_fn|(hex)     |sn  |sign|log_buffer (hex)        |
      ------------------------------------------------------------------------------------------------------------------------
      |PDCPDL CIPH DATA| 4 | AM |12 bit|  3   | Yes | 62  |  4   | 138  |  7   |   0xF8   |248 |n/a | 80 F8 23 41            |

   }
Cipher Subpacket[0]
PDU[0] Decrypted Data:
   Unable to decrypt

我写了如下伪代码来提取数据。我需要帮助的是伪代码中标记为 #need_help 的那些步骤——它们主要涉及如何识别文本中的特定部分并将其捕获到变量中。

initialize a list, data = []
for each text block  ( text block starts with time format `yyyy MMM dd  hh:mm:ss.mil`) #need_help 
if ending with `0xB0B3  LTE PDCP UL Cipher Data PDU` #need_help
    if `size pdu` field  value `== 62` #need_help
        store 62 to variable pdu_size 
        store 'ulPdu' to variable type
        Extract the `yyyy MMM dd  hh:mm:ss.mil` and store the value as `datetime` type in a variable `datetime` #need_help
        Extract the field `seq` and store as variable `seq` #need_help
        store ulPdu = {"datetime": datetime, "pDuType": type, "pDuSize": pdu_size, "seq": seq}
        add ulPdu to data
    else
        pass  # try next text block
else if ending with `0xB0A3  LTE PDCP DL Cipher Data PDU`
    if `size pdu` field  value `== 62`
        store 62 to variable pdu_size
        store 'dlPdu' to variable type
        Extract the `yyyy MMM dd  hh:mm:ss.mil` and store the value as `datetime` type in a variable `datetime`
        Extract the field `seq` and store as variable `seq`
        store dlPdu = {"datetime": datetime, "pDuType": type, "pDuSize": pdu_size, "seq": seq}
        add dlPdu to data
    else
        pass  # try next text block
else
    pass  # try next text block
python text-parsing
1个回答
1
投票

您可以使用TTP来解析以上文本,这是代码:

from ttp import ttp

# TTP template describing the log layout. Literal text is matched against the
# input lines and each `{{ name }}` placeholder captures the value found at
# that position; the captured names become the keys in the printed result.
# The nested <group> tags determine how the result is nested: one "results"
# entry per file header, then one entry per timestamped record (the group is
# named from the captured date and time), then the table row of that record.
# NOTE(review): the template text is whitespace-sensitive input to TTP --
# presumably it must mirror the log's spacing; do not reformat it.
# NOTE(review): both the UL and the DL table rows sit inside the single group
# named "PDCPUL_CIPH_DATA", so DL rows are reported under that key as well.
ttp_template="""
<group name="results">
%PARX VERSION   : {{ PARX_VERSION | PHRASE }}
%RAYN VERSION   : {{ RAYN_VERSION }}
%LOG FILE NAME  : {{ LOG_FILE_NAME }}

<group name="Something Proprietary and Confidential">
%Somethin Proprietary and Confidential. {{ _start_ }}
<group name="{{ date }} {{ time }}">
{{ date | PHRASE | _start_ }}  {{ time }} [{{ ignore }}]  {{ ignore }}  LTE PDCP UL Cipher Data PDU
{{ date | PHRASE | _start_ }}  {{ time }} [{{ ignore }}]  {{ ignore }}  LTE PDCP DL Cipher Data PDU
Subscription ID = {{ Subscription_ID }}
Version = {{ version }}
Num Subpackets = {{ Num_Subpackets }}
   Subpacket ID = {{ Subpacket_ID | PHRASE }}
   Subpacket Version = {{ Subpacket_Version }}
   Subpacket Size = {{ Subpacket_Size | PHRASE }}
   SRB Ciphering Keys (hex) =  {{ SRB_Ciphering_Keys_hex | PHRASE }}
   DRB Ciphering Keys (hex) =  {{ DRB_Ciphering_Keys_hex | PHRASE }}
   SRB Cipher Algo = {{ SRB_Cipher_Algo | PHRASE }}
   DRB Cipher Algo = {{ DRB_Cipher_Algo | PHRASE }}
   Num PDUs = {{ Num_PDUs }}
<group name="PDCPUL_CIPH_DATA" method="table">
   |PDCPUL CIPH DATA | {{ cfg_idx | DIGIT }} | {{ mode }} |{{ sn_length }} bit| {{ bearer_id }} | {{ valid_pdu }} | {{ pdu_size | DIGIT }} |  {{ logged_bytes }}   | {{ sys_fn }}  |  {{ sub_fn }}   |   {{ count }}   | {{ sn }}  |    {{ compressed_pdu }}    | {{pdu_type}}|{{ els }} |  {{ packet_act }}  |  {{ checksum }}   |{{ e }}| {{ option }}  | {{ log_buffer | PHRASE }} |
      |PDCPDL CIPH DATA| {{ cfg_idx | DIGIT }} | {{ mode }} |{{ sn_length }} bit| {{ bearer_id }} | {{ valid_pdu }} | {{ pdu_size | DIGIT }}  |  {{ logged_bytes }}   | {{ sys_fn }}  |  {{ sub_fn }}   |   {{ count }}   |{{ sn }} |{{ els }} | {{ log_buffer | PHRASE }} |
      |PDCPDL CIPH DATA| {{ cfg_idx | DIGIT }} | {{ mode }} |{{ sn_length }} bit| {{ bearer_id }} | {{ valid_pdu }} | {{ pdu_size | DIGIT }}  |  {{ logged_bytes }}   | {{ sys_fn }}  |  {{ sub_fn }}   |   {{ count }}   | {{ sn }} |{{ els }} | {{ log_buffer | PHRASE }} |
</group>
</group>
</group>
</group>
"""

# Build the parser from the input and the template above. The `data` value is
# a placeholder path -- replace it with the location of the real log file.
parser = ttp(data="/absolute/os/path/to/data.txt", template=ttp_template)
# Run the template against the input data.
parser.parse()
# Request the results formatted as JSON and print the first element.
print(parser.result(format="json")[0])

该代码将产生如下输出(注意:由于模板中 UL 和 DL 的表格行都放在名为 "PDCPUL_CIPH_DATA" 的同一个组里,DL 记录在结果中也会出现在该键下):

[
    {
        "results": [
            {
                "LOG_FILE_NAME": "C:\\Final\\Bbi_10-31.11-36.dng",
                "PARX_VERSION": "PARX 06.30.80 patch 69",
                "RAYN_VERSION": "RAYN_9.83",
                "Something Proprietary and Confidential": {
                    "2019 Oct 31 04:32:55.139": {
                        "DRB_Cipher_Algo": "LTE AES",
                        "DRB_Ciphering_Keys_hex": "98 1A 2E 33 E6 9A 85 2B C1 1F A2 CC 3D 31 45 8F",
                        "Num_PDUs": "1",
                        "Num_Subpackets": "1",
                        "PDCPUL_CIPH_DATA": {
                            "bearer_id": "3",
                            "cfg_idx": "4",
                            "checksum": "n/a",
                            "compressed_pdu": "No",
                            "count": "0x3A",
                            "e": "n/a",
                            "els": "n/a",
                            "log_buffer": "80 3A 45 00",
                            "logged_bytes": "4",
                            "mode": "AM",
                            "option": "n/a",
                            "packet_act": "n/a",
                            "pdu_size": "62",
                            "pdu_type": "DEFAULT",
                            "sn": "58",
                            "sn_length": "12",
                            "sub_fn": "1",
                            "sys_fn": "245",
                            "valid_pdu": "Yes"
                        },
                        "SRB_Cipher_Algo": "LTE AES",
                        "SRB_Ciphering_Keys_hex": "6B 6E 77 04 68 A5 30 D2 E3 68 86 0E 1D 35 8C D1",
                        "Subpacket_ID": "PDCP PDU with Ciphering (0xC3)",
                        "Subpacket_Size": "60 bytes",
                        "Subpacket_Version": "26",
                        "Subscription_ID": "1",
                        "version": "1"
                    },
                    "2019 Oct 31 04:32:55.169": {
                        "Num_Subpackets": "1",
                        "PDCPUL_CIPH_DATA": {
                            "bearer_id": "3",
                            "cfg_idx": "4",
                            "count": "0x3A",
                            "els": "n/a",
                            "log_buffer": "80 3A 2F BC",
                            "logged_bytes": "4",
                            "mode": "AM",
                            "pdu_size": "62",
                            "sn": "58",
                            "sn_length": "12",
                            "sub_fn": "0",
                            "sys_fn": "248",
                            "valid_pdu": "Yes"
                        },
                        "Subpacket_ID": "PDCP PDU with Ciphering (0xC3)",
                        "Subpacket_Size": "60 bytes",
                        "Subpacket_Version": "24",
                        "Subscription_ID": "1",
                        "version": "1"
                    },
                    "2019 Oct 31 04:32:56.168": {
                        "Num_Subpackets": "1",
                        "PDCPUL_CIPH_DATA": {
                            "bearer_id": "3",
                            "cfg_idx": "4",
                            "count": "0x3B",
                            "els": "n/a",
                            "log_buffer": "80 3B 86 3B",
                            "logged_bytes": "4",
                            "mode": "AM",
                            "pdu_size": "62",
                            "sn": "59",
                            "sn_length": "12",
                            "sub_fn": "0",
                            "sys_fn": "348",
                            "valid_pdu": "Yes"
                        },
                        "Subpacket_ID": "PDCP PDU with Ciphering (0xC3)",
                        "Subpacket_Size": "60 bytes",
                        "Subpacket_Version": "24",
                        "Subscription_ID": "1",
                        "version": "1"
                    }
                }
            },
            {
                "LOG_FILE_NAME": "C:\\Final\\Abi_10-31.11-39.dng",
                "PARX_VERSION": "PARX 06.30.80 patch 69",
                "RAYN_VERSION": "RAYN_9.83",
                "Something Proprietary and Confidential": {
                    "2019 Oct 31 04:32:55.169": {
                        "DRB_Cipher_Algo": "LTE AES",
                        "DRB_Ciphering_Keys_hex": "6B 25 EE 8D 1C 48 B2 3A 07 9A 9D 22 AA 77 33 76",
                        "Num_PDUs": "1",
                        "Num_Subpackets": "1",
                        "PDCPUL_CIPH_DATA": {
                            "bearer_id": "3",
                            "cfg_idx": "4",
                            "checksum": "n/a",
                            "compressed_pdu": "No",
                            "count": "0xF9",
                            "e": "n/a",
                            "els": "n/a",
                            "log_buffer": "80 F9 45 00",
                            "logged_bytes": "4",
                            "mode": "AM",
                            "option": "n/a",
                            "packet_act": "n/a",
                            "pdu_size": "62",
                            "pdu_type": "DEFAULT",
                            "sn": "249",
                            "sn_length": "12",
                            "sub_fn": "8",
                            "sys_fn": "135",
                            "valid_pdu": "Yes"
                        },
                        "SRB_Cipher_Algo": "LTE AES",
                        "SRB_Ciphering_Keys_hex": "BC 61 5B 1C 05 1F 92 C6 83 F2 68 E6 00 A3 D7 DC",
                        "Subpacket_ID": "PDCP PDU with Ciphering (0xC3)",
                        "Subpacket_Size": "60 bytes",
                        "Subpacket_Version": "26",
                        "Subscription_ID": "1",
                        "version": "1"
                    },
                    "2019 Oct 31 04:36:04.543": {},
                    "2019 Oct 31 04:36:04.568": {
                        "Num_Subpackets": "1",
                        "PDCPUL_CIPH_DATA": {
                            "bearer_id": "3",
                            "cfg_idx": "4",
                            "count": "0xF8",
                            "els": "n/a",
                            "log_buffer": "80 F8 23 41",
                            "logged_bytes": "4",
                            "mode": "AM",
                            "pdu_size": "62",
                            "sn": "248",
                            "sn_length": "12",
                            "sub_fn": "7",
                            "sys_fn": "138",
                            "valid_pdu": "Yes"
                        },
                        "Subpacket_ID": "PDCP PDU with Ciphering (0xC3)",
                        "Subpacket_Size": "60 bytes",
                        "Subpacket_Version": "24",
                        "Subscription_ID": "1",
                        "version": "1"
                    }
                }
            }
        ]
    }
]
© www.soinside.com 2019 - 2024. All rights reserved.