Python上的数据解析

问题描述 投票:0回答:1

我有以下txt文件,我想用一种不依赖行号的文本解析方法输出2个字典txt文件,因为该方法还应该适用于更大的txt文件。

  • “First Chapter ed”字典应该类似于:
    • "Height1" : { "Fir_ColumnB" : 123.50, "Fir_ColumnC" : 4, "Fir_ColumnD" : 31}, "Height2" : { "Fir_ColumnB" : 2334.00, "Fir_ColumnC" : 62, "Fir_ColumnD" : 0}, "Height3" : {.....}
  • “Second Chapter ed”字典应以 Sec_ColumnA 下的值作为键,以 Sec_ColumnB 下的值作为对应的值。例如:
    • {"Row1" : "12251m", "Row2" : "3231m","Row3" : "31412m"}

但是由于那些背靠背的“****”行,我遇到了一些困难。

creating_testing_1
TESTING_1  ;
TESTING_1  ; 
TESTING_1  ; First Chapter ed
TESTING_1  ; ********************************************************
TESTING_1  ; Fir_ColumnA    Fir_ColumnB    Fir_ColumnC    Fir_ColumnD
TESTING_1  ; ********************************************************
TESTING_1  ; Height1             123.50              4             31
TESTING_1  ; Height2            2334.00             62              0
TESTING_1  ; Height3               0.00             23             23
TESTING_1  ; ********************************************************
TESTING_1  ;
TESTING_1  ; Second Chapter ed
TESTING_1  ; ********************************************************
TESTING_1  ; Sec_ColumnA                  Sec_ColumnB                            
TESTING_1  ; ********************************************************
TESTING_1  ; Row1                                              12251m
TESTING_1  ; Row2                                           3231m
TESTING_1  ; Row3                                              31412m
TESTING_1  ; ********************************************************
TESTING_1  ;
TESTING_1  ; Ending...
TESTING_1  ;
def parse_txt_file(filename):
    """Parse a chaptered report file into one dictionary per chapter.

    Each record is expected to look like ``<prefix> ; <payload>``; only the
    text after the first ``;`` is interpreted (a line without ``;`` is used
    as-is). A chapter starts at a payload containing its title and is laid
    out as::

        ******   <- separator 1
        header
        ******   <- separator 2
        data rows
        ******   <- separator 3

    Rows are located by counting separator lines, never by line number, so
    the header row is skipped regardless of how many data rows follow.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict keyed by chapter title. ``"First Chapter ed"`` maps each row
        key to ``{"Fir_ColumnB": float, "Fir_ColumnC": int,
        "Fir_ColumnD": int}``; ``"Second Chapter ed"`` maps each row key to
        the remaining text of the row as a string. Chapters that do not
        occur in the file are absent from the result.

    Raises:
        ValueError: If a four-column first-chapter data row holds a value
            that cannot be converted to ``float``/``int``.
    """
    chapter_titles = ('First Chapter ed', 'Second Chapter ed')
    data = {}
    current_chapter = None
    separators_seen = 0

    with open(filename, 'r') as file:
        for line in file:
            # Drop the record prefix (e.g. "TESTING_1  ;"); keep the payload.
            prefix, sep, rest = line.partition(';')
            payload = (rest if sep else prefix).strip()
            if not payload:
                continue

            title = next((t for t in chapter_titles if t in payload), None)
            if title is not None:
                current_chapter = title
                data[current_chapter] = {}
                separators_seen = 0
                continue
            if 'Ending...' in payload:
                break
            if current_chapter is None:
                continue

            if payload.startswith('*'):
                separators_seen += 1
                continue
            # Data rows live strictly between the 2nd and 3rd separator;
            # anything before that is the column header.
            if separators_seen != 2:
                continue

            if current_chapter == 'First Chapter ed':
                columns = payload.split()
                if len(columns) == 4:
                    key, col_b, col_c, col_d = columns
                    data[current_chapter][key] = {
                        "Fir_ColumnB": float(col_b),
                        "Fir_ColumnC": int(col_c),
                        "Fir_ColumnD": int(col_d),
                    }
            else:
                # Second chapter: first token is the key, the rest the value.
                columns = payload.split(None, 1)
                if len(columns) == 2:
                    data[current_chapter][columns[0]] = columns[1]

    return data


def save_dictionary_to_txt(data, filename):
    """Write chapter dictionaries to *filename* as plain text.

    For every chapter the title is written on its own line, followed by one
    ``"key" : value`` line per entry and a terminating blank line. Nested
    dictionaries are written using their ``str()`` form; any other value is
    wrapped in double quotes.

    Args:
        data: Mapping of chapter title to a mapping of row key to value.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as out:
        for title, entries in data.items():
            out.write(f"{title}\n")
            for name, item in entries.items():
                # Only non-dict values get surrounding quotes.
                rendered = f'{item}' if isinstance(item, dict) else f'"{item}"'
                out.write(f'"{name}" : {rendered}\n')
            out.write('\n')


def main():
    """Parse ``short_test_2.txt`` and write one text file per chapter.

    Each output file is named after its chapter title, lower-cased and with
    spaces replaced by underscores, plus a ``.txt`` suffix.
    """
    source_path = 'short_test_2.txt'
    chapters = parse_txt_file(source_path)

    for title, entries in chapters.items():
        target_path = f'{title.replace(" ", "_").lower()}.txt'
        single_chapter = {title: entries}
        save_dictionary_to_txt(single_chapter, target_path)


if __name__ == "__main__":
    main()
python dictionary parsing data-cleaning
1个回答
0
投票

这里成功的关键是要认识到数据都是围绕三行星号分组的。一旦您知道所有这些分组在哪里,其他一切就都到位了。

尚不清楚您期望的输出是什么,但您应该能够使用它来进一步解决问题:

import json # used only for presentation of the dictionaries

def convert(s):
    """Return *s* as an ``int`` if possible, else a ``float``, else unchanged.

    The conversions are attempted in that order; a ``ValueError`` from one
    moves on to the next, and the original value is returned when both fail.
    """
    for caster in (int, float):
        try:
            return caster(s)
        except ValueError:
            continue
    return s

def segments(content):
    """Yield lists of three consecutive separator indices from *content*.

    An item counts as a separator when it starts with ``"*"``. Indices are
    collected in order; each time three have accumulated they are yielded as
    one list and collection starts over. A trailing group of fewer than
    three separators is not yielded.
    """
    marks = []
    for position, text in enumerate(content):
        if not text.startswith("*"):
            continue
        marks.append(position)
        if len(marks) == 3:
            yield marks
            # Rebind rather than clear so the yielded list stays intact.
            marks = []

def normalise(content):
    """Return the non-empty text of the second ``;``-separated field of each row.

    Rows without a ``;`` are dropped, as are rows whose second field is
    empty after stripping surrounding whitespace. Fields beyond the second
    are ignored.
    """
    cleaned = []
    for line in content:
        fields = line.split(";")
        if len(fields) < 2:
            continue
        text = fields[1].strip()
        if text:
            cleaned.append(text)
    return cleaned

# Build one dictionary per table found in foo.txt and print them as JSON.
with open("foo.txt") as data:
    results = []
    content = normalise(data.readlines())
    for first, second, third in segments(content):
        # The title sits just above the first separator, the column names
        # just below it; data rows lie between the second and third.
        table = {"title": content[first - 1]}
        column_names = content[first + 1].split()
        data_rows = content[second + 1:third]
        for line in data_rows:
            cells = line.split()
            for name, cell in zip(column_names, cells):
                table.setdefault(name, []).append(convert(cell))
        results.append(table)
    print(json.dumps(results, indent=2))

输出:

[
  {
    "title": "First Chapter ed",
    "Fir_ColumnA": [
      "Height1",
      "Height2",
      "Height3"
    ],
    "Fir_ColumnB": [
      123.5,
      2334.0,
      0.0
    ],
    "Fir_ColumnC": [
      4,
      62,
      23
    ],
    "Fir_ColumnD": [
      31,
      0,
      23
    ]
  },
  {
    "title": "Second Chapter ed",
    "Sec_ColumnA": [
      "Row1",
      "Row2",
      "Row3"
    ],
    "Sec_ColumnB": [
      "12251m",
      "3231m",
      "31412m"
    ]
  }
]
© www.soinside.com 2019 - 2024. All rights reserved.