在 R 中读取包含由标题分隔的多个数据帧的文本文件

问题描述 投票:0回答:1

我有一个大文本文件,其中包含多个数据帧,由标题行分隔,我试图将其读入 R。第一个标题行包含时间变量。我想根据时间变量分离数据帧。数据如下:

# Sample input: a character vector with one element per line of the text
# file. Each block starts with three "** ..." header lines (TIME, PROPERTY,
# UNITS), followed by a "<...>" column-header line and then rows of three
# whitespace-separated numbers. The two blocks below (Layer 1 and Layer 2)
# are separated by two empty strings, and the vector ends on a data row.
data = c("** TIME:          41670", "** PROPERTY: Pressure", "** UNITS:    psi", 
"<      X       > <      Y       > <Layer    1>", "      2106604.41       3434119.83      7952.25", 
"      2111884.40       3434119.83      7970.05", "      2037964.57       3439399.82      7658.27", 
"      2043244.56       3439399.82         7754", "      2048524.55       3439399.82      7828.24", 
"      2053804.53       3439399.82      7879.78", "      2059084.52       3439399.82      7914.57", 
"      2064364.50       3439399.82      7944.66", "      2069644.49       3439399.82      7974.44", 
"      2074924.48       3439399.82      7999.03", "      2080204.46       3439399.82      8014.14", 
"      2085484.46       3439399.82      8016.27", "      2090764.46       3439399.82      8005.63", 
"", "", "** TIME:          41670", "** PROPERTY: Pressure", "** UNITS:    psi", 
"<      X       > <      Y       > <Layer    2>", "      2106604.41       3434119.83      8038.52", 
"      2111884.40       3434119.83      8066.89", "      2037964.57       3439399.82      7723.84", 
"      2043244.56       3439399.82      7821.79", "      2048524.55       3439399.82      7899.46", 
"      2053804.53       3439399.82      7955.23", "      2059084.52       3439399.82      7993.75", 
"      2064364.50       3439399.82      8026.08", "      2069644.49       3439399.82      8056.41", 
"      2074924.48       3439399.82      8080.33", "      2080204.46       3439399.82      8094.15", 
"      2085484.46       3439399.82      8095.07", "      2090764.46       3439399.82      8084.03", 
"      2096044.44       3439399.82      8068.33", "      2101324.41       3439399.82      8060.14", 
"      2106604.41       3439399.82      8073.08", "      2111884.40       3439399.82      8107.82", 
"      2117164.38       3439399.82      8145.84", "      2122444.37       3439399.82      8160.57"
)

我正在使用 `readLines` 读取文本文件。

理想情况下,我想得到一个列表,其中每个元素都包含时间戳(日期)以及与之对应的数值数据帧,例如:

[[1]]$date
[1] "2014-01-31"
[[1]]$data
   X          Y          Layer1
1 2106604.41  3434119.83 7952.25
2 2111884.40  3434119.83 7970.05
3 2037964.57  3439399.82 7658.27
4 2043244.56  3439399.82 7754

[[2]]$date
[1] "2014-01-31"
[[2]]$data
   X          Y          Layer2
1 2106604.41 3434119.83 8038.52
2 2111884.40 3434119.83 8066.89
3 2037964.57 3439399.82 7723.84
4 2043244.56 3439399.82 7821.79

这是我尝试过的:

# Attempted parser: walk the file line by line, start a new data frame at
# every "** TIME:" header, collect the numeric rows that follow, and store
# each finished block in `dfs` as list(date = ..., data = ...).

# Read the file as a character vector, one element per line
data <- readLines("tmp.txt")

# Initialize an empty list to store data frames
dfs <- list()

# Initialize variables
current_time <- NULL
current_df <- NULL
# Not referenced again anywhere in this snippet
property <- NULL

# Loop through each line of the file
for (line in data) {
  if (startsWith(line, "** TIME:")) {
    # Strip the "** TIME:" prefix, parse the remainder as a number and
    # convert it to a Date counted in days from the origin 1899-12-30
    current_time <- as.Date(as.numeric(trimws(sub("\\*{2}\\s+TIME:\\s+", "", line))), origin = "1899-12-30", format = "%Y-%m-%d")
    # Create a new data frame for the current time
    current_df <- data.frame()
  } else if (startsWith(line, "** PROPERTY:")) {
    # PROPERTY header lines are skipped
    next
  } else if (startsWith(line, "** UNITS:")) {
    # UNITS header lines are skipped
    next
  } else if (startsWith(line, "<")) {
    # Extract column names from the "<...>" header line: remove the angle
    # brackets, split on single spaces and drop the empty pieces
   clean_header <- gsub("<|>", "", line)
   clean_header <- trimws(clean_header)
   col_names <- strsplit(clean_header, " ")
   col_names <- unlist(col_names)
   col_names <- col_names[col_names != ""]
    # Glue the 3rd and 4th tokens together (e.g. "Layer" + "1" -> "Layer1")
    # and drop the 4th; this relies on the header having exactly that shape
   col_names[3] <- paste0(col_names[3], col_names[4])
   col_names <- col_names[-4]
  } else if (!startsWith(line, "**")) {
    # Split the line by whitespace and create a new row in the data frame
    # NOTE(review): an empty line does not start with "**" either, so blank
    # separator lines also take this branch instead of the final `else`
    parts <- strsplit(line, "\\s+")[[1]]
    parts <- parts[parts != ""]
    current_df <- rbind(current_df, as.numeric(parts))
  } else {
    # End of current data frame, store it in the list
    # Only reached by lines that start with "**" but are not TIME, PROPERTY
    # or UNITS headers
    colnames(current_df) <- col_names
    dfs[[length(dfs) + 1]] <- list(date = current_time, data = current_df)
    current_df <- NULL
  }
}

这段代码能够生成 current_df(保存循环中最新的数据帧),但没有加上列名。此外,current_df 没有被保存到列表 dfs 中,因此随着循环继续,它会被新的 current_df 覆盖。

r dataframe list text-parsing
1个回答
0
投票

非常接近!只需要三个小改动:

  1. 倒数第二个条件 `!startsWith(line, "**")` 也会匹配空行。这意味着空行被当作数据处理,最后一个分支(完成数据帧并将其添加到列表中)永远不会被执行。我把该条件改为 `nchar(line) > 0 & !startsWith(line, "**")`。
  2. 在示例数据中,数据帧之间有两个空行。这意味着在开始构建新的数据帧之前,最后一个分支会运行两次。但该分支会将 `current_df` 设置为 `NULL`,于是下一次循环时,对 `current_df` 的操作就会失败。为了避免这个问题,我将最后的 `else` 改为 `else if(!is.null(current_df))`。
  3. 在示例数据中,`data` 的最后一行是一行数据;这意味着最后一个分支永远不会针对最后一个数据帧运行,因此最后一个数据帧永远不会被添加到列表中。我在 `data` 的末尾又添加了一行 `""` 来解决这个问题。(或者,也可以把最后一个分支中的代码复制一份,在整个循环结束后再运行一次。)

以下是经过这三项更改后的代码:

# Parse `data` (a character vector, one element per line of the file) into
# `dfs`: a list with one entry per block, each of the form
#   list(date = <Date from the "** TIME:" line>, data = <data frame of rows>)
#
# All loop state is initialised here so the snippet can be run on its own
# and re-run without appending to the results of a previous run.
dfs <- list()
current_time <- NULL
current_df <- NULL
col_names <- NULL

# A trailing "" is added for the loop only, so that the last block is also
# followed by a blank line and gets stored; `data` itself is left unmodified.
for (line in c(data, "")) {
  if (startsWith(line, "** TIME:")) {
    # The TIME value is a day count relative to 1899-12-30
    # (41670 -> 2014-01-31). The origin is already in ISO format, so no
    # `format` argument is needed.
    day_count <- as.numeric(trimws(sub("\\*{2}\\s+TIME:\\s+", "", line)))
    current_time <- as.Date(day_count, origin = "1899-12-30")
    # Create a new data frame for the current block
    current_df <- data.frame()
  } else if (startsWith(line, "** PROPERTY:") ||
             startsWith(line, "** UNITS:")) {
    # These header lines carry nothing that ends up in the result
    next
  } else if (startsWith(line, "<")) {
    # One column name per "<...>" group, with the brackets and any inner
    # whitespace removed ("<Layer    1>" -> "Layer1"). Unlike merging the
    # 3rd and 4th tokens, this works for any number of columns and for
    # groups made of a single word.
    header_cells <- regmatches(line, gregexpr("<[^>]*>", line))[[1]]
    col_names <- gsub("[<>[:space:]]", "", header_cells)
  } else if (nchar(line) > 0 && !startsWith(line, "**")) {
    # Data row: split on whitespace and append it to the current block.
    # `&&` is the scalar, short-circuiting operator meant for `if`.
    parts <- strsplit(line, "\\s+")[[1]]
    parts <- parts[parts != ""]
    current_df <- rbind(current_df, as.numeric(parts))
  } else if (!is.null(current_df)) {
    # Blank line after a block: name the columns and store the block.
    # `current_df` is reset to NULL so that a second consecutive blank line
    # does not try to store the same block again.
    colnames(current_df) <- col_names
    dfs[[length(dfs) + 1]] <- list(date = current_time, data = current_df)
    current_df <- NULL
  }
}
© www.soinside.com 2019 - 2024. All rights reserved.