R:使用 gather 清理数据集

问题描述 投票:-1回答:1

我有一个来自美国农业部的csv数据集,其中包含美国各县成年人在1970、1980、1990和2000年获得的教育水平。我已经使用read_csv函数导入了此csv,然后像这样清理数据集:

# Shorten the verbose USDA column headers of `eduLevelsbyCounty` to compact
# labels of the form "<label>, <year>" (percent columns are prefixed with "% ").
# Columns that are not present are simply skipped; all other columns keep
# their current names. Everything is computed inside local() so that no helper
# objects are left behind in the calling environment.
colnames(eduLevelsbyCounty) <- local({
  # Header wording that is identical in all four census years.
  shared <- c(
    "Less than a high school diploma" = "Less Than Diploma",
    "High school diploma only" = "Diploma",
    "Percent of adults with less than a high school diploma" =
      "% Less Than Diploma",
    "Percent of adults with a high school diploma only" = "% Diploma"
  )

  # College-level wording used by the 1970 and 1980 columns.
  early <- c(
    "Some college (1-3 years)" = "AA or more",
    "Four years of college or higher" = "BA or more",
    "Percent of adults completing some college (1-3 years)" = "% AA or more",
    "Percent of adults completing four years of college or higher" =
      "% BA or more"
  )

  # College-level wording used by the 1990 and 2000 columns.
  late <- c(
    "Some college or associate's degree" = "AA or more",
    "Bachelor's degree or higher" = "BA or more",
    "Percent of adults completing some college or associate's degree" =
      "% AA or more",
    "Percent of adults with a bachelor's degree or higher" = "% BA or more"
  )

  # Cross every label with every year, giving a named vector that maps
  # "<old label>, <year>" to "<new label>, <year>".
  expand_years <- function(labels, years) {
    stats::setNames(
      as.vector(outer(unname(labels), years, paste, sep = ", ")),
      as.vector(outer(names(labels), years, paste, sep = ", "))
    )
  }

  renames <- c(
    "State" = "state",
    "Area name" = "area_name",
    expand_years(c(shared, early), c("1970", "1980")),
    expand_years(c(shared, late), c("1990", "2000"))
  )

  # Replace only the headers that have an entry in the lookup table.
  # Note: the 1970 "less than diploma" percent column now becomes
  # "% Less Than Diploma, 1970" (with a space after "%"), consistent with
  # every other percent label.
  current <- colnames(eduLevelsbyCounty)
  hit <- match(current, names(renames))
  found <- !is.na(hit)
  current[found] <- unname(renames[hit[found]])
  current
})

所以现在我有一个非常大的tibble,但是问题是我现在想进一步清理它:把年份拆分成单独的一列,并把各教育程度的名称放到其他相应的列中。我知道gather()可以完成我想做的事情,但是问题是我的数据集包含多个年份:1970、1980、1990和2000。

我希望我已经明确了这一点,如果没有,我可以根据需要添加信息。任何帮助将不胜感激。

r tidy
1个回答
0
投票

我认为变量的命名方式使其变得不必要地复杂。否则,使用pivot_longer(较新的函数,用于代替gather)可以解决此问题。我已将您的原始列名重命名(见下方“数据”部分)。

使用pivot_longer将数据从宽到长旋转

# Attach the tidyverse; pivot_longer() comes from tidyr and %>% from magrittr.
library(tidyverse)

# Reshape `df` from wide to long. Every column except the two identifier
# columns is named "<measure>_<year>"; splitting on "_" turns the first part
# into an output column (".value") and the second part into the `year` column.
# Rows made up entirely of missing values are dropped.
long <- df %>%
  pivot_longer(
    cols = -c("state", "area_name"),
    names_to = c(".value", "year"),
    names_sep = "_",
    values_drop_na = TRUE
  )
> long              
# A tibble: 4 x 11
  state area_name year  Less.Than.Diploma Diploma AA.or.more BA.or.more percent.Less.Than.D~ percent.Diploma percent.AA.or.m~ percent.BA.or.m~
  <dbl>     <dbl> <chr>             <dbl>   <dbl>      <dbl>      <dbl>                <dbl>           <dbl>            <dbl>            <dbl>
1     1         2 1970                 71      72         73         74                   75              76               77               78
2     1         2 1980                 81      82         83         84                   85              86               87               88
3     1         2 1990                 91      92         93         94                   95              96               97               98
4     1         2 2000                 21      22         23         24                   25              26               27               28
> 

数据

# One-row example data set: two identifier columns followed by eight measures
# (four counts and their "percent" counterparts) for each of four census years.
# Columns are named "<measure>_<year>"; data.frame() then converts the spaces
# in those names to dots (e.g. "Less.Than.Diploma_1970").
df <- local({
  counts <- c("Less Than Diploma", "Diploma", "AA or more", "BA or more")
  measures <- c(counts, paste("percent", counts))
  years <- c(1970, 1980, 1990, 2000)

  # Dummy values: the k-th measure of a year is that year's offset plus k,
  # i.e. 71..78 for 1970, 81..88 for 1980, 91..98 for 1990, 21..28 for 2000.
  offsets <- c(70, 80, 90, 20)

  # outer() varies the measure fastest, so columns are grouped by year.
  cell_names <- as.vector(outer(measures, years, paste, sep = "_"))
  cell_values <- as.vector(outer(seq_along(measures), offsets, `+`))

  data.frame(
    state = 1,
    area_name = 2,
    as.list(setNames(cell_values, cell_names))
  )
})
© www.soinside.com 2019 - 2024. All rights reserved.