我正在尝试将 2021 年仇恨犯罪主文件从犯罪数据浏览器提取到 R 中。 https://cde.ucr.cjis.gov/LATEST/webapp/#/pages/downloads(滚动至主文件,选择“仇恨犯罪”,然后选择“2021”)
该文件是固定宽度的 ASCII 文本格式,包含两种记录类型:批次标头 (BH) 和事件报告 (IR)。每个批次标头后面都跟着该机构可用的仇恨犯罪事件报告,并且某个批次标头之后的所有事件报告都属于该批次标头的原始机构标识符 (ORI)。批次标头后面也可以紧跟另一个批次标头,这意味着前一个 ORI 没有事件数据。
我看过一些与此数据相关的帖子,但他们提取此数据的最终目标与我的不同。理想情况下,我想通过创建多个数据框(所有数据框都具有相同的格式并相互关联)将此数据导入到 R 中。我希望每个数据框都以批次标头 (BH) 开头,其后跟随该标头下的所有事件报告。如果有一种方法可以创建一个单一的数据框,并在每个批次标头下包含其事件报告的子集,那么这也是可行的。我对 R 有一些了解,但我不是专家 - 确实需要一些关于如何入门的帮助。
我该如何最好地提取这些数据?
我研究了很多文章,但一直找不到可以让我开始的东西。我的主要问题是我想要所有数据 - 而不仅仅是批次标头或事件报告。
前100行数据:
BH50AK0010100000000000000 ANCHORAGE AK1C941Y 3030020A 00028623800 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZIII ANCHORAGE 020
IR50AK00101002W2MGU73SXX820210527 200100U2900011332I 0000010000U
IR50AK00101002W2MGU73SY2 20210525 200001W2900010343B 0000000001U
IR50AK00101002W2MGU82JNB420210825 300301W13C0012014I 0010020100U
IR50AK00101002W2MGU82S46820211104 400000U2900010422R 0000000000U
IR50AK0010100PT-0WE0W4HCT20210429 200102I13A0011341I 0010000200N
BH50AK0010200000000000000 20210101FAIRBANKS AK4 941Y 3030020AA 00003059800 258000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZIZ FAIRBANKS 090
IR50AK0010200CN0BAM5D728N20210802N300102I13A0011341I 0010000200U
IR50AK0010200CN0BIVMR728N20210813N300101W13C0011441I 0010000100N
IR50AK0010200CN0BIWS9728N20210924N300001I2900010521B 0000000100N
BH50AK0010300000000000000 JUNEAU AK4 941N 3030020A 00003187400 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021I I JUNEAU 110
IR50AK0010300CN01REICT-8N20210202 100100U13B0011313I 0010000000U
IR50AK0010300Z91X J8ZU63920211229 400101B13B0015413I 0010000100N
BH50AK0010400000000000000 20210101KETCHIKAN AK6 941N 3030020AA 00000819800 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZI KETCHIKAN 130
IR50AK0010400CN-VAO5U728N20211003N400000U2900011126O 0000000000
BH50AK0010500000000000000 20210101KODIAK AK6 941N 3030020AA 00000577600 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ KODIAK 150
BH50AK0010600000000000000 20210101NOME AK6 941N 3030020AA 00000387100 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZIZ NOME 180
IR50AK00106003I-MQ3QCZX I20210718N300100U23H0012041I 2900012041I 0010000000
BH50AK0010700000000000000 20210101PETERSBURG AK6 941N 3030020AA 00000330200 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZIZZ PETERSBURG 195
IR50AK0010700CN-BEOSU728N20210429N200101W13C0012412I 0010000100N
BH50AK0010800000000000000 20210101SEWARD AK6 941N 3030020AA 00000295000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZIZ SEWARD 122
IR50AK0010800CN-B-YZU728N20210922N300301W13B0032042I 0030000100U
BH50AK0010900000000000000 20210101SITKA AK6 941N 3030020AA 00000835300 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ SITKA 220
BH50AK0011000000000000000 20210101SKAGWAY AK7 941N 3030020AA 00000120200 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ SKAGWAY 230
BH50AK0011100000000000000 20210101WRANGELL AK6 941N 3030020AA 00000252200 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ WRANGELL 275
BH50AK0011200000000000000 20210101VALDEZ AK6 941N 3030020AA 00000384200 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ VALDEZ 261
BH50AK0011300000000000000 20210101BETHEL AK6 941N 3030020AA 00000671700 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ BETHEL 050
BH50AK0011400000000000000 20210101CORDOVA AK7 941N 3030020AA 00000216200 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ CORDOVA 261
BH50AK0011600000000000000 20210101KOTZEBUE AK6 941N 3030020AA 00000326400 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ KOTZEBUE 188
BH50AK0011700000000000000 20210101PALMER AK6 941N 3030020AA 00000797200 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZI PALMER 170
IR50AK0011700PE2MPUH3I71A20211105N400101W13A0011171I 0010000100
BH50AK0011800000000000000 20210101BARROW AK6 941N 3030020AA 00000931000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ NORTH SLOPE BOROUGH 185
BH50AK0011900000000000000 SELDOVIA AK7 941N 3030020A 00000028000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 SELDOVIA 122
BH50AK0012000000000000000 20210101SOLDOTNA AK6 941N 3030020AA 00000485500 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ SOLDOTNA 122
BH50AK0012100000000000000 20210101HAINES AK6 941N 3030020AA 00000262400 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ HAINES 100
BH50AK0012200000000000000 20210101HOMER AK6 941N 3030020AA 00000614300 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ HOMER 122
BH50AK0012300000000000000 20210101KENAI AK6 941N 3030020AA 00000797000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ KENAI 122
BH50AK0012400000000000000 ANCHORAGE AK7 941NAK00101003030020A 2021120100000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 SPENARD SERVICE DISTRICT 020
BH50AK0012500000000000000 20210101METLAKATLA AK7 947N 3030020AA 000000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ METLAKATLA TRIBAL
BH50AK0012600000000000000 20210101FAIRBANKS AK7 945N 3030020AA 000000000000258000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ FAIRBANKS INTRNL AIRPORT
BH50AK0012700000000000000 20210101ANCHORAGE AK7 945N 3030020AA 000000000000 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZ Z TED STEVENS ANCHORAGE INTERNAT
BH50AK0012800000000000000 WHITTIER AK7 941N 3030020A 00000020300 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 WHITTIER 261
BH50AK0013000000000000000 20210101DILLINGHAM AK7 941N 3030020AA 00000232400 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ DILLINGHAM 070
BH50AK0013100000000000000 HOONAH AK7 941N 3030020A 00000080100 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 HOONAH 105
BH50AK0013200000000000000 20210101NORTH POLE AK7 941N 3030020AA 00000210600 258000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ NORTH POLE 090
BH50AK0013300000000000000 20210101UNALASKA AK6 941N 3030020AA 00000448400 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ UNALASKA 016
BH50AK0013500000000000000 KLAWOCK AK7 941N 3030020A 00000078100 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 KLAWOCK 198
BH50AK0014200000000000000 20210101FAIRBANKS AK7 943N 3030020AA 000000000000258000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ UNIV OF AK: FAIRBANKS
BH50AK0014300000000000000 20210101ANCHORAGE AK7 943N 3030020AA 000000000000 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ UNIV OF AK: ANCHORAGE
BH50AK0014500000000000000 20210101CRAIG AK7 941N 3030020AA 00000127300 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ CRAIG 198
BH50AK0014600000000000000 20210101KING SALMON AK7 941N 3030020AA 00000076900 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZZZ BRISTOL BAY BOROUGH 060
BH50AK0014700000000000000 KAKE AK7 941NAKAST01003030020A 2021120100000053600 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 KAKE 198
BH50AK0014900000000000000 NENANA AK7 941NAKAST01003030020A 2021120100000034000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 NENANA 290
BH50AK0015000000000000000 SAINT PAUL ISLAND AK7 941N 3030020A 00000048600 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 ST. PAUL 016
BH50AK0015200000000000000 SAND POINT AK7 941N 3030020A 00000110000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 SAND POINT 013
BH50AK0015400000000000000 TOGIAK AK7 941NAKAST01003030020A 2021120100000082500 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 TOGIAK 070
BH50AK0015600000000000000 20210101WASILLA AK5 941N 3030020AA 00001148500 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZIZ WASILLA 170
IR50AK0015600W81HPUOPZY1A20210821N300101W13C0012062I 2900011862I 0010000100N
IR50AK0015600W81HPUOZJ21A20210806N300101W13A0011812I 41 0010000100N
BH50AK0016300000000000000 ANVIK AK7 941NAKAST01003030020A 20211201000000078000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 ANVIK
BH50AK0016400000000000000 EMMONAK AK7 941NAKAST01003030020A 2021120100000081900 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 EMMONAK 158
BH50AK0016600000000000000 HOUSTON AK6 941NAKAST01003030020A 2021120100000252300 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 HOUSTON 170
BH50AK020045Y000000000000 ANCHORAGE AK7 946NAKAST01003030020A 20211201000000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 ALCOHOL BEV CONTROL BRD
BH50AKASP0000000000000000 ANCHORAGE AK0 944NAKAST01003030020A 2021120100000000000 000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 STATE TROOPERS
BH50AKAST0000000000000000 AK8E944N 3030020A 00000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 STATE TROOPERS, JUNEAU 110
BH50AKAST0100000000000000 20210101Anchorage AK8A944N 3030020AA 000266838000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021ZZIZ ALASKA STATE TROOPERS
IR50AKAST0100D42A5WL-XNTD20210815N300100U2900011011I 0010000000
IR50AKAST0100GB2WKBBM619G20210714N300201I13A0022011IL 0020000100
IR50AKAST0100GB2WKBBM61NG20210714N300101I13C0015811I 0010000100
BH50AKAST0600000000000000 AK8E944N 3030020A 00000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 STATE TROOPERS, BETHEL 050
BH50AKAST1500000000000000 Anchorage AK9E944N 3030020A 00000000000 38000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 STATE TROOPERS HQS, ANCHORAGE 020
BH98AKATF0200000000000000 FS 3030020A 000000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 BUREAU OF ATF ANCHORAGE
BH50AKDI00100000000000000 AK0 94 N 00000000 00000000000 000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 KATMAI NM
BH50AKDI00200000000000000 AK0 94 N 00000000 00000000000 000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 MOUNT MCKINLEY NP
BH50AKDI00300000000000000 AK0 94 N 00000000 00000000000 000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 SITKA NHP
BH50AKDI00400000000000000 AK0 94 N 00000000 00000000000 000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 GLACIER BAY NM
BH98AKFBIAN00000000000000 FS DCFBIWA013030020A 20211201000000000000 0000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021 FBI, ANCHORAGE
BH01AL0010000000000000000 20210101BIRMINGHAM AL9A632N 3070010NA 000156865037 98000000000000000000 000000000000000000 000000000000000000 000000000000000000 0000000002021IIII JEFFERSON 073
IR01AL001000027231GOYLCXV20211007N400101W13B0012011I 0010000100N
IR01AL00100002H6W8M9H4K7 20210902N300101B13B0011545I 0010000100N
IR01AL00100002W2JPU7Z7NXO20210413N200101B13A0011533I 0010000100N
IR01AL00100002ZE6O-L326ST20210510N200101W13C0012012I 2900012012I 0010000100N
IR01AL00100003S0GHP4YMZSL20210822N300001W2900011511B 0000000100N
IR01AL0010000696A5PTV301Q20210909N300001W2900012285B 0000000001N
IR01AL00100008WDEJ-K1J6X020210202N100101W13A0011312I 0010000100N
IR01AL0010000AZEKH18Y7HZ420210820N300101W13B0011511I 0010000100N
IR01AL0010000D96IGAR23PU620211114N400101U23G0012011I 0010000000
IR01AL0010000EM2MRVC3TUE220211116N400101U26F0012085I 0010000000U
IR01AL0010000H E193G07Q5P20210810N300101W13B0012012I 0010000100N
IR01AL0010000HQ7G6OPPYP6S20210627N200001B35A0011311S 0000000100N
IR01AL0010000HSCF7R8HWW3320211004N400101U23F0012085I 0010000000U
IR01AL0010000IE0DVPBMDOTC20210908N300100U2900012533I 0010000000
IR01AL0010000IGAM8S0HFQ3C20210518N200101W13C0012312I 0010000100N
IR01AL0010000IQBI5Q-PGA2Q20210821N300002B23D0010711B 0000000200N
IR01AL0010000K-EYLL64K0Z520210625N200001B2900011511G 0000000100N
IR01AL0010000O24G0XNSPMRX20210826N300101U23D0012041I 0010000000
# Path to the downloaded 2021 hate crime master file (fixed-width text).
file <- "/Users/pc/Downloads/hate-crime-2021/2021_HC_NATIONAL_MASTER_FILE_ENC.txt"
library(tidyverse)

# Read the whole file; each line becomes one element of a character vector.
lines <- readLines(file)

# The batch header widths below contain five population blocks with the same
# four fields and the same widths, suffixed 1-5. Generate them rather than
# spelling out all twenty entries by hand.
pop_block <- c(current_pop = 9, ucr_county_code = 3, msa_code = 3, last_pop = 9)
pop_widths <- unlist(lapply(
  seq_len(5),
  \(i) setNames(pop_block, paste0(names(pop_block), i))
))

# Five trailing county columns, three characters each.
fips_widths <- setNames(rep(3, 5), paste0("fips_county", seq_len(5)))

# Column widths for a batch header (BH) record, in file order. The widths for
# every record type are listed in the help file that comes with the download.
# NOTE(review): the two "*_first_quarter_activy" names are misspelled; they are
# kept exactly as they were so that existing column references keep working.
bh_widths <- c(
  type = 2,
  state = 2,
  ori = 9,
  incident_number = 12,
  date_added = 8,
  date_ori_went_nibrs = 8,
  city = 30,
  state_abb = 2,
  pop_group = 2,
  country_div = 1,
  country_region = 1,
  agency_ind = 1,
  core_city = 1,
  covered_by_ori = 9,
  fbi_field_office = 4,
  judicial_district = 4,
  agency_nibrs_flag = 1,
  date_agency_inactive = 8,
  pop_widths,
  master_file_year = 4,
  state_first_quarter_activy = 1,
  state_second_quarter_activity = 1,
  state_third_quarter_activity = 1,
  state_fourth_quarter_activity = 1,
  fed_first_quarter_activy = 1,
  fed_second_quarter_activity = 1,
  fed_third_quarter_activity = 1,
  fed_fourth_quarter_activity = 1,
  agency_name = 30,
  fips_widths
)

# Keep only the lines that start with "BH", put them in a one-column tibble
# (column `value`), and cut that column into fields at the fixed positions.
bh_df <- tibble(value = lines[grepl("^BH", lines)]) |>
  separate_wider_position(value, widths = bh_widths)

# Incident report (IR) lines, still unsplit: one raw string per row in `value`.
# Splitting them works the same way as above, using the IR widths from the
# help file; that part is left to the reader.
ir_df <- tibble(value = lines[grepl("^IR", lines)])
输出:
# A tibble: 25,298 × 53
type state ori incident_number date_added date_ori_went_nibrs city
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 BH 50 AK0010100 000000000000 " " " " "ANCHOR…
2 BH 50 AK0010200 000000000000 " " "20210101" "FAIRBA…
3 BH 50 AK0010300 000000000000 " " " " "JUNEAU…
4 BH 50 AK0010400 000000000000 " " "20210101" "KETCHI…
5 BH 50 AK0010500 000000000000 " " "20210101" "KODIAK…
6 BH 50 AK0010600 000000000000 " " "20210101" "NOME …
7 BH 50 AK0010700 000000000000 " " "20210101" "PETERS…
8 BH 50 AK0010800 000000000000 " " "20210101" "SEWARD…
9 BH 50 AK0010900 000000000000 " " "20210101" "SITKA …
10 BH 50 AK0011000 000000000000 " " "20210101" "SKAGWA…
# ℹ 25,288 more rows
# ℹ 46 more variables: state_abb <chr>, pop_group <chr>, country_div <chr>,
# country_region <chr>, agency_ind <chr>, core_city <chr>,
# covered_by_ori <chr>, fbi_field_office <chr>, judicial_district <chr>,
# agency_nibrs_flag <chr>, date_agency_inactive <chr>, current_pop1 <chr>,
# ucr_county_code1 <chr>, msa_code1 <chr>, last_pop1 <chr>,
# current_pop2 <chr>, ucr_county_code2 <chr>, msa_code2 <chr>, …
# ℹ Use `print(n = ...)` to see more rows