这是我的第一个数据框:
s_cmdb_id = c("BEAT0001", "BEAT0002", "BEAT0003","BEAT0004", "BEAT0005", "BEAT0006", "ROLL0001", "ROLL0002", "ROLL0003","ROLL0004", "ROLL0005", "ROLL0006", "ROLL0007", "EAGL0001", "EAGL0002","EAGL0003", "EAGL0004", "EAGL0005")
s_name = c("JOHNL", "PAULM","GEORGEH", "RINGOS", "BRIANE", "GEORGEM", "MICKG", "KEITHR","CHARLIEW", "RONW", "BILLW", "BRIANJ", "MICKT", "GLENNF", "DONH", "TIMB", "JOEW", "DONF")
s_dns = c("johnl.winston.lennon", "paulm.sir.james.paul.mccartney","georgeh.george.harrison", "ringos.sir.richard.starkey", "briane.brian.samuel.epstein","georgem.sir.george.henry.martin", "mickj.sir.michael.philip.jagger","keithr.keith.richards", "charliew.charles.robert.watts", "ronw.ronald.david.wood","billw.william.george.wyman", "brianj.lewis.brian.hokin.jones","mickt.michael.kevin.taylor", "glennf.glenn.lewis.frey", "donh.donald.hugh.henley","timb.timothy.bruce.schmit", "joew.joseph.fidler.walsh", "donf.donald.william.felder")
s_company = c("Beatles Company", "Beatles Company", "Beatles Company","Beatles Company", "Beatles Company", "Beatles Company", "Rolling Company","Rolling Company", "Rolling Company", "Rolling Company", "Rolling Company","Rolling Company", "Rolling Company", "Eagles Company", "Eagles Company","Eagles Company", "Eagles Company", "Eagles Company")
s_environ = c("Production","Production", "Production", "Test", "Contingency", "Development","Production", "Certification", "Pre-Production", "Development","Test", "Production", "Test", "Production", "Production", "Development","Development", "Development")
s_ip_address = c("160.61.22.110","160.61.22.111", "160.61.22.112", "160.61.22.1", "160.61.20.13","150.23.33.130", "150.23.33.131", "150.23.33.132", NA, "150.23.33.134","150.23.33.135", NA, NA, "120.220.210.222", "120.220.210.223","120.220.210.224", "120.220.210.225", "120.220.210.226")
s_ip_other = c("16.30.12.110 14.22.33.1","16.30.12.111", "16.30.12.112", "16.30.12.1", "16.30.12.13",NA, "14.22.33.131", "14.22.33.132", "14.22.33.133 13.21.32.1 13.21.32.2","14.22.33.134", "14.22.33.135", "14.22.33.136 13.20.30.1 13.20.30.2","14.22.33.137 13.19.29.4 13.19.29.44", NA, NA, "110.11.2.3","110.11.2.4", "110.11.2.5")
s_status = c("Lawsuit", "Active","Available", "Active", "Damaged", "Active", "Active", "Active","End of Life", "Lawsuit", "Lost", "Scrapped", "Pending Disposal","Active", "Active", "Pending Wipe", "Wiped", "Wiped")
df_cmdb <- data.frame(s_cmdb_id,s_name,s_dns,s_company,s_environ,s_ip_address,s_ip_other,s_status)
> df_cmdb
s_cmdb_id s_name s_dns s_company s_environ s_ip_address s_ip_other s_status
1 BEAT0001 JOHNL johnl.winston.lennon Beatles Company Production 160.61.22.110 16.30.12.110 14.22.33.1 Lawsuit
2 BEAT0002 PAULM paulm.sir.james.paul.mccartney Beatles Company Production 160.61.22.111 16.30.12.111 Active
3 BEAT0003 GEORGEH georgeh.george.harrison Beatles Company Production 160.61.22.112 16.30.12.112 Available
4 BEAT0004 RINGOS ringos.sir.richard.starkey Beatles Company Test 160.61.22.1 16.30.12.1 Active
5 BEAT0005 BRIANE briane.brian.samuel.epstein Beatles Company Contingency 160.61.20.13 16.30.12.13 Damaged
6 BEAT0006 GEORGEM georgem.sir.george.henry.martin Beatles Company Development 150.23.33.130 <NA> Active
7 ROLL0001 MICKG mickj.sir.michael.philip.jagger Rolling Company Production 150.23.33.131 14.22.33.131 Active
8 ROLL0002 KEITHR keithr.keith.richards Rolling Company Certification 150.23.33.132 14.22.33.132 Active
9 ROLL0003 CHARLIEW charliew.charles.robert.watts Rolling Company Pre-Production <NA> 14.22.33.133 13.21.32.1 13.21.32.2 End of Life
10 ROLL0004 RONW ronw.ronald.david.wood Rolling Company Development 150.23.33.134 14.22.33.134 Lawsuit
11 ROLL0005 BILLW billw.william.george.wyman Rolling Company Test 150.23.33.135 14.22.33.135 Lost
12 ROLL0006 BRIANJ brianj.lewis.brian.hokin.jones Rolling Company Production <NA> 14.22.33.136 13.20.30.1 13.20.30.2 Scrapped
13 ROLL0007 MICKT mickt.michael.kevin.taylor Rolling Company Test <NA> 14.22.33.137 13.19.29.4 13.19.29.44 Pending Disposal
14 EAGL0001 GLENNF glennf.glenn.lewis.frey Eagles Company Production 120.220.210.222 <NA> Active
15 EAGL0002 DONH donh.donald.hugh.henley Eagles Company Production 120.220.210.223 <NA> Active
16 EAGL0003 TIMB timb.timothy.bruce.schmit Eagles Company Development 120.220.210.224 110.11.2.3 Pending Wipe
17 EAGL0004 JOEW joew.joseph.fidler.walsh Eagles Company Development 120.220.210.225 110.11.2.4 Wiped
18 EAGL0005 DONF donf.donald.william.felder Eagles Company Development 120.220.210.226
110.11.2.5 Wiped
这是我的第二个数据框:
s_ip = c("14.22.33.1", "14.22.33.1", "16.30.12.111","16.30.12.111", "150.23.33.132", "13.21.32.2", "13.19.29.4")
s_dns = c("johnl.winston.lennon.dev", "johnl.winston.lennon.dev","paulm.sir.james.paul.mccartney", NA, "keithr.keith.richards","charliew.charles.robert.watts.cert", NA)
s_qid = c("38169","38170", "38601", "38603", "38909", "38655", "42366")
n_port = c(56134L,56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol = c("tcp","tcp", "tcp", "udp", "tcp", "tcp", "tcp")
df_scan <- data.frame(s_ip,s_dns,s_qid,n_port,s_protocol)
> df_scan
s_ip s_dns s_qid n_port s_protocol
1 14.22.33.1 johnl.winston.lennon.dev 38169 56134 tcp
2 14.22.33.1 johnl.winston.lennon.dev 38170 56134 tcp
3 16.30.12.111 paulm.sir.james.paul.mccartney 38601 56134 tcp
4 16.30.12.111 <NA> 38603 221 udp
5 150.23.33.132 keithr.keith.richards 38909 22 tcp
6 13.21.32.2 charliew.charles.robert.watts.cert 38655 5634 tcp
7 13.19.29.4 <NA> 42366 9090 tcp
这是我的最终数据框:
s_ip = c("14.22.33.1", "14.22.33.1", "16.30.12.111","16.30.12.111", "150.23.33.132", "13.21.32.2", "13.19.29.4")
s_dns = c("johnl.winston.lennon.dev", "johnl.winston.lennon.dev","paulm.sir.james.paul.mccartney", NA, "keithr.keith.richards","charliew.charles.robert.watts.cert", NA)
s_qid = c("38169","38170", "38601", "38603", "38909", "38655", "42366")
n_port = c(56134L,56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol = c("tcp","tcp", "tcp", "udp", "tcp", "tcp", "tcp")
s_cmdb_id = c("BEAT0001","BEAT0001", "BEAT0002", "BEAT0002", "ROLL0002", "ROLL0003","ROLL0007")
s_name = c("JOHNL", "JOHNL", "PAULM", "PAULM","KEITHR", "CHARLIEW", "MICKT")
s_company = c("Beatles Company","Beatles Company", "Beatles Company", "Beatles Company","Rolling Company", "Rolling Company", "Rolling Company")
s_status = c("Lawsuit", "Lawsuit", "Active", "Active", "Active","End of Life", "Pending Disposal")
df_final <- data.frame(s_ip,s_dns,s_qid,n_port,s_protocol,s_cmdb_id,s_name,s_company,s_status)
> df_final
s_ip s_dns s_qid n_port s_protocol s_cmdb_id s_name s_company s_status
1 14.22.33.1 johnl.winston.lennon.dev 38169 56134 tcp BEAT0001 JOHNL Beatles Company Lawsuit
2 14.22.33.1 johnl.winston.lennon.dev 38170 56134 tcp BEAT0001 JOHNL Beatles Company Lawsuit
3 16.30.12.111 paulm.sir.james.paul.mccartney 38601 56134 tcp BEAT0002 PAULM Beatles Company Active
4 16.30.12.111 <NA> 38603 221 udp BEAT0002 PAULM Beatles Company Active
5 150.23.33.132 keithr.keith.richards 38909 22 tcp ROLL0002 KEITHR Rolling Company Active
6 13.21.32.2 charliew.charles.robert.watts.cert 38655 5634 tcp ROLL0003 CHARLIEW Rolling Company End of Life
7 13.19.29.4 <NA> 42366 9090 tcp ROLL0007 MICKT Rolling Company Pending Disposal
使用 dplyr,这是一种在
df_scan上使用
left_join
的方法。使用 paste
和 strsplit
准备 df_cmdb。
假设 s_ip_address 或 s_ip_other 中的所有值并非都是
NA
。
library(dplyr)
left_join(df_scan, df_cmdb %>%
mutate(s_ip_all = paste(s_ip_address, s_ip_other),
s_ip_all = strsplit(s_ip_all, " +"),
s_dns = NULL,
s_environ = NULL,
s_ip_address = NULL,
s_ip_other = NULL) %>%
unnest(s_ip_all), join_by(s_ip == s_ip_all))
输出
s_ip s_dns s_qid n_port s_protocol
1 14.22.33.1 johnl.winston.lennon.dev 38169 56134 tcp
2 14.22.33.1 johnl.winston.lennon.dev 38170 56134 tcp
3 16.30.12.111 paulm.sir.james.paul.mccartney 38601 56134 tcp
4 16.30.12.111 <NA> 38603 221 udp
5 150.23.33.132 keithr.keith.richards 38909 22 tcp
6 13.21.32.2 charliew.charles.robert.watts.cert 38655 5634 tcp
7 13.19.29.4 <NA> 42366 9090 tcp
s_cmdb_id s_name s_company s_status
1 BEAT0001 JOHNL Beatles Company Lawsuit
2 BEAT0001 JOHNL Beatles Company Lawsuit
3 BEAT0002 PAULM Beatles Company Active
4 BEAT0002 PAULM Beatles Company Active
5 ROLL0002 KEITHR Rolling Company Active
6 ROLL0003 CHARLIEW Rolling Company End of Life
7 ROLL0007 MICKT Rolling Company Pending Disposal
如果您想使用多核,也可以尝试data.table。
library(data.table)
setDT(df_cmdb)
setDT(df_scan)
df_cmdb[, .(s_ip = unlist(strsplit(paste(s_ip_address, s_ip_other), " +")),
s_cmdb_id, s_name, s_company, s_status),
by=1:nrow(df_cmdb)][df_scan, on="s_ip"][, -"nrow"]
输出
s_ip s_cmdb_id s_name s_company s_status
<char> <char> <char> <char> <char>
1: 14.22.33.1 BEAT0001 JOHNL Beatles Company Lawsuit
2: 14.22.33.1 BEAT0001 JOHNL Beatles Company Lawsuit
3: 16.30.12.111 BEAT0002 PAULM Beatles Company Active
4: 16.30.12.111 BEAT0002 PAULM Beatles Company Active
5: 150.23.33.132 ROLL0002 KEITHR Rolling Company Active
6: 13.21.32.2 ROLL0003 CHARLIEW Rolling Company End of Life
7: 13.19.29.4 ROLL0007 MICKT Rolling Company Pending Disposal
s_dns s_qid n_port s_protocol
<char> <char> <int> <char>
1: johnl.winston.lennon.dev 38169 56134 tcp
2: johnl.winston.lennon.dev 38170 56134 tcp
3: paulm.sir.james.paul.mccartney 38601 56134 tcp
4: <NA> 38603 221 udp
5: keithr.keith.richards 38909 22 tcp
6: charliew.charles.robert.watts.cert 38655 5634 tcp
7: <NA> 42366 9090 tcp