如何将含有单个 IP 字符串字段的数据框,与另一个含有两个(可包含多个 IP 的)字符串字段的数据框关联起来

问题描述 投票:0回答:1

这是我的第一个数据框:

# Sample CMDB inventory: one row per asset. `s_ip_address` holds a single
# address (or NA); `s_ip_other` holds zero or more addresses separated by
# one or more spaces (or NA).
s_cmdb_id <- c(
  "BEAT0001", "BEAT0002", "BEAT0003", "BEAT0004", "BEAT0005", "BEAT0006",
  "ROLL0001", "ROLL0002", "ROLL0003", "ROLL0004", "ROLL0005", "ROLL0006",
  "ROLL0007",
  "EAGL0001", "EAGL0002", "EAGL0003", "EAGL0004", "EAGL0005"
)
s_name <- c(
  "JOHNL", "PAULM", "GEORGEH", "RINGOS", "BRIANE", "GEORGEM",
  "MICKG", "KEITHR", "CHARLIEW", "RONW", "BILLW", "BRIANJ", "MICKT",
  "GLENNF", "DONH", "TIMB", "JOEW", "DONF"
)
s_dns <- c(
  "johnl.winston.lennon",
  "paulm.sir.james.paul.mccartney",
  "georgeh.george.harrison",
  "ringos.sir.richard.starkey",
  "briane.brian.samuel.epstein",
  "georgem.sir.george.henry.martin",
  "mickj.sir.michael.philip.jagger",
  "keithr.keith.richards",
  "charliew.charles.robert.watts",
  "ronw.ronald.david.wood",
  "billw.william.george.wyman",
  "brianj.lewis.brian.hokin.jones",
  "mickt.michael.kevin.taylor",
  "glennf.glenn.lewis.frey",
  "donh.donald.hugh.henley",
  "timb.timothy.bruce.schmit",
  "joew.joseph.fidler.walsh",
  "donf.donald.william.felder"
)
# Six Beatles assets, seven Rolling assets, five Eagles assets.
s_company <- rep(
  c("Beatles Company", "Rolling Company", "Eagles Company"),
  times = c(6L, 7L, 5L)
)
s_environ <- c(
  "Production", "Production", "Production", "Test", "Contingency",
  "Development", "Production", "Certification", "Pre-Production",
  "Development", "Test", "Production", "Test", "Production", "Production",
  "Development", "Development", "Development"
)
s_ip_address <- c(
  "160.61.22.110", "160.61.22.111", "160.61.22.112", "160.61.22.1",
  "160.61.20.13", "150.23.33.130", "150.23.33.131", "150.23.33.132",
  NA, "150.23.33.134", "150.23.33.135", NA, NA,
  "120.220.210.222", "120.220.210.223", "120.220.210.224",
  "120.220.210.225", "120.220.210.226"
)
# The irregular runs of spaces inside these strings are intentional test data.
s_ip_other <- c(
  "16.30.12.110 14.22.33.1",
  "16.30.12.111",
  "16.30.12.112",
  "16.30.12.1",
  "16.30.12.13",
  NA,
  "14.22.33.131",
  "14.22.33.132",
  "14.22.33.133  13.21.32.1 13.21.32.2",
  "14.22.33.134",
  "14.22.33.135",
  "14.22.33.136 13.20.30.1  13.20.30.2",
  "14.22.33.137   13.19.29.4  13.19.29.44",
  NA,
  NA,
  "110.11.2.3",
  "110.11.2.4",
  "110.11.2.5"
)
s_status <- c(
  "Lawsuit", "Active", "Available", "Active", "Damaged", "Active",
  "Active", "Active", "End of Life", "Lawsuit", "Lost", "Scrapped",
  "Pending Disposal", "Active", "Active", "Pending Wipe", "Wiped", "Wiped"
)
df_cmdb <- data.frame(
  s_cmdb_id = s_cmdb_id,
  s_name = s_name,
  s_dns = s_dns,
  s_company = s_company,
  s_environ = s_environ,
  s_ip_address = s_ip_address,
  s_ip_other = s_ip_other,
  s_status = s_status
)

> df_cmdb
   s_cmdb_id   s_name                           s_dns       s_company      s_environ    s_ip_address                             s_ip_other         s_status
1   BEAT0001    JOHNL            johnl.winston.lennon Beatles Company     Production   160.61.22.110                16.30.12.110 14.22.33.1          Lawsuit
2   BEAT0002    PAULM  paulm.sir.james.paul.mccartney Beatles Company     Production   160.61.22.111                           16.30.12.111           Active
3   BEAT0003  GEORGEH         georgeh.george.harrison Beatles Company     Production   160.61.22.112                           16.30.12.112        Available
4   BEAT0004   RINGOS      ringos.sir.richard.starkey Beatles Company           Test     160.61.22.1                             16.30.12.1           Active
5   BEAT0005   BRIANE     briane.brian.samuel.epstein Beatles Company    Contingency    160.61.20.13                            16.30.12.13          Damaged
6   BEAT0006  GEORGEM georgem.sir.george.henry.martin Beatles Company    Development   150.23.33.130                                   <NA>           Active
7   ROLL0001    MICKG mickj.sir.michael.philip.jagger Rolling Company     Production   150.23.33.131                           14.22.33.131           Active
8   ROLL0002   KEITHR           keithr.keith.richards Rolling Company  Certification   150.23.33.132                           14.22.33.132           Active
9   ROLL0003 CHARLIEW   charliew.charles.robert.watts Rolling Company Pre-Production            <NA>    14.22.33.133  13.21.32.1 13.21.32.2      End of Life
10  ROLL0004     RONW          ronw.ronald.david.wood Rolling Company    Development   150.23.33.134                           14.22.33.134          Lawsuit
11  ROLL0005    BILLW      billw.william.george.wyman Rolling Company           Test   150.23.33.135                           14.22.33.135             Lost
12  ROLL0006   BRIANJ  brianj.lewis.brian.hokin.jones Rolling Company     Production            <NA>    14.22.33.136 13.20.30.1  13.20.30.2         Scrapped
13  ROLL0007    MICKT      mickt.michael.kevin.taylor Rolling Company           Test            <NA> 14.22.33.137   13.19.29.4  13.19.29.44 Pending Disposal
14  EAGL0001   GLENNF         glennf.glenn.lewis.frey  Eagles Company     Production 120.220.210.222                                   <NA>           Active
15  EAGL0002     DONH         donh.donald.hugh.henley  Eagles Company     Production 120.220.210.223                                   <NA>           Active
16  EAGL0003     TIMB       timb.timothy.bruce.schmit  Eagles Company    Development 120.220.210.224                             110.11.2.3     Pending Wipe
17  EAGL0004     JOEW        joew.joseph.fidler.walsh  Eagles Company    Development 120.220.210.225                             110.11.2.4            Wiped
18  EAGL0005     DONF      donf.donald.william.felder  Eagles Company    Development 120.220.210.226                             110.11.2.5            Wiped

这是我的第二个数据框:

# Sample scan results: one row per finding, keyed by a single IP address.
s_ip <- c(
  "14.22.33.1", "14.22.33.1", "16.30.12.111", "16.30.12.111",
  "150.23.33.132", "13.21.32.2", "13.19.29.4"
)
s_dns <- c(
  "johnl.winston.lennon.dev",
  "johnl.winston.lennon.dev",
  "paulm.sir.james.paul.mccartney",
  NA,
  "keithr.keith.richards",
  "charliew.charles.robert.watts.cert",
  NA
)
s_qid <- c("38169", "38170", "38601", "38603", "38909", "38655", "42366")
n_port <- c(56134L, 56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol <- c("tcp", "tcp", "tcp", "udp", "tcp", "tcp", "tcp")
df_scan <- data.frame(
  s_ip = s_ip,
  s_dns = s_dns,
  s_qid = s_qid,
  n_port = n_port,
  s_protocol = s_protocol
)

> df_scan
           s_ip                              s_dns s_qid n_port s_protocol
1    14.22.33.1           johnl.winston.lennon.dev 38169  56134        tcp
2    14.22.33.1           johnl.winston.lennon.dev 38170  56134        tcp
3  16.30.12.111     paulm.sir.james.paul.mccartney 38601  56134        tcp
4  16.30.12.111                               <NA> 38603    221        udp
5 150.23.33.132              keithr.keith.richards 38909     22        tcp
6    13.21.32.2 charliew.charles.robert.watts.cert 38655   5634        tcp
7    13.19.29.4                               <NA> 42366   9090        tcp

这是我的最终数据框:

# Expected result: every scan row enriched with the CMDB asset that owns
# the scanned IP.
s_ip <- c(
  "14.22.33.1", "14.22.33.1", "16.30.12.111", "16.30.12.111",
  "150.23.33.132", "13.21.32.2", "13.19.29.4"
)
s_dns <- c(
  "johnl.winston.lennon.dev",
  "johnl.winston.lennon.dev",
  "paulm.sir.james.paul.mccartney",
  NA,
  "keithr.keith.richards",
  "charliew.charles.robert.watts.cert",
  NA
)
s_qid <- c("38169", "38170", "38601", "38603", "38909", "38655", "42366")
n_port <- c(56134L, 56134L, 56134L, 221L, 22L, 5634L, 9090L)
s_protocol <- c("tcp", "tcp", "tcp", "udp", "tcp", "tcp", "tcp")
s_cmdb_id <- c(
  "BEAT0001", "BEAT0001", "BEAT0002", "BEAT0002",
  "ROLL0002", "ROLL0003", "ROLL0007"
)
s_name <- c("JOHNL", "JOHNL", "PAULM", "PAULM", "KEITHR", "CHARLIEW", "MICKT")
# Four Beatles rows followed by three Rolling rows.
s_company <- rep(c("Beatles Company", "Rolling Company"), times = c(4L, 3L))
s_status <- c(
  "Lawsuit", "Lawsuit", "Active", "Active", "Active",
  "End of Life", "Pending Disposal"
)
df_final <- data.frame(
  s_ip = s_ip,
  s_dns = s_dns,
  s_qid = s_qid,
  n_port = n_port,
  s_protocol = s_protocol,
  s_cmdb_id = s_cmdb_id,
  s_name = s_name,
  s_company = s_company,
  s_status = s_status
)

> df_final
           s_ip                              s_dns s_qid n_port s_protocol s_cmdb_id   s_name       s_company         s_status
1    14.22.33.1           johnl.winston.lennon.dev 38169  56134        tcp  BEAT0001    JOHNL Beatles Company          Lawsuit
2    14.22.33.1           johnl.winston.lennon.dev 38170  56134        tcp  BEAT0001    JOHNL Beatles Company          Lawsuit
3  16.30.12.111     paulm.sir.james.paul.mccartney 38601  56134        tcp  BEAT0002    PAULM Beatles Company           Active
4  16.30.12.111                               <NA> 38603    221        udp  BEAT0002    PAULM Beatles Company           Active
5 150.23.33.132              keithr.keith.richards 38909     22        tcp  ROLL0002   KEITHR Rolling Company           Active
6    13.21.32.2 charliew.charles.robert.watts.cert 38655   5634        tcp  ROLL0003 CHARLIEW Rolling Company      End of Life
7    13.19.29.4                               <NA> 42366   9090        tcp  ROLL0007    MICKT Rolling Company Pending Disposal
  • 如您所见,df_final 是 df_scan 和 df_cmdb 的 left_join。
  • 新增了 4 个字段(s_cmdb_id、s_name、s_company、s_status),关联条件是:df_scan 的 s_ip 字段与 df_cmdb 的 s_ip_address 或 s_ip_other 字段中的任一 IP 相等。
  • s_ip_address 和 s_ip_other 字段中的 ip 地址可以用一个或多个空格分隔。
  • 请注意,IP 地址可能相似但并不相等,例如 14.22.33.1 与 14.22.33.131 相似但不相同。
  • 列 s_ip_address 和 s_ip_other 可能具有 NA 值。
  • 在 df_final 数据框中建立这种关系的最佳方法是什么?
  • 请考虑 df_scan 包含 100 万条记录,df_cmdb 包含 100k 条记录。
  • 提前致谢。
  • 我是 R 语言的初学者。
r
1个回答
0
投票

使用 dplyr,可以在 df_scan 上通过 left_join 实现。先用 paste 和 strsplit 准备 df_cmdb。

这里假设 s_ip_address 和 s_ip_other 中的值并非全部为 NA。

library(dplyr)
library(tidyr) # unnest() is exported by tidyr, not dplyr

# Long lookup table: one row per (asset, IP address).
# - NA is replaced by "" before paste(), which would otherwise emit the
#   literal token "NA".
# - trimws() removes the leading/trailing blank left when one side is empty,
#   so strsplit() does not produce an empty first token.
# - Assets with no address at all yield character(0) and are dropped by
#   unnest().
# - distinct() guards against an IP listed in both columns of one asset,
#   which would otherwise duplicate the matching scan rows.
cmdb_ips <- df_cmdb %>%
  mutate(
    s_ip_all = strsplit(
      trimws(paste(coalesce(s_ip_address, ""), coalesce(s_ip_other, ""))),
      " +"
    )
  ) %>%
  select(s_cmdb_id, s_name, s_company, s_status, s_ip_all) %>%
  unnest(s_ip_all) %>%
  distinct()

# Exact-match join, so "14.22.33.1" never matches "14.22.33.131".
left_join(df_scan, cmdb_ips, join_by(s_ip == s_ip_all))

输出

           s_ip                              s_dns s_qid n_port s_protocol
1    14.22.33.1           johnl.winston.lennon.dev 38169  56134        tcp
2    14.22.33.1           johnl.winston.lennon.dev 38170  56134        tcp
3  16.30.12.111     paulm.sir.james.paul.mccartney 38601  56134        tcp
4  16.30.12.111                               <NA> 38603    221        udp
5 150.23.33.132              keithr.keith.richards 38909     22        tcp
6    13.21.32.2 charliew.charles.robert.watts.cert 38655   5634        tcp
7    13.19.29.4                               <NA> 42366   9090        tcp
  s_cmdb_id   s_name       s_company         s_status
1  BEAT0001    JOHNL Beatles Company          Lawsuit
2  BEAT0001    JOHNL Beatles Company          Lawsuit
3  BEAT0002    PAULM Beatles Company           Active
4  BEAT0002    PAULM Beatles Company           Active
5  ROLL0002   KEITHR Rolling Company           Active
6  ROLL0003 CHARLIEW Rolling Company      End of Life
7  ROLL0007    MICKT Rolling Company Pending Disposal

如果您想使用多核,也可以尝试data.table

library(data.table)

setDT(df_cmdb)
setDT(df_scan)

# Split both address columns into one character vector of IPs per asset.
# NA is replaced by "" before paste(), which would otherwise emit the literal
# token "NA"; trimws() avoids an empty first token when one side is empty.
ip_list <- strsplit(
  trimws(paste(
    fcoalesce(df_cmdb$s_ip_address, ""),
    fcoalesce(df_cmdb$s_ip_other, "")
  )),
  " +"
)

# Expand to one row per (IP, asset) in a single vectorised step instead of
# grouping by row number, which is slow on large tables. Each asset row is
# repeated once per IP it owns; assets without any IP disappear.
cmdb_long <- unique(data.table(
  s_ip = as.character(unlist(ip_list)),
  df_cmdb[
    rep(seq_len(nrow(df_cmdb)), lengths(ip_list)),
    .(s_cmdb_id, s_name, s_company, s_status)
  ]
))

# cmdb_long[df_scan] keeps every scan row (left join from df_scan's side).
cmdb_long[df_scan, on = "s_ip"]

输出

            s_ip s_cmdb_id   s_name       s_company         s_status
          <char>    <char>   <char>          <char>           <char>
1:    14.22.33.1  BEAT0001    JOHNL Beatles Company          Lawsuit
2:    14.22.33.1  BEAT0001    JOHNL Beatles Company          Lawsuit
3:  16.30.12.111  BEAT0002    PAULM Beatles Company           Active
4:  16.30.12.111  BEAT0002    PAULM Beatles Company           Active
5: 150.23.33.132  ROLL0002   KEITHR Rolling Company           Active
6:    13.21.32.2  ROLL0003 CHARLIEW Rolling Company      End of Life
7:    13.19.29.4  ROLL0007    MICKT Rolling Company Pending Disposal
                                s_dns  s_qid n_port s_protocol
                               <char> <char>  <int>     <char>
1:           johnl.winston.lennon.dev  38169  56134        tcp
2:           johnl.winston.lennon.dev  38170  56134        tcp
3:     paulm.sir.james.paul.mccartney  38601  56134        tcp
4:                               <NA>  38603    221        udp
5:              keithr.keith.richards  38909     22        tcp
6: charliew.charles.robert.watts.cert  38655   5634        tcp
7:                               <NA>  42366   9090        tcp
© www.soinside.com 2019 - 2024. All rights reserved.