gsub for 循环创建重复的字符串

问题描述 投票:0回答:1

我有这个数据框:

test2 <- structure(list(AU = c("MEISYARA D;GUSWENRIVO I;SINGHAM GV", "VANGALA RM;LOSHALI A;BASA KS;CH G;MASTHAN S;GANACHARI BC;MUNGALA SR;TADAKAMADLA J;TADAKAMADLA SK;BALLA SB", 
                               "SEBASTIAN S;FRANCO A;MÂNICA S", "HUO D-M;MAO X-Y;MO W-W;ZHAO F-M;DU M;SUN R-R", 
                               "KIHARA EN;KARANJA SM;WANZALA P;WAGAIYU EG", "LIANG CT;HIGGINS D;ASHAR A", 
                               "SINDI MA;AL-SEBAEI MO;BAMASHMOUS MS", "PALOMINO-SOTO M;CARRANZA-SAMANEZ K;DULANTO-VARGAS J;QUEZADA-MARQUEZ MM;FONSECA GM;RAMIREZ-WONG F", 
                               "PANCI V;HACKMAN L", "STEVENS DW;DUNN MR;MILLS VS;BOWDEN DA;MCMILLAN PJ;HART AC;CHIN C;DAVEY NK;PINKERTON MH", 
                               "ŠVÁBOVÁ NEE UHROVÁ P;BEŇUŠ R;CHOVANCOVÁ NEE KONDEKOVÁ M;VOJTUŠOVÁ A;NOVOTNÝ M;THURZO A", 
                               "RIAZ S;KHAMIS MF;ABDULLAH JY;AHMAD WMAW;ALAM MK", "FAN F;KE W;DAI X;SHI L;LIU Y;LIN Y;CHENG Z;ZHANG Y;CHEN H;DENG Z", 
                               "CASTILLO-ALONSO C;TABILO L;LÓPEZ-LÁZARO S", "ROBERTS G;LUCAS VS;CAMILLERI S;JAYARAMAN J;KASPER KA;LEWIS JM", 
                               "BU W-Q;GUO Y-X;ZHANG D;DU S-Y;HAN M-Q;WU Z-X;TANG Y;CHEN T;GUO Y-C;MENG H-T", 
                               "TANAKA S;KARIBE H;KATO Y;KOMATSUZAKI A;SEKIMOTO T;SHIMOMURA-KUROKI J", 
                               "KLINGBERG G;BENCHIMOL D;BERLIN H;BRING J;GORNITZKI C;ODEBERG J;TRANÆUS S;TWETMAN S;WERNERSSON E;ÖSTLUND P;DOMEIJ H", 
                               "TSOGTSAIKHAN K;HATANO Y;KOSAKA M;YOSHIDA K;MINJUUR T;GARIDKHUU A;SASAKI K;SUZUKI T", 
                               "LOPATIN O;BARSZCZ M;WOŹNIAK KJ", "SINGH M", "VILA-BLANCO N;VARAS-QUINTANA P;TOMÁS I;CARREIRA MJ", 
                               "GIRIJAN P;BOEDI R;MÂNICA S;FRANCO A", "KAVYA S;PRABHU S;ADYANTHYA S;MIQDAD S;ABDULLA R;JAYARAJAN D", 
                               "HINTON MS;MCMILLAN BR;HERSEY KR;LARSEN RT", "HASSAN A;ELHOSENY M;KAYED M", 
                               "JAYAPRIYA T;KELUSKAR V;SRIDHAR M;LOKESH KUMAR S;FERNANDES A", 
                               "NUZZOLESE E;MALERBA G;VELLA GD", "BARRONE J;VIDAL MC;STEVENSON R", 
                               "TIMME M;VIKTOROV J;STEFFENS L;STREETER A;KARCH A;SCHMELING A", 
                               "ALWOHAIBI RN;ALMAIMONI RA;ALSHREFY AJ;ALMUSAILET LI;ALHAZZAA SA;MENEZES RG", 
                               "EL-DESOUKY SS;KABBASH IA", "ANDERSEN IL;OCEPEK M;THINGNES SL;NEWBERRY RC", 
                               "WANG C;TIAN ZK;WEN D;QU W;XU R;LIU Y;JIA H;TANG X;LI J;ZHA L;LIU Y", 
                               "MORGAN J", "SALAZAR-VALENZUELA L;LÓPEZ-LÁZARO S;AGUAYO-CÁDIZ JE;CAPITANEANU C;FONSECA GM", 
                               "MAHABALA KY;NATARAJAN S;MAHMOOD MT;NAYAK AP;RAO A", "BJELOPAVLOVIC M;REDER SR;FRITZEN I;BROCKMANN MA;HARDT J;PETROWSKI K", 
                               "LORETO DBL;BARROS BÁCD", "WALIA M;BHATI K;GULLAIYA J", "ARROYO-BOTE S;MARTÍNEZ-ARROYO C;GALLEGO-ÁLVAREZ MÁ;ARROYO-BOTE C;MANZANARES-CÉSPEDES MC", 
                               "SOORNEEDI N;VENKATACHALAIAH A;MANIKYA S;DASARI A;GADDAM B;LATHA A", 
                               "PALMELA PEREIRA CM;DOS SANTOS AR;GONÇALVES CR;NUSHI V;COUTINHO F;SALVADO E SILVA FJ;DE SOUSA SANTOS RFV", 
                               "JAWANDA MK;GUPTA S;SANDHU H;ESCOBEDO RLO;BHULLAR HS;HAMZA M", 
                               "HARUDIN MH;FRANCO A;JAFFAR N;NOOR MHM;IBRAHIM MA;MANICA S", 
                               "KUHNEN B;FERNANDES CMS;BARROS F;SCARSO FILHO J;GONÇALVES M;SERRA MC", 
                               "VODANOVIĆ M;SUBAŠIĆ M;MILOŠEVIĆ DP;GALIĆ I;BRKIĆ H", 
                               "RAMKUMAR J;GANESH R;VINAY J", "RIVERA-MENDOZA F;ESPINOZA-SILVA PV;FONSECA GM", 
                               "PEREIRA CP;SANTOS R;NUSHI V;LAMEIRO MV;ANTUNES P;CARVALHO R;MAJOR T;ALQAHTANI SJ"
), Author.s..ID = c("57211806807; 57192983097; 36952129500", 
                    "58235484500; 58236398600; 58235265200; 57217734842; 58235484600; 58236171600; 58235043100; 36183085400; 57207799926; 56878742000", 
                    "58235287100; 57205450144; 57189010056", "57215812605; 58519454000; 57924026600; 57924660300; 57829018800; 58519185900", 
                    "57195633934; 6506257821; 6508066300; 6602243600", "58195847100; 15135714000; 58195400400", 
                    "57218218739; 6602870096; 57190741097", "58307513800; 58308177600; 58022245400; 57192995591; 35190337400; 35520201500", 
                    "58484954800; 55185444800", "7401571955; 7402116738; 25651855700; 8552552500; 7101707095; 7201706201; 58303547300; 58569857500; 7003793212", 
                    "57416602300; 6701772603; 57416602400; 58080360400; 58081085000; 36028646100", 
                    "57203321439; 55357687600; 24833021600; 55163807800; 7401492628", 
                    "56727508100; 57218226768; 35932019300; 57203955437; 56668190400; 58081759100; 58082278400; 57840021300; 57189326786; 36839218900", 
                    "58544769500; 58544603300; 46861416800", "57061571400; 8042276500; 6701854111; 51565457300; 26532954500; 56764949400", 
                    "57391780400; 57188842428; 57203335815; 57215374469; 57203317671; 58191839100; 58192302000; 35236150000; 56009909700; 55916628500", 
                    "57194377552; 7003893659; 56539124800; 55318758500; 56441159900; 7004885391", 
                    "55900416400; 57207514529; 57194834663; 6701764799; 57039166200; 58352179300; 6602247508; 57203616204; 58182056400; 56730646000; 58478877100", 
                    "57428699300; 56797610300; 55957033800; 57429757800; 58141532500; 12761531400; 7404487919; 55731667900", 
                    "56898875600; 57224073300; 26640785000", "57195629545", "57205495992; 57217653844; 7005882178; 15061137300", 
                    "58136445000; 57210935896; 57189010056; 57205450144", "58540862700; 25522097000; 58540160600; 57423908600; 57195430892; 58540862800", 
                    "57783544200; 7006925639; 55206855800; 16245238300", "57220773969; 57148260400; 14422375000", 
                    "58074312500; 6504295343; 57204201957; 57268460000; 57901658300", 
                    "56056565700; 6602086533; 57198019376", "58507483800; 56237340500; 7402601773", 
                    "56072719400; 58132576500; 57297746300; 55147892100; 57223444197; 7003946207", 
                    "57904670600; 57904055400; 57904270300; 57904670700; 57904471800; 55517099900", 
                    "57194481538; 16835094000", "7101937964; 35750447700; 55330827100; 7005305192", 
                    "57209341512; 58108459600; 57203920354; 57221468725; 57919847100; 57857535000; 57919671100; 57914942600; 56928003400; 54786023300; 55850166100", 
                    "57700221200", "57539490700; 46861416800; 57540135000; 57195464981; 35190337400", 
                    "57552704200; 57903377000; 58562326200; 57194032731; 57191113816", 
                    "57222986450; 57338272500; 58451580100; 7003454067; 56236335200; 8904910000", 
                    "57209570307; 58453037100", "57325475900; 57465984000; 57396827200", 
                    "56203448200; 57855205700; 57855205800; 57855831000; 57205131975", 
                    "57219023505; 58537713700; 57189588884; 58537055700; 57559779600; 58537055800", 
                    "55433307900; 58186424700; 58070106900; 58185591700; 58185926500; 58186424800; 56979441000", 
                    "55556592000; 57213338080; 58144251500; 58295221000; 58292275300; 57218176879", 
                    "57192662543; 57205450144; 58157816800; 57193712434; 55358169200; 57189010056", 
                    "57221841980; 50261506600; 57221849486; 6504233232; 7202320517; 54917611300", 
                    "12446817300; 6508382225; 57211558771; 57202373100; 6701830546", 
                    "58236897700; 58237381400; 56631932700", "57194795258; 57777793500; 35190337400", 
                    "55433307900; 56979441000; 58185591700; 58145498900; 58145499000; 58145697100; 58145089200; 56450570300"
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))

我想要做的是确定 test2$AU 中是否有来自 1 test2$Author.s..ID 的不同名称。所以我制作了这个脚本:

library(tidyverse)
library(stringr)
au <- strsplit(test2$AU, ";")
author_ids <- strsplit(test2$Author.s..ID, ";")

max_elements <- max(lengths(au), lengths(author_ids))


au <- lapply(au, function(x) c(x, rep(NA, max_elements - length(x))))
author_ids <- lapply(author_ids, function(x) c(x, rep(NA, max_elements - length(x))))

author_data <- data.frame(
  AU = unlist(au, use.names = FALSE),
  Author.s..ID = unlist(author_ids, use.names = FALSE)
)

author_data <- na.omit(author_data)
author_data$Author.s..ID <- gsub(" ","", author_data$Author.s..ID)
author_data <- unique(author_data)


mismatches <- author_data %>%
  group_by(Author.s..ID) %>%
  summarize(AU_count = n_distinct(AU))


mismatches <- mismatches %>%
  filter(AU_count > 1)

if (nrow(mismatches) > 0) {
  print(mismatches)
} else {
  cat("No mismatches or variations found.")
}

然后我创建一个 for 循环来替换所有不匹配的内容:

for (i in 1:nrow(mismatches)) {
  q <- mismatches[i, 1]
  rows <- author_data[which(author_data$Author.s..ID == as.character(q)), ]
  for (y in 1:nrow(rows)) {
    name_replaced <- rows$AU[2:y]
    name_fixed <- rows$AU[1]
    for (j in 1:nrow(test2)) {
      for (k in 1:length(name_replaced)) {
        if (grepl(name_replaced[k], test2$AU[j])) {
          test2$AU[j] <- gsub(name_replaced[k], name_fixed, test2$AU[j])
        } else {test2$AU[j] <- test2$AU[j]
        }
      }
    }
  }
}

我制作的 for 循环创建了一个奇怪的字符串子,它重复字符,如果我尝试运行循环,我仍然不匹配,并且输出仍然不正确。

author_data[which(author_data$Author.s..ID == "56979441000"),]

我哪里做错了?

谢谢!

r string for-loop gsub
1个回答
0
投票

如果你最终想要做的是找到多次归属的ID和具有多个ID的作者,我建议你先整理一下数据,每行有一个作者和作者ID。然后查找重复项就变得容易多了:

library(tidyverse)
# tidy up your data
test2 <- test2 |>
  mutate(ManuscriptID = row_number(), # I assumed these are articles or manuscripts?
         AU = map(AU, \(x) trimws(unlist(strsplit(x,";")))),
         Author.s..ID = map(Author.s..ID, \(x) trimws(unlist(strsplit(x,";"))))) |>
  unnest(c(AU, Author.s..ID))

# find Authors having multiple IDs
test2 |>
  group_by(AU) |>
  filter(n()>1) |>
  arrange(Author.s..ID)

# deal with Authors having multiple IDs
# ... insert your own code here

# find IDs attributed multiple times
test2 |>
  group_by(Author.s..ID) |>
  filter(n()>1) |>
  arrange(Author.s..ID)

# deal with IDs attributed multiple times
# ... insert your own code here

# restore the previous data structure
test2 |>
  group_by(ManuscriptID) |>
  summarise(across(everything(), ~ paste(.x, collapse="; ")))

我让您决定如何更改这两种情况下的重复 ID。

© www.soinside.com 2019 - 2024. All rights reserved.