我正在使用 R 进行网页抓取,想把 IMDb 排名前 250 的电影提取为一个数据框(data frame)。到目前为止,我的代码很短:
# Load the tidyverse (for str_remove) and rvest (for HTML parsing).
library(tidyverse)
library(rvest)
# Download and parse the IMDb Top 250 chart page.
page = read_html('https://www.imdb.com/chart/top/?ref_=nv_mv_250')
# Select every <li> node in the document.
# NOTE(review): this presumably also picks up list items that are not movie
# entries (menus etc.), which would have no <h3> inside — confirm against the
# page markup.
base = html_elements(page, 'li')
# Collect the text of all <h3> descendants and strip the leading "N. " rank.
base %>% html_elements('h3') %>% html_text2() %>% str_remove('^[0-9]+\\. ')
# NOTE(review): in a CSS selector a space is a descendant combinator, so this
# asks for a <cli-title-metadata> tag inside an <hrgukm> tag inside a node with
# class "sc-b0691f29-7". If these are three classes on one element, the
# selector would need to be ".sc-b0691f29-7.hrgukm.cli-title-metadata" — confirm.
base %>% html_element('.sc-b0691f29-7 hrgukm cli-title-metadata')
每当我尝试使用 `html_element` 时,我似乎只能得到 NA:最后一行就是这种情况,它本应提取电影的年份、时长和年龄分级,却只返回 NA。
倒数第二行也出现了同样的情况:该行尝试提取 `h3` 元素(在本例中即电影的标题)。如果我使用 `html_element`,得到的是一串 NA;而如果我使用 `html_elements`,就能得到想要的结果(但这种替换对最后一行不起作用)。我做错了什么?
我会使用类选择器(`.ipc-title__text`)来提取标题,然后对文本做一些后处理:
library(rvest)
# Start from the chart URL and parse the page.
'https://www.imdb.com/chart/top/?ref_=nv_mv_250' %>%
  read_html() %>%
  # Text of every node carrying the title class.
  html_elements(".ipc-title__text") %>%
  html_text() %>%
  # Keep only entries that begin with a digit (the ranked "N. Title" items).
  grep("^\\d", ., value = TRUE) %>%
  # Drop the leading "N. " rank prefix, leaving the bare title.
  sub("^\\d+\\. ", "", .)
#> [1] "The Shawshank Redemption"
#> [2] "The Godfather"
#> [3] "The Dark Knight"
#> [4] "The Godfather Part II"
#> [5] "12 Angry Men"
#> [6] "Schindler's List"
#> [7] "The Lord of the Rings: The Return of the King"
#> [8] "Pulp Fiction"
#> [9] "The Lord of the Rings: The Fellowship of the Ring"
#> [10] "The Good, the Bad and the Ugly"
#> [11] "Forrest Gump"
#> [12] "The Lord of the Rings: The Two Towers"
#> [13] "Fight Club"
#> [14] "Inception"
#> [15] "Dune: Part Two"
#> [16] "Star Wars: Episode V - The Empire Strikes Back"
#> [17] "The Matrix"
#> [18] "Goodfellas"
#> [19] "One Flew Over the Cuckoo's Nest"
#> [20] "Se7en"
#> [21] "Interstellar"
#> [22] "It's a Wonderful Life"
#> [23] "Seven Samurai"
#> [24] "The Silence of the Lambs"
#> [25] "Saving Private Ryan"
#> [26] "City of God"
#> [27] "Life Is Beautiful"
#> [28] "The Green Mile"
#> [29] "Terminator 2: Judgment Day"
#> [30] "Star Wars: Episode IV - A New Hope"
#> [31] "Back to the Future"
#> [32] "Spirited Away"
#> [33] "The Pianist"
#> [34] "Parasite"
#> [35] "Psycho"
#> [36] "Spider-Man: Across the Spider-Verse"
#> [37] "Gladiator"
#> [38] "The Lion King"
#> [39] "Léon: The Professional"
#> [40] "The Departed"
#> [41] "American History X"
#> [42] "Whiplash"
#> [43] "The Prestige"
#> [44] "Grave of the Fireflies"
#> [45] "Harakiri"
#> [46] "The Usual Suspects"
#> [47] "Casablanca"
#> [48] "The Intouchables"
#> [49] "Cinema Paradiso"
#> [50] "Modern Times"
#> [51] "Rear Window"
#> [52] "Once Upon a Time in the West"
#> [53] "Alien"
#> [54] "City Lights"
#> [55] "Apocalypse Now"
#> [56] "Django Unchained"
#> [57] "12th Fail"
#> [58] "Memento"
#> [59] "WALL·E"
#> [60] "Raiders of the Lost Ark"
#> [61] "The Lives of Others"
#> [62] "Sunset Blvd."
#> [63] "Paths of Glory"
#> [64] "Avengers: Infinity War"
#> [65] "Spider-Man: Into the Spider-Verse"
#> [66] "The Shining"
#> [67] "Witness for the Prosecution"
#> [68] "The Great Dictator"
#> [69] "Aliens"
#> [70] "Inglourious Basterds"
#> [71] "The Dark Knight Rises"
#> [72] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb"
#> [73] "American Beauty"
#> [74] "Oldboy"
#> [75] "Coco"
#> [76] "Amadeus"
#> [77] "Toy Story"
#> [78] "Das Boot"
#> [79] "Braveheart"
#> [80] "Avengers: Endgame"
#> [81] "Joker"
#> [82] "Princess Mononoke"
#> [83] "Good Will Hunting"
#> [84] "Your Name."
#> [85] "Once Upon a Time in America"
#> [86] "High and Low"
#> [87] "3 Idiots"
#> [88] "Singin' in the Rain"
#> [89] "Oppenheimer"
#> [90] "Capernaum"
#> [91] "Requiem for a Dream"
#> [92] "Come and See"
#> [93] "Toy Story 3"
#> [94] "Star Wars: Episode VI - Return of the Jedi"
#> [95] "Eternal Sunshine of the Spotless Mind"
#> [96] "The Hunt"
#> [97] "2001: A Space Odyssey"
#> [98] "Reservoir Dogs"
#> [99] "Ikiru"
#> [100] "Lawrence of Arabia"
#> [101] "The Apartment"
#> [102] "North by Northwest"
#> [103] "Citizen Kane"
#> [104] "Vertigo"
#> [105] "M"
#> [106] "Incendies"
#> [107] "Double Indemnity"
#> [108] "Scarface"
#> [109] "Amélie"
#> [110] "Full Metal Jacket"
#> [111] "A Clockwork Orange"
#> [112] "Heat"
#> [113] "Up"
#> [114] "To Kill a Mockingbird"
#> [115] "The Sting"
#> [116] "A Separation"
#> [117] "Indiana Jones and the Last Crusade"
#> [118] "Die Hard"
#> [119] "Metropolis"
#> [120] "Like Stars on Earth"
#> [121] "Hamilton"
#> [122] "Snatch"
#> [123] "L.A. Confidential"
#> [124] "Bicycle Thieves"
#> [125] "1917"
#> [126] "Taxi Driver"
#> [127] "Downfall"
#> [128] "Dangal"
#> [129] "For a Few Dollars More"
#> [130] "Batman Begins"
#> [131] "The Wolf of Wall Street"
#> [132] "Some Like It Hot"
#> [133] "The Kid"
#> [134] "Green Book"
#> [135] "The Father"
#> [136] "Judgment at Nuremberg"
#> [137] "All About Eve"
#> [138] "Top Gun: Maverick"
#> [139] "The Truman Show"
#> [140] "Shutter Island"
#> [141] "There Will Be Blood"
#> [142] "Casino"
#> [143] "Ran"
#> [144] "Jurassic Park"
#> [145] "The Sixth Sense"
#> [146] "Pan's Labyrinth"
#> [147] "Unforgiven"
#> [148] "No Country for Old Men"
#> [149] "A Beautiful Mind"
#> [150] "The Thing"
#> [151] "The Treasure of the Sierra Madre"
#> [152] "Kill Bill: Vol. 1"
#> [153] "Yojimbo"
#> [154] "Monty Python and the Holy Grail"
#> [155] "The Great Escape"
#> [156] "Finding Nemo"
#> [157] "Rashomon"
#> [158] "Prisoners"
#> [159] "Howl's Moving Castle"
#> [160] "The Elephant Man"
#> [161] "Chinatown"
#> [162] "Dial M for Murder"
#> [163] "Gone with the Wind"
#> [164] "V for Vendetta"
#> [165] "Lock, Stock and Two Smoking Barrels"
#> [166] "The Secret in Their Eyes"
#> [167] "Inside Out"
#> [168] "Raging Bull"
#> [169] "Three Billboards Outside Ebbing, Missouri"
#> [170] "Trainspotting"
#> [171] "The Bridge on the River Kwai"
#> [172] "Spider-Man: No Way Home"
#> [173] "Fargo"
#> [174] "Klaus"
#> [175] "Warrior"
#> [176] "Catch Me If You Can"
#> [177] "Godzilla Minus One"
#> [178] "Gran Torino"
#> [179] "My Neighbor Totoro"
#> [180] "Million Dollar Baby"
#> [181] "Harry Potter and the Deathly Hallows: Part 2"
#> [182] "Children of Heaven"
#> [183] "12 Years a Slave"
#> [184] "Blade Runner"
#> [185] "Before Sunrise"
#> [186] "The Grand Budapest Hotel"
#> [187] "Ben-Hur"
#> [188] "Barry Lyndon"
#> [189] "Gone Girl"
#> [190] "Hacksaw Ridge"
#> [191] "The Gold Rush"
#> [192] "Memories of Murder"
#> [193] "In the Name of the Father"
#> [194] "Dead Poets Society"
#> [195] "On the Waterfront"
#> [196] "The General"
#> [197] "The Deer Hunter"
#> [198] "Wild Tales"
#> [199] "Mad Max: Fury Road"
#> [200] "Monsters, Inc."
#> [201] "Sherlock Jr."
#> [202] "The Third Man"
#> [203] "Wild Strawberries"
#> [204] "The Wages of Fear"
#> [205] "Jaws"
#> [206] "How to Train Your Dragon"
#> [207] "Mary and Max"
#> [208] "Mr. Smith Goes to Washington"
#> [209] "Ford v Ferrari"
#> [210] "Ratatouille"
#> [211] "Room"
#> [212] "The Seventh Seal"
#> [213] "The Big Lebowski"
#> [214] "Tokyo Story"
#> [215] "Rocky"
#> [216] "Logan"
#> [217] "Spotlight"
#> [218] "Hotel Rwanda"
#> [219] "Platoon"
#> [220] "The Terminator"
#> [221] "The Passion of Joan of Arc"
#> [222] "Before Sunset"
#> [223] "La haine"
#> [224] "The Best Years of Our Lives"
#> [225] "Jai Bhim"
#> [226] "The Exorcist"
#> [227] "Pirates of the Caribbean: The Curse of the Black Pearl"
#> [228] "Rush"
#> [229] "Network"
#> [230] "Stand by Me"
#> [231] "The Wizard of Oz"
#> [232] "The Incredibles"
#> [233] "Hachi: A Dog's Tale"
#> [234] "My Father and My Son"
#> [235] "Into the Wild"
#> [236] "The Handmaiden"
#> [237] "The Sound of Music"
#> [238] "The Grapes of Wrath"
#> [239] "To Be or Not to Be"
#> [240] "The Battle of Algiers"
#> [241] "Groundhog Day"
#> [242] "Amores Perros"
#> [243] "Rebecca"
#> [244] "Cool Hand Luke"
#> [245] "The Iron Giant"
#> [246] "The Help"
#> [247] "It Happened One Night"
#> [248] "Aladdin"
#> [249] "Dances with Wolves"
#> [250] "Drishyam"
创建于 2024-03-26,使用 reprex v2.1.0