import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_recommenders as tfrs
from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
movies.head()
credits.head()
movies.info()
credits = credits.rename(columns={"movie_id": "id"})
df = movies.merge(credits, on='id')
def get_text(text, obj='name'):
text = literal_eval(text)
if len(text) == 1:
for i in text:
return i[obj]
else:
s = []
for i in text:
s.append(i[obj])
return ', '.join(s)
df['genres'] = df['genres'].apply(get_text)
df['production_companies'] = df['production_companies'].apply(get_text)
df['production_countries'] = df['production_countries'].apply(get_text)
df['crew'] = df['crew'].apply(get_text)
df['spoken_languages'] = df['spoken_languages'].apply(get_text)
df['keywords'] = df['keywords'].apply(get_text)
# New columns
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)
df.drop('cast', axis=1, inplace=True)
df = df[~df['original_title'].duplicated()]
df = df.reset_index(drop=True)
R = df['vote_average']
v = df['vote_count']
m = df['vote_count'].quantile(0.8)
C = df['vote_average'].mean()
df['weighted_average'] = (v/(v+m) * R) + (m/(m+v) * C)
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[['popularity', 'weighted_average']])
weighted_df = pd.DataFrame(scaled, columns=['popularity', 'weighted_average'])
weighted_df.index = df['original_title']
weighted_df['score'] = weighted_df['weighted_average']*0.5 + weighted_df['popularity'].astype('float64')*0.5
weighted_df_sorted = weighted_df.sort_values(by='score', ascending=False)
weighted_df_sorted.head(10)
hybrid_df = df[['original_title', 'genres', 'overview', 'production_companies', 'tagline', 'keywords', 'crew', 'characters', 'actors']]
def remove_punc(text):
translator = str.maketrans('', '', string.punctuation + string.digits)
clean_text = str(text).translate(translator)
return clean_text
hybrid_df['genres'] = hybrid_df['genres'].apply(remove_punc)
hybrid_df['overview'] = hybrid_df['overview'].apply(remove_punc)
hybrid_df['production_companies'] = hybrid_df['production_companies'].apply(remove_punc)
hybrid_df['tagline'] = hybrid_df['tagline'].apply(remove_punc)
hybrid_df['keywords'] = hybrid_df['keywords'].apply(remove_punc)
hybrid_df['crew'] = hybrid_df['crew'].apply(remove_punc)
hybrid_df['characters'] = hybrid_df['characters'].apply(remove_punc)
hybrid_df['actors'] = hybrid_df['actors'].apply(remove_punc)
hybrid_df['bag_of_words'] = ''
hybrid_df['bag_of_words'] = hybrid_df[hybrid_df.columns[1:]].apply(lambda x: ' '.join(x), axis=1)
hybrid_df.set_index('original_title', inplace=True)
hybrid_df = hybrid_df[['bag_of_words']]
hybrid_df.head()
tfidf = TfidfVectorizer(stop_words='english', min_df=5)
tfidf_matrix = tfidf.fit_transform(hybrid_df['bag_of_words'])
tfidf_matrix.shape
cos_sim = cosine_similarity(tfidf_matrix)
cos_sim.shape
def predict(title, similarity_weight=0.7, top_n=10):
data = hybrid_df.reset_index()
index_movie = data[data['original_title'] == title].index
similarity = cos_sim[index_movie].T
sim_df = pd.DataFrame(similarity, columns=['similarity'])
final_df = pd.concat([data, sim_df], axis=1)
# You can also play around with the number
final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight
print(final_df.columns)
final_df_sorted = final_df.sort_values(by='final_score', ascending=False).head(top_n)
final_df_sorted.set_index('original_title', inplace=True)
return final_df_sorted[['score', 'similarity', 'final_score']]
print(cos_sim.shape)
predict('Toy Story', similarity_weight=0.7, top_n=10)
我不明白为什么我会收到此错误任何解释都会有所帮助:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3628 try:
-> 3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
5 frames
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'score'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.9/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
3629 return self._engine.get_loc(casted_key)
3630 except KeyError as err:
-> 3631 raise KeyError(key) from err
3632 except TypeError:
3633 # If we have a listlike key, _check_indexing_error will raise
KeyError: 'score'