我编写了一个脚本,它从数据表中获取嵌入、ID 和元数据,并将其更新插入到松果索引中。我已经多次确认我的数据结构是正确的。我传递一个元组列表,我的嵌入向量是一个列表而不是数组。然而,无论我尝试什么,在尝试更新插入数据时都会遇到错误。
实际错误 pinecone_package.indexing.index - 错误 - 将向量更新插入索引时出错:tunl-vision,错误:“NoneType”类型的对象没有 len()
我处理数据框中数据的函数
def process_batch(df, model, processor, tokenizer, indexer, s3_client, bucket_name):
"""
Process a batch of images: generate embeddings, upsert to Pinecone, and upload to S3.
"""
try:
# Check if image URLs are valid
df['is_valid'] = df['image'].apply(check_valid_urls)
valid_df = df[df['is_valid']]
# Get embeddings
valid_df['image_embeddings'] = valid_df['image'].apply(lambda url: get_single_image_embedding(get_image(url), processor, model))
valid_df['text_embeddings'] = valid_df['description'].apply(lambda text: get_single_text_embedding(text, tokenizer, model))
# Convert embeddings to lists
for col in ['image_embeddings', 'text_embeddings']:
valid_df[col] = valid_df[col].apply(lambda x: x[0].tolist() if isinstance(x, np.ndarray) and x.ndim > 1 else x.tolist())
# Upsert to Pinecone
item_ids = valid_df['id'].tolist()
vectors = valid_df['image_embeddings'].tolist()
metadata = valid_df.drop(columns=['id', 'is_valid', 'image_embeddings', 'text_embeddings', 'size']).to_dict(orient='records')
data_to_upsert = list(zip(item_ids, vectors, metadata))
indexer.upsert_vectors(data_to_upsert)
# Preprocess images and upload to S3
for url in valid_df['image']:
preprocess_and_upload_image(s3_client, bucket_name, url)
logging.info("Successfully processed batch.")
except Exception as e:
logging.error(f"Error processing batch: {str(e)}")
我的实际 upsert 函数(除了在调用时初始化 pinecone 的类之外)
def upsert_vectors(self, data: List[Tuple[str, List[float], Dict]]) -> None:
"""
Upsert vectors to the Pinecone index.
Parameters
----------
data : List[Tuple[str, List[float], Dict]]
List of tuples, each containing an item ID, a vector, and a dictionary of metadata.
Raises
------
Exception
If there is an error in upserting the vectors.
"""
try:
# Print the first 5 data points
self.logger.info(f'First 5 data points: {data[:5]}')
# Check if data is a list of tuples
if not all(isinstance(i, tuple) and len(i) == 3 for i in data):
self.logger.error(f'Data is not in correct format: {data}')
return
# Check if all IDs, vectors, and metadata are non-empty
for item_id, vector, meta in data:
if not item_id or not vector or not meta:
self.logger.error(f'Found empty or None data: ID={item_id}, Vector={vector}, Meta={meta}')
return
upsert_result = self.index.upsert(vectors=data)
self.logger.info(
f'Successfully upserted {len(upsert_result.upserted_ids)} vectors to index: {self.index_name}')
except Exception as e:
self.logger.error(f'Error upserting vectors to index: {self.index_name}, Error: {str(e)}')
更新插入之前的数据片段确认我的结构是正确的。
[('62a0be4d5ce2f83f3931a452-00664382372027', [0.2172567993402481, 0.05793587118387222, 0.1606423407793045, 0.303006321191第7877章,……],,{'图片':'https://athleta.gap.com/webcontent/0014/560/754/cn14560754。 jpg', '商家': '62a0b5535ce2f83f392ec994', '品牌': ''...})