我目前正在学习 Python。我在一个 Excel 文件中列出了多个 BigQuery 表,想读取这个 Excel 文件,逐行检查其中列出的表是否都存在于 BigQuery 中,并把检查结果写回 Excel。我遇到的问题是无法循环遍历多行和多列,请问有人可以指点一下吗?
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from google.cloud.exceptions import NotFound
import numpy
credentials = service_account.Credentials.from_service_account_file('file path.json')
project_id = 'creating-data-413804'
# A single client is reused for every lookup instead of being rebuilt per row.
client = bigquery.Client(credentials=credentials, project=project_id)


def if_tbl_exists(client, table_ref):
    """Return True if ``client.get_table(table_ref)`` succeeds, False on NotFound.

    Args:
        client: Object exposing ``get_table`` (here a ``bigquery.Client``).
        table_ref: Table reference handed straight to ``client.get_table``.

    Returns:
        bool: True when the lookup succeeds, False when it raises ``NotFound``.
        Any other exception propagates to the caller.
    """
    try:
        client.get_table(table_ref)
    except NotFound:
        return False
    return True


# Reading the excel file
data = pd.read_excel(
    'file path',
    usecols=('Dataset', 'Source Schema', 'Source Table Name', 'Target BQ Project ID', 'table exist'),
)

# Open the workbook once, update every row, then save once at the end.
book = openpyxl.load_workbook('file path.xlsx')
sheet = book['Sheet1']

# iterrows() yields (index, row) pairs; the row Series holds that line's cells,
# so values are read from `row` rather than by indexing `data` with the pair.
for idx, row in data.iterrows():
    dataset_nm = row['Dataset']
    table_nm = row['Source Table Name']
    # As in the original, no project is passed to client.dataset().
    table_ref = client.dataset(dataset_nm).table(table_nm)

    exists = if_tbl_exists(client, table_ref)
    print(idx, exists)

    # NOTE(review): assumes the default 0-based index from read_excel and a
    # header in worksheet row 1, so DataFrame row `idx` is worksheet row
    # `idx + 2`. Column 5 is kept from the original code -- confirm it is the
    # 'table exist' column in the actual sheet.
    sheet.cell(row=idx + 2, column=5).value = (
        'table exist' if exists else 'table doesnt exist'
    )

book.save('file path.xlsx')
如果您的工作表行数较多,我认为逐行 for 循环会使过程变慢。我建议先找出工作表中不同的数据集,并获取每个数据集中的表列表,然后将该列表与工作表中的数据做连接(左连接/外连接),这样就能得到没有匹配到任何连接数据的表的列表。
import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import openpyxl
# Function Definitions
def get_tables_in_dataset(client, project_id, dataset_id):
    """Fetch tables in a given dataset as "projectId.datasetId.tableId" strings.

    Args:
        client: Object exposing ``dataset(dataset_id, project=...)`` and
            ``list_tables(dataset_ref)`` (here a ``bigquery.Client``).
        project_id: Project part of the qualified names.
        dataset_id: Dataset part of the qualified names.

    Returns:
        list[str]: One qualified name per listed table. If the dataset lists
        no tables, or listing raises, the list holds the single placeholder
        "projectId.datasetId.no tables".
    """
    prefix = f"{project_id}.{dataset_id}"
    # Placeholder entry used for both the "empty dataset" and "error" cases.
    placeholder = f"{prefix}.no tables"
    try:
        dataset_ref = client.dataset(dataset_id, project=project_id)
        names = [
            f"{prefix}.{table.table_id}"
            for table in client.list_tables(dataset_ref)
        ]
    except Exception as e:
        # Best effort: report the failure and carry on with the other datasets.
        print(f"Error accessing dataset {dataset_id} in project {project_id}: {e}")
        return [placeholder]
    return names or [placeholder]
def create_full_qualified_name(row):
    """Concatenate project, dataset, and table names.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            'Target BQ Project ID', 'Dataset' and 'Source Table Name'.

    Returns:
        str: The three values joined with dots, in that order.
    """
    return f"{row['Target BQ Project ID']}.{row['Dataset']}.{row['Source Table Name']}"
# Configuration
credentials_path = 'path/to/your/credentials.json'
excel_file_path = 'path/to/your/excel.xlsx'
sheet_name = 'YourSheetName'  # Update with your actual sheet name

# Setup BigQuery Client
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(credentials=credentials)

# 1. Read Excel file and build the "project.dataset.table" join key per row
df1 = pd.read_excel(excel_file_path, sheet_name=sheet_name)
df1['Qualified Name'] = df1.apply(create_full_qualified_name, axis=1)

# 2. Get distinct projectId.dataset (first two dot-separated parts of the key)
distinct_datasets = df1['Qualified Name'].apply(lambda x: '.'.join(x.split('.')[:2])).unique()

# 3. Get a list of all tables from BigQuery that are in the list of datasets
all_tables = []
for dataset in distinct_datasets:
    # Each entry is "project.dataset"; listing happens once per dataset
    # rather than once per spreadsheet row.
    project_id, dataset_id = dataset.split('.')
    tables_in_dataset = get_tables_in_dataset(client, project_id, dataset_id)
    all_tables.extend(tables_in_dataset)
df2 = pd.DataFrame(all_tables, columns=['Qualified Name'])

# 4. Join df1 and df2; indicator=True adds a '_merge' column telling which
#    side(s) each qualified name came from
merged_df = pd.merge(df1, df2, on='Qualified Name', how='outer', indicator=True)

# 5. List of tables in df1 not in BigQuery
tables_in_df1_not_in_bq = merged_df.loc[merged_df['_merge'] == 'left_only', 'Qualified Name']

# 6. List of tables in BigQuery not in the Excel sheet
tables_in_bq_not_in_df1 = merged_df.loc[merged_df['_merge'] == 'right_only', 'Qualified Name']

# Output the lists
print("Tables in Excel not in BigQuery:")
print(tables_in_df1_not_in_bq.tolist())
print("\nTables in BigQuery not in Excel:")
print(tables_in_bq_not_in_df1.tolist())