我在 Excel 中有一份包含多个 BigQuery 表的列表,想用 Python 读取该 Excel 并检查这些表是否都存在于 BigQuery 中

问题描述 投票:0回答:1

我目前正在学习 Python。我在 Excel 中列出了多个 BigQuery 表,想读取该 Excel,检查其中所有的表是否都存在于 BigQuery 中,并将检查结果写回 Excel。有人能指点一下吗?我不知道该如何循环遍历多行和多列。


import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from google.cloud.exceptions import NotFound
import numpy


# Authenticate once with the service-account key and reuse a single client
# for every lookup instead of building a new one per row.
credentials = service_account.Credentials.from_service_account_file('file path.json')

project_id = 'creating-data-413804'
client = bigquery.Client(credentials=credentials, project=project_id)


def if_tbl_exists(client, table_ref):
    """Return True if ``table_ref`` can be fetched, False if BigQuery raises NotFound.

    Args:
        client: BigQuery client used for the lookup.
        table_ref: Reference to the table to look up.
    """
    try:
        client.get_table(table_ref)
    except NotFound:
        return False
    return True


# Reading the excel file
# NOTE(review): the sheet is read from 'file path' but the workbook below is
# opened from 'file path.xlsx' -- presumably the same file; confirm.
data = pd.read_excel('file path', usecols=('Dataset','Source Schema', 'Source Table Name', 'Target   BQ Project ID', 'table exist'))

book = openpyxl.load_workbook('file path.xlsx')
sheet = book['Sheet1']

# iterrows() yields (index, row) pairs, so each field is read from the row
# itself. Sheet rows are counted from 2 because pandas consumes the first
# spreadsheet row as the header (read_excel default).
for sheet_row, (_, row) in enumerate(data.iterrows(), start=2):
    dataset_nm = row['Dataset']
    table_nm = row['Source Table Name']

    # Blank spreadsheet rows come back as NaN; there is nothing to look up.
    if pd.isna(dataset_nm) or pd.isna(table_nm):
        continue

    table_ref = bigquery.DatasetReference(project_id, dataset_nm).table(table_nm)
    exists = if_tbl_exists(client, table_ref)
    print(sheet_row, table_nm, exists)

    # Column 5 is kept from the original script.
    # NOTE(review): assumed to be the 'table exist' column -- confirm.
    sheet.cell(row=sheet_row, column=5).value = (
        'table exist' if exists else 'table doesnt exist'
    )

book.save('file path.xlsx')
python pandas excel google-bigquery openpyxl
1个回答
0
投票

如果工作表的行数较多,逐行用 for 循环检查会使整个过程变慢。我建议先取出工作表中不重复的数据集,再获取每个数据集中的表列表,然后将其与工作表中的数据做连接(左连接/外连接),没有匹配到任何连接数据的那些表,就是不存在的表。

import pandas as pd
from google.cloud import bigquery
from google.oauth2 import service_account
import openpyxl

# Function Definitions
def get_tables_in_dataset(client, project_id, dataset_id):
    """Return the fully qualified IDs of every table in one BigQuery dataset.

    Args:
        client: Object exposing ``list_tables(dataset)``; in this script a
            ``bigquery.Client``.
        project_id: Project that owns the dataset.
        dataset_id: Dataset whose tables are listed.

    Returns:
        A list of ``"project_id.dataset_id.table_id"`` strings. The list is
        empty when the dataset holds no tables or cannot be listed, so every
        element names a real table and no placeholder entry can be mistaken
        for one by code that compares the result against other table names.
    """
    # A "project.dataset" string is passed as the dataset identifier, which
    # avoids the deprecated ``client.dataset()`` helper.
    dataset_path = f"{project_id}.{dataset_id}"
    try:
        # The listing is consumed inside the ``try`` so that errors raised
        # while paging through results are handled here as well.
        return [
            f"{dataset_path}.{table.table_id}"
            for table in client.list_tables(dataset_path)
        ]
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on
        # with the remaining datasets instead of aborting the whole run.
        print(f"Error accessing dataset {dataset_id} in project {project_id}: {e}")
        return []

def create_full_qualified_name(row):
    """Build the dotted "project.dataset.table" name for one spreadsheet row.

    ``row`` is any mapping-like object (e.g. a pandas Series) indexed by the
    column names 'Target BQ Project ID', 'Dataset' and 'Source Table Name'.
    """
    project = row['Target BQ Project ID']
    dataset = row['Dataset']
    table = row['Source Table Name']
    return "{}.{}.{}".format(project, dataset, table)

# Settings -- replace the placeholders with real locations before running
credentials_path = 'path/to/your/credentials.json'
excel_file_path = 'path/to/your/excel.xlsx'
sheet_name = 'YourSheetName'  # Update with your actual sheet name

# BigQuery client authenticated with the service-account key file
credentials = service_account.Credentials.from_service_account_file(credentials_path)
client = bigquery.Client(credentials=credentials)

# 1. Load the sheet and derive one "project.dataset.table" key per row
df1 = pd.read_excel(excel_file_path, sheet_name=sheet_name)
df1['Qualified Name'] = df1.apply(create_full_qualified_name, axis=1)

# 2. Reduce the keys to their unique "project.dataset" prefixes
distinct_datasets = (
    df1['Qualified Name']
    .map(lambda name: '.'.join(name.split('.')[:2]))
    .unique()
)

# 3. Ask BigQuery for the tables of each of those datasets
all_tables = []
for project_dataset in distinct_datasets:
    project_id, dataset_id = project_dataset.split('.')
    all_tables += get_tables_in_dataset(client, project_id, dataset_id)

df2 = pd.DataFrame(all_tables, columns=['Qualified Name'])

# 4. Outer-join both sides; the '_merge' indicator column records which side
#    each qualified name came from
merged_df = df1.merge(df2, on='Qualified Name', how='outer', indicator=True)
merge_side = merged_df['_merge']

# 5. Names found only in the spreadsheet
tables_in_df1_not_in_bq = merged_df['Qualified Name'][merge_side == 'left_only']

# 6. Names found only in BigQuery
tables_in_bq_not_in_df1 = merged_df['Qualified Name'][merge_side == 'right_only']

# Report both lists
print("Tables in Excel not in BigQuery:")
print(tables_in_df1_not_in_bq.tolist())

print("\nTables in BigQuery not in Excel:")
print(tables_in_bq_not_in_df1.tolist())

© www.soinside.com 2019 - 2024. All rights reserved.