当我的 PySpark 代码或引导(bootstrap)程序失败时,我需要收到 Slack 通知,并在通知中给出 EMR 失败的确切原因。
我使用的是 Airflow 2.0,并通过 on_failure_callback=task_fail_slack_alert 来通知 Slack。但通知中没有给出失败的确切原因。我的参数正确吗?
# Task 'create_job_flow': configured from JOB_FLOW_OVERRIDES. Its XCom
# return value is pulled by the tasks below as the job flow id.
cluster_creator = EmrCreateJobFlowOperator(
    task_id='create_job_flow',
    job_flow_overrides=JOB_FLOW_OVERRIDES,
)

# Task 'add_steps': submits SPARK_STEPS to the job flow whose id is pulled
# from 'create_job_flow'; failures are routed to the Slack callback.
step_adder = EmrAddStepsOperator(
    task_id='add_steps',
    job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    steps=SPARK_STEPS,
    on_failure_callback=task_fail_slack_alert,
)

# Task 'watch_step': watches the first step id returned by 'add_steps';
# failures are routed to the Slack callback.
step_checker = EmrStepSensor(
    task_id='watch_step',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{ task_instance.xcom_pull(task_ids='add_steps', key='return_value')[0] }}",
    aws_conn_id='aws_default',
    on_failure_callback=task_fail_slack_alert,
)

# Task 'remove_cluster': targets the same job flow id. This is the only task
# here that passes dag=dag explicitly.
cluster_remover = EmrTerminateJobFlowOperator(
    task_id='remove_cluster',
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    aws_conn_id='aws_default',
    dag=dag,
)
我收到失败通知,但没有收到 EMR 失败的确切原因。我怎样才能得到它?
回调中的 exception=context.get('exception') 这一表达式会取出导致任务失败的异常,从而给出 EMR 失败的确切原因。
使用 Slack 的 on_failure_callback 示例:
# Example: sensor task 'watch_step' that reports failures to Slack through
# on_failure_callback.
# NOTE: task_fail_slack_alert must already be defined (or imported) by the
# time this statement runs.
step_checker = EmrStepSensor(
    task_id='watch_step',
    # Kept on one line: a double-quoted string literal cannot span lines.
    job_flow_id="{{ task_instance.xcom_pull('create_job_flow', key='return_value') }}",
    step_id="{{task_instance.xcom_pull(task_ids='add_steps',key='return_value')[0] }}",
    aws_conn_id='aws_default',
    on_failure_callback=task_fail_slack_alert,
)
def task_fail_slack_alert(context):
    """Post a Slack message describing a failed task.

    Written to be passed as ``on_failure_callback`` to a task.

    Args:
        context: Task context mapping. The keys ``task_instance``,
            ``execution_date`` and ``exception`` are read from it.

    Returns:
        The result of ``SlackWebhookOperator.execute``.
    """
    SLACK_CONN_ID = 'slack'
    # The webhook token is stored as the password of the Slack connection.
    slack_webhook_token = BaseHook.get_connection(SLACK_CONN_ID).password

    # Look the task instance up once instead of once per field.
    task_instance = context.get('task_instance')

    # The message lines stay at column 0 so the text sent to Slack carries
    # no leading indentation.
    slack_msg = """
:red_circle: Task Failed.
*Task*: {task}
*Dag*: {dag}
*Execution Time*: {exec_date}
*Log Url*: {log_url}
*Error*:{exception}
""".format(
        task=task_instance.task_id,
        dag=task_instance.dag_id,
        exec_date=context.get('execution_date'),
        log_url=task_instance.log_url,
        # The exception that failed the task; its text carries the reason.
        exception=context.get('exception'),
    )

    # The operator is executed directly, so it is not attached to a DAG:
    # passing dag=dag would depend on a module-level global and register an
    # extra 'slack_test' task on the DAG each time the callback fires.
    failed_alert = SlackWebhookOperator(
        task_id='slack_test',
        http_conn_id=SLACK_CONN_ID,
        webhook_token=slack_webhook_token,
        message=slack_msg,
        username='airflow',
    )
    return failed_alert.execute(context=context)