[AIRFLOW-5567] BaseReschedulePokeOperator #6210

Closed
wants to merge 24 commits into from
Changes from 4 commits
121 changes: 121 additions & 0 deletions airflow/models/base_async_operator.py
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Base Asynchronous Operator for kicking off a long running
operations and polling for completion with reschedule mode.
"""
from abc import abstractmethod
from typing import Dict, List, Optional, Union

from airflow.exceptions import AirflowException, AirflowRescheduleException
from airflow.models import TaskReschedule
from airflow.models.xcom import XCOM_EXTERNAL_RESOURCE_ID_KEY
from airflow.sensors.base_sensor_operator import BaseSensorOperator
from airflow.utils.decorators import apply_defaults

PLACEHOLDER_RESOURCE_ID = 'RESOURCE_ID_NOT_APPLICABLE'

Contributor @dstandish commented on Oct 16, 2019:
OK, I think I may have found an issue.

In live testing it appeared XCom was not recorded. In actuality, the problem is that XCom is cleared when the task restarts after a reschedule. So after the first rescheduled poke, the XCom is obliterated and the resource id is lost.

XCom data appears to be cleared at the start of each task. See here.

So when the task restarts after a reschedule, we lose the resource id.

Probably a similar explanation for the invisible-logs issue I commented on earlier.

Here's my sample operator:

class Op(BaseAsyncOperator):

    def submit_request(self, context: Dict) -> str:
        return '129uh8981h9u80eh'

    def poke(self, context):
        ti = context['ti']
        print(f"try_number: {ti.try_number}")

        for k, v in context.items():
            print(f"{k}: {v}")

        print("\n sleeping")
        import time
        time.sleep(60)

        return False

Not sure what the best way to resolve this is.

It's also curious that the state initially goes to up_for_reschedule before later becoming up_for_retry... I am not sure why that is, but I have not used sensors / rescheduling before.
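The failure mode above can be reproduced without Airflow. A minimal sketch, where `FakeTI` is a hypothetical stand-in for `TaskInstance` and its unconditional XCom clearing: state pushed on the first try does not survive the restart after a reschedule.

```python
# Minimal sketch of the failure mode: XCom is cleared at the start of
# every task run, so the resource id pushed before the first reschedule
# is gone when the task restarts. FakeTI is a hypothetical stand-in for
# Airflow's TaskInstance.

class FakeTI:
    def __init__(self):
        self.xcom = {}

    def clear_xcom_data(self):
        # TaskInstance does this unconditionally at the start of each run.
        self.xcom.clear()

    def run_once(self, first_try):
        self.clear_xcom_data()
        if first_try:
            # submit_request() pushes the resource id on the first try
            self.xcom['external_resource_id'] = '129uh8981h9u80eh'
        return self.xcom.get('external_resource_id')


ti = FakeTI()
assert ti.run_once(first_try=True) == '129uh8981h9u80eh'
# After the reschedule the task starts again and the id is lost:
assert ti.run_once(first_try=False) is None
```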

@JonnyIncognito commented on Oct 16, 2019:

XCOM data appears to be cleared at start of each task. See here.

So when task restarts after reschedule, we lose the resource id.

Ouch, that is quite a limitation for using XCom. Is there anywhere else to store task state? If not, some options:

  1. Use mode='poke', at the expense of using up a task slot for long-running tasks. Not ideal, but gets "correct" behaviour. It'd then be "atomic" rather than "async" behaviour.

  2. Enhance TaskInstance to make clearing XCom data conditional by delegating it to the task/operator. There could be a new function like BaseOperator.pre_execute_clear_state() which can be overridden by implementers. BaseOperator is already aware of / coupled with TaskInstance, so I don't think we'd be breaking separation of concerns any more than it already is?

  3. There might be enough justification to say that for rescheduled tasks (i.e. transition from State.UP_FOR_RESCHEDULE to State.RUNNING) then TaskInstance shouldn't clear XCom. The call to run() -> _check_and_change_state_before_execution() does know about this state change, but I see in the code that there are places which bypass the call to run() and go directly to _run_raw_task() (e.g. the CLI).

Seems that some form of option 2 is the least risky way to get the desired outcome.
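Option 2 might look something like the sketch below; `pre_execute_clear_state()` and the stub classes are hypothetical names, not existing Airflow API:

```python
# Sketch of option 2: TaskInstance asks the operator whether to clear
# XCom instead of clearing it unconditionally. All names are hypothetical.

class BaseOperator:
    def pre_execute_clear_state(self) -> bool:
        """True if XCom for this task should be cleared before execution.

        The default preserves current behaviour; reschedule-aware
        operators override it to keep state across reschedules.
        """
        return True


class RescheduleAwareOperator(BaseOperator):
    def pre_execute_clear_state(self) -> bool:
        return False  # keep the external resource id across reschedules


class TaskInstance:
    def __init__(self, task):
        self.task = task
        self.xcom = {'external_resource_id': 'abc123'}

    def _run_raw_task(self):
        # Delegate the clearing decision to the operator.
        if self.task.pre_execute_clear_state():
            self.xcom.clear()


inst = TaskInstance(RescheduleAwareOperator())
inst._run_raw_task()
assert inst.xcom == {'external_resource_id': 'abc123'}
```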

@JonnyIncognito commented on Oct 16, 2019:

Ref. State.UP_FOR_RESCHEDULE, this was added back in January by @seelmann as part of AIRFLOW-2747. Perhaps he has some thoughts on whether or not we can change the contract for rescheduled tasks to say that we retain XCom state? It's a fairly recent feature, so that might be okay, e.g. operators may not be depending on the current behaviour.

Contributor Author:

It seems to me we definitely want to wipe XComs at the end of a DagRun, so as not to affect a rerun of this DAG for this execution date.

Contributor Author:

What kind of consensus should we build on that kind of change for XCom in reschedule scenarios?

Contributor:

We should be careful with renaming classes. I think the async capabilities should be merged into the BaseOperator.

Reply:

I agree with trying to put the behaviour into BaseOperator. As it stands, the use-cases are:

  1. Only operate (BaseOperator <- e.g. EmrAddStepsOperator)
  2. Only sense (BaseSensorOperator <- e.g. EmrStepSensor)
  3. Do both (to be determined)
  4. Do none (BaseOperator <- e.g. DummyOperator)

Currently 1, 2 and 4 are implemented as variations of execute(); 2 is the odd one out in having a special class, as 3 would be with the current proposal.

If BaseOperator had a default execute() that runs two phases, the other behaviours can be achieved by optionally implementing either of the phases.

It's a typical problem using class hierarchies for behaviours, that it's hard to mix and match. I was actually thinking about whether we could use traits to keep the behaviours cleanly separated into their own implementations. But it's probably overkill with so few.

Reply:

Something else to consider about a potential "multi phase" BaseOperator is that we already have pre_execute() + execute() + post_execute(), and the order is governed by TaskInstance._run_raw_task() (link). My understanding is that these hooks are always called, even when tasks are rescheduled. I'm not entirely sure of their purpose; the docs simply say "... it’s mostly a hook for people deriving operators", and the only two places I can see in the source code where they're used are: CloudSqlQueryOperator, to ensure the db connection is created and torn down; and Lineage support, to ensure the lists of inlets and outlets are built and saved to XCom (another kind of state retention!).

When considering extra phases, they'd be orthogonal to above and really some form of "execution phase N": the BaseOperator starts to behave a bit like a SubDag where the overall task status / state machine runs its course while what would have been sub-tasks in the SubDag now run as sequential execution phases in the same task. Instead of the sub-tasks being visible in the UI as DAG nodes, these execution phases are internalised into the BaseOperator.

Why would we want to do that? The justification is that these execution phases are tightly coupled and should be retried together (and in order) as some kind of atomic unit.

If we were to generalise this much, I think we should support being able to run through the execution phases synchronously (taking up a task slot) or asynchronously (using reschedule).

Sorry this is getting a bit philosophical but I'm trying to provide some guiding context about why we're doing this! I also see it as a potential future foundation for a mechanism to compose operators via the Airflow API.

@JonnyIncognito commented on Oct 25, 2019:

Back to the practical, can I propose:

  • New BaseOperator hooks are implemented - something like execute_phase_start() and execute_phase_final() - that by default simply raise NotImplementedError() as per current execute() implementation

  • New BaseOperator default execute() implementation that calls the two hooks in order. If the task was rescheduled, skip the first phase. I've seen code that detects this case in BaseSensorOperator.execute() (link).

  • Existing overrides of BaseOperator.execute() should then behave as-is, i.e. the phase hooks are simply never called. The current BaseSensorOperator implementation would continue to work, minimising the amount of work needed in this PR; it could later be reimplemented to use the new phased execution.

  • Any new operators implemented to combine an operation + sensor can override the new phase hooks, since they'd be multi-phase aware.

Allowing only two phases simplifies the implementation for now given the immediate requirements, but could even be extended in future for any number of phases if we wanted to store the current/next phase as part of the task instance state. I can't think of a good practical need right now.

What do you guys think, is this proposal (or something like it) workable?
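A minimal sketch of this proposal, using the hypothetical hook names above, with plain-Python stubs standing in for the real Airflow classes:

```python
# Sketch of the two-hook proposal: execute() runs both phases in order
# and skips the first phase when the task instance has already been
# rescheduled, mirroring the detection in BaseSensorOperator.execute().

class TaskReschedule:
    records = []  # stub for the task_reschedule table

    @staticmethod
    def find_for_task_instance(ti):
        return TaskReschedule.records


class PhasedOperator:
    def execute_phase_start(self, context):
        raise NotImplementedError()

    def execute_phase_final(self, context):
        raise NotImplementedError()

    def execute(self, context):
        if not TaskReschedule.find_for_task_instance(context['ti']):
            self.execute_phase_start(context)
        return self.execute_phase_final(context)


class SubmitThenWait(PhasedOperator):
    def __init__(self):
        self.calls = []

    def execute_phase_start(self, context):
        self.calls.append('start')

    def execute_phase_final(self, context):
        self.calls.append('final')


op = SubmitThenWait()
op.execute({'ti': object()})         # first run: both phases
TaskReschedule.records.append('r1')  # simulate a reschedule
op.execute({'ti': object()})         # after reschedule: final phase only
assert op.calls == ['start', 'final', 'final']
```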

Contributor:

I can't see why we need another execute_phase_{start,final}. The pre_execute() hook can be overridden by a custom implementation, but in practice this hasn't been done much.

So TaskInstance._run_raw_task() is what's being executed on the worker, and it will just run the whole task to the end.

The justification is that these execution phases are tightly coupled and should be retried together (and in order) as some kind of atomic unit.

I really agree with that. We've seen situations where a DAG would start by creating a cluster, but this breaks the atomicity of the DAG. When the cluster is killed (maybe because of an idle timeout), the only way to retry the workflow is by restarting the whole DAG.

It is still not clear to me what the hierarchy will look like. I think we should move most of the reschedule logic into the BaseOperator. Maybe we can enrich the context which is being passed to the execute method with reschedule logic, to let the implementation of the execute step know that you're rescheduling.
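Enriching the context as suggested might look like this sketch; the `rescheduling` key and helper are hypothetical names:

```python
# Sketch: pass a reschedule flag through the template context so that an
# execute() implementation knows it is resuming after a reschedule.
# The 'rescheduling' key is a hypothetical name.

def build_context(ti, task_reschedules):
    return {'ti': ti, 'rescheduling': bool(task_reschedules)}


class MyOperator:
    def execute(self, context):
        if context['rescheduling']:
            return 'resume polling'
        return 'submit request'


op = MyOperator()
assert op.execute(build_context(ti=None, task_reschedules=[])) == 'submit request'
assert op.execute(build_context(ti=None, task_reschedules=['r1'])) == 'resume polling'
```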

class BaseAsyncOperator(BaseSensorOperator, SkipMixin):


Contributor Author:

I had a similar thought process. Where I ended up was that the implementation of the common parent would be nearly identical to BaseSensorOperator. The logic being that BaseAsyncOperator just implements a few more methods (e.g. [submit/process]_request) and is opinionated that mode=reschedule.
The main reason to subclass vs. enhancing the existing Sensor is that I wanted to use the same execute method but have a class nomenclature indicating that subclasses of this are intended to take action. I think people are less likely to understand that a Sensor can take action rather than being a read-only poller.

The spirit of this class is to provide an improved way to do what we'd traditionally do with Operators, so I wanted a class with the name Operator. Perhaps this is short-sighted.

I've refactored to do the submit / process in the execute method of BaseSensorOperator.

@JonnyIncognito commented on Oct 16, 2019:

Ref. the name, I agree that it's important for people to consider this as something that takes action and that's implied by the Operator suffix. That's why I consider BaseSensorOperator in its current form to have a misleading name: it implies that it's both for taking action and sensing. Luckily the actual derived sensor implementations are all suffixed with Sensor - e.g. EmrStepSensor - which doesn't mislead.

I feel like having "Async" in your new class highlights the wrong thing. It's not that it is async in the sense of non-synchronous or rescheduled (Airflow terminology) that makes this class unique, since all sensors derived from BaseSensorOperator can operate as async/rescheduled: it implies not occupying a task slot. What differentiates your class is that it combines action and sensing. That's why I tend to think that the already used BaseSensorOperator would actually be a great name to describe what your new class does; so why not enhance the existing class to optionally do the action part?

There might be technical reasons to have a separate class - e.g. code is easier to read or to reduce risk by extending rather than modifying (SOLID) - but I can't think of a better alternative to "Async": BaseDoThenWaitOperator (hah); BaseActionSensor; maybe BaseAtomicOperator; eventually back to BaseSensorOperator.

Contributor:

Is there a need to reference SkipMixin, given that BaseSensorOperator already extends that class?

"""
AsyncOperators are derived from this class and inherit these attributes.

AsyncOperators must define a `submit_request` to fire a request for a
long running operation with a method and then executes a `poke` method
executing at a time interval and succeed when a criteria is met and fail
if and when they time out. They are effctively an opinionated way use
combine an Operator and a Sensor in order to kick off a long running
process without blocking a worker slot while waiting for the long running
process to complete by leveraging reschedule mode.

:param soft_fail: Set to true to mark the task as SKIPPED on failure
:type soft_fail: bool
:param poke_interval: Time in seconds that the job should wait in
between each tries
:type poke_interval: int
:param timeout: Time, in seconds before the task times out and fails.
:type timeout: int
"""
    ui_color = '#9933ff'  # type: str

    @apply_defaults
    def __init__(self,
                 *args,
                 **kwargs) -> None:
        super().__init__(*args, mode='reschedule', **kwargs)

    @abstractmethod
Contributor:
Should we extend ABC in this class? Maybe there is a reason we don't do this?

    def submit_request(self, context) -> Optional[Union[str, List, Dict]]:
        """
        This method should kick off a long-running operation and return the
        ID for that operation, if applicable.
        Context is the same dictionary used as when rendering jinja templates.

        Refer to get_template_context for more context.

        :returns: a resource_id for the long-running operation.
        :rtype: str
        """
        raise AirflowException(
            'Async Operators must override the `submit_request` method.')

    def process_result(self, context):
        """
        This method can optionally be overridden to process the result of a
        long-running operation.
        Context is the same dictionary used as when rendering jinja templates.

        Refer to get_template_context for more context.
        """
        self.log.info('Using default process_result. Got result of %s. Done.',
                      self.get_external_resource_id(context))

    def execute(self, context):
        # On the first execute call submit_request and set the
        # external resource id.
        task_reschedules = TaskReschedule.find_for_task_instance(context['ti'])
        if not task_reschedules:
            resource_id = self.submit_request(context)
            if not resource_id:
                resource_id = PLACEHOLDER_RESOURCE_ID
            self.set_external_resource_id(context, resource_id)

        super().execute(context)

        # TODO(jaketf) validate comment below w/ tests.
        # The above should raise AirflowRescheduleException if we are
        # rescheduling a poke, and thus never reach this code below.
        try:
            resource_id = self.get_external_resource_id(context)
            if resource_id == PLACEHOLDER_RESOURCE_ID:
                self.log.info("Calling process_result.")
            else:
                self.log.info("Calling process_result for %s.", resource_id)
            self.process_result(context)
        finally:
            # Clear the resource id for this task.
            self.set_external_resource_id(context, None)

    @staticmethod
    def set_external_resource_id(context, value):
        return context['ti'].xcom_push(key=XCOM_EXTERNAL_RESOURCE_ID_KEY,
                                       value=value)

    @staticmethod
    def get_external_resource_id(context):
        return context['ti'].xcom_pull(task_ids=context['task'].task_id,
                                       key=XCOM_EXTERNAL_RESOURCE_ID_KEY)
1 change: 1 addition & 0 deletions airflow/models/xcom.py
@@ -36,6 +36,7 @@
# https://github.com/apache/airflow/pull/1618#discussion_r68249677
MAX_XCOM_SIZE = 49344
XCOM_RETURN_KEY = 'return_value'
XCOM_EXTERNAL_RESOURCE_ID_KEY = 'external_resource_id'


class XCom(Base, LoggingMixin):
6 changes: 5 additions & 1 deletion airflow/sensors/base_sensor_operator.py
@@ -24,7 +24,8 @@

from airflow.exceptions import AirflowException, AirflowSensorTimeout, \
    AirflowSkipException, AirflowRescheduleException
from airflow.models import BaseOperator, SkipMixin, TaskReschedule
from airflow.models import BaseOperator, SkipMixin, TaskReschedule, \
    BaseAsyncOperator
from airflow.utils import timezone
from airflow.utils.decorators import apply_defaults
from airflow.ti_deps.deps.ready_to_reschedule import ReadyToRescheduleDep
@@ -121,8 +122,11 @@ def execute(self, context: Dict) -> None:
                raise AirflowRescheduleException(reschedule_date)
            else:
                sleep(self.poke_interval)

        self.log.info("Success criteria met. Exiting.")
    def _do_skip_downstream_tasks(self, context: Dict) -> None:
        downstream_tasks = context['task'].get_flat_relatives(upstream=False)
        self.log.debug("Downstream task_ids %s", downstream_tasks)