@@ -499,6 +499,7 @@ def process_dag(self, dag, executor):
499
499
skip_tis = {(ti [0 ], ti [1 ]) for ti in qry .all ()}
500
500
501
501
descartes = [obj for obj in product (dag .tasks , active_runs )]
502
+ could_not_run = set ()
502
503
self .logger .info ('Checking dependencies on {} tasks instances, minus {} '
503
504
'skippable ones' .format (len (descartes ), len (skip_tis )))
504
505
for task , dttm in descartes :
@@ -513,6 +514,23 @@ def process_dag(self, dag, executor):
513
514
elif ti .is_runnable (flag_upstream_failed = True ):
514
515
self .logger .debug ('Firing task: {}' .format (ti ))
515
516
executor .queue_task_instance (ti , pickle_id = pickle_id )
517
+ else :
518
+ could_not_run .add (ti )
519
+
520
+ # this type of deadlock happens when dagruns can't even start and so
521
+ # the TIs haven't been persisted to the database.
522
+ if len (could_not_run ) == len (descartes ):
523
+ self .logger .error (
524
+ 'Dag runs are deadlocked for DAG: {}' .format (dag .dag_id ))
525
+ (session
526
+ .query (models .DagRun )
527
+ .filter (
528
+ models .DagRun .dag_id == dag .dag_id ,
529
+ models .DagRun .state == State .RUNNING ,
530
+ models .DagRun .execution_date .in_ (active_runs ))
531
+ .update (
532
+ {models .DagRun .state : State .FAILED },
533
+ synchronize_session = 'fetch' ))
516
534
517
535
# Releasing the lock
518
536
self .logger .debug ("Unlocking DAG (scheduler_lock)" )
@@ -553,8 +571,6 @@ def process_events(self, executor, dagbag):
553
571
# collect queued tasks for prioritization
554
572
if ti .state == State .QUEUED :
555
573
self .queued_tis .add (ti )
556
- elif ti in self .queued_tis :
557
- self .queued_tis .remove (ti )
558
574
else :
559
575
# special instructions for failed executions could go here
560
576
pass
@@ -583,6 +599,8 @@ def prioritize_queued(self, session, executor, dagbag):
583
599
else :
584
600
d [ti .pool ].append (ti )
585
601
602
+ self .queued_tis .clear ()
603
+
586
604
dag_blacklist = set (dagbag .paused_dags ())
587
605
for pool , tis in list (d .items ()):
588
606
if not pool :
@@ -781,11 +799,12 @@ def _execute(self):
781
799
782
800
# Build a list of all instances to run
783
801
tasks_to_run = {}
784
- failed = []
785
- succeeded = []
786
- started = []
787
- wont_run = []
788
- not_ready_to_run = set ()
802
+ failed = set ()
803
+ succeeded = set ()
804
+ started = set ()
805
+ skipped = set ()
806
+ not_ready = set ()
807
+ deadlocked = set ()
789
808
790
809
for task in self .dag .tasks :
791
810
if (not self .include_adhoc ) and task .adhoc :
@@ -800,67 +819,56 @@ def _execute(self):
800
819
session .commit ()
801
820
802
821
# Triggering what is ready to get triggered
803
- deadlocked = False
804
822
while tasks_to_run and not deadlocked :
805
-
823
+ not_ready . clear ()
806
824
for key , ti in list (tasks_to_run .items ()):
807
825
808
826
ti .refresh_from_db ()
809
827
ignore_depends_on_past = (
810
828
self .ignore_first_depends_on_past and
811
829
ti .execution_date == (start_date or ti .start_date ))
812
830
813
- # Did the task finish without failing? -- then we're done
814
- if (
815
- ti .state in (State .SUCCESS , State .SKIPPED ) and
816
- key in tasks_to_run ):
817
- succeeded .append (key )
818
- tasks_to_run .pop (key )
831
+ # The task was already marked successful or skipped by a
832
+ # different Job. Don't rerun it.
833
+ if key not in started :
834
+ if ti .state == State .SUCCESS :
835
+ succeeded .add (key )
836
+ tasks_to_run .pop (key )
837
+ continue
838
+ elif ti .state == State .SKIPPED :
839
+ skipped .add (key )
840
+ tasks_to_run .pop (key )
841
+ continue
819
842
820
- # Is the task runnable? -- the run it
821
- elif ti .is_queueable (
843
+ # Is the task runnable? -- then run it
844
+ if ti .is_queueable (
822
845
include_queued = True ,
823
846
ignore_depends_on_past = ignore_depends_on_past ,
824
847
flag_upstream_failed = True ):
848
+ self .logger .debug ('Sending {} to executor' .format (ti ))
825
849
executor .queue_task_instance (
826
850
ti ,
827
851
mark_success = self .mark_success ,
828
852
pickle_id = pickle_id ,
829
853
ignore_dependencies = self .ignore_dependencies ,
830
854
ignore_depends_on_past = ignore_depends_on_past ,
831
855
pool = self .pool )
832
- ti .state = State .RUNNING
833
- if key not in started :
834
- started .append (key )
835
- if ti in not_ready_to_run :
836
- not_ready_to_run .remove (ti )
837
-
838
- # Mark the task as not ready to run. If the set of tasks
839
- # that aren't ready ever equals the set of tasks to run,
840
- # then the backfill is deadlocked
856
+ started .add (key )
857
+
858
+ # Mark the task as not ready to run
841
859
elif ti .state in (State .NONE , State .UPSTREAM_FAILED ):
842
- not_ready_to_run .add (ti )
843
- if not_ready_to_run == set (tasks_to_run .values ()):
844
- msg = 'BackfillJob is deadlocked: no tasks can be run.'
845
- if any (
846
- t .are_dependencies_met () !=
847
- t .are_dependencies_met (
848
- ignore_depends_on_past = True )
849
- for t in tasks_to_run .values ()):
850
- msg += (
851
- ' Some of the tasks that were unable to '
852
- 'run have "depends_on_past=True". Try running '
853
- 'the backfill with the option '
854
- '"ignore_first_depends_on_past=True" '
855
- ' or passing "-I" at the command line.' )
856
- self .logger .error (msg )
857
- deadlocked = True
858
- wont_run .extend (not_ready_to_run )
859
- tasks_to_run .clear ()
860
+ self .logger .debug ('Added {} to not_ready' .format (ti ))
861
+ not_ready .add (key )
860
862
861
863
self .heartbeat ()
862
864
executor .heartbeat ()
863
865
866
+ # If the set of tasks that aren't ready ever equals the set of
867
+ # tasks to run, then the backfill is deadlocked
868
+ if not_ready and not_ready == set (tasks_to_run ):
869
+ deadlocked .update (tasks_to_run .values ())
870
+ tasks_to_run .clear ()
871
+
864
872
# Reacting to events
865
873
for key , state in list (executor .get_event_buffer ().items ()):
866
874
dag_id , task_id , execution_date = key
@@ -882,12 +890,12 @@ def _execute(self):
882
890
883
891
# task reports skipped
884
892
elif ti .state == State .SKIPPED :
885
- wont_run . append (key )
893
+ skipped . add (key )
886
894
self .logger .error ("Skipping {} " .format (key ))
887
895
888
896
# anything else is a failure
889
897
else :
890
- failed .append (key )
898
+ failed .add (key )
891
899
self .logger .error ("Task instance {} failed" .format (key ))
892
900
893
901
tasks_to_run .pop (key )
@@ -899,18 +907,19 @@ def _execute(self):
899
907
if ti .state == State .SUCCESS :
900
908
self .logger .info (
901
909
'Task instance {} succeeded' .format (key ))
902
- succeeded .append (key )
910
+ succeeded .add (key )
903
911
tasks_to_run .pop (key )
904
912
905
913
# task reports failure
906
914
elif ti .state == State .FAILED :
907
915
self .logger .error ("Task instance {} failed" .format (key ))
908
- failed .append (key )
916
+ failed .add (key )
909
917
tasks_to_run .pop (key )
910
918
911
919
# this probably won't ever be triggered
912
- elif key in not_ready_to_run :
913
- continue
920
+ elif ti in not_ready :
921
+ self .logger .info (
922
+ "{} wasn't expected to run, but it did" .format (ti ))
914
923
915
924
# executor reports success but task does not - this is weird
916
925
elif ti .state not in (
@@ -939,29 +948,51 @@ def _execute(self):
939
948
ti .handle_failure (msg )
940
949
tasks_to_run .pop (key )
941
950
942
- msg = (
943
- "[backfill progress] "
944
- "waiting: {0} | "
945
- "succeeded: {1} | "
946
- "kicked_off: {2} | "
947
- "failed: {3} | "
948
- "wont_run: {4} " ).format (
949
- len (tasks_to_run ),
950
- len (succeeded ),
951
- len (started ),
952
- len (failed ),
953
- len (wont_run ))
951
+ msg = ' | ' .join ([
952
+ "[backfill progress]" ,
953
+ "waiting: {0}" ,
954
+ "succeeded: {1}" ,
955
+ "kicked_off: {2}" ,
956
+ "failed: {3}" ,
957
+ "skipped: {4}" ,
958
+ "deadlocked: {5}"
959
+ ]).format (
960
+ len (tasks_to_run ),
961
+ len (succeeded ),
962
+ len (started ),
963
+ len (failed ),
964
+ len (skipped ),
965
+ len (deadlocked ))
954
966
self .logger .info (msg )
955
967
956
968
executor .end ()
957
969
session .close ()
970
+
971
+ err = ''
958
972
if failed :
959
- msg = (
960
- "------------------------------------------\n "
961
- "Some tasks instances failed, "
962
- "here's the list:\n {}" .format (failed ))
963
- raise AirflowException (msg )
964
- self .logger .info ("All done. Exiting." )
973
+ err += (
974
+ "---------------------------------------------------\n "
975
+ "Some task instances failed:\n {}\n " .format (failed ))
976
+ if deadlocked :
977
+ err += (
978
+ '---------------------------------------------------\n '
979
+ 'BackfillJob is deadlocked.' )
980
+ deadlocked_depends_on_past = any (
981
+ t .are_dependencies_met () != t .are_dependencies_met (
982
+ ignore_depends_on_past = True )
983
+ for t in deadlocked )
984
+ if deadlocked_depends_on_past :
985
+ err += (
986
+ 'Some of the deadlocked tasks were unable to run because '
987
+ 'of "depends_on_past" relationships. Try running the '
988
+ 'backfill with the option '
989
+ '"ignore_first_depends_on_past=True" or passing "-I" at '
990
+ 'the command line.' )
991
+ err += ' These tasks were unable to run:\n {}\n ' .format (deadlocked )
992
+ if err :
993
+ raise AirflowException (err )
994
+
995
+ self .logger .info ("Backfill done. Exiting." )
965
996
966
997
967
998
class LocalTaskJob (BaseJob ):
0 commit comments