29
29
import kubernetes
30
30
from kubernetes import watch , client
31
31
from kubernetes .client .rest import ApiException
32
- from urllib3 .exceptions import HTTPError
32
+ from urllib3 .exceptions import HTTPError , ReadTimeoutError
33
33
34
34
from airflow .configuration import conf
35
35
from airflow .contrib .kubernetes .istio import Istio
@@ -320,6 +320,21 @@ def _validate(self):
320
320
class KubernetesJobWatcher (multiprocessing .Process , LoggingMixin ):
321
321
"""Watches for Kubernetes jobs"""
322
322
def __init__ (self , namespace , watcher_queue , resource_version , worker_uuid , kube_config ):
323
+ """Initialize KubernetesJobWatcher, a background process that informs the
324
+ AirflowKubernetesScheduler when tasks are completed.
325
+
326
+ :param namespace: The namespace which will contain all tasks
327
+ :type namespace: str
328
+ :param watcher_queue: Used to inform the Scheduler of completed tasks
329
+ :type watcher_queue: multiprocessing.Queue
330
+ :param resource_version: The Kubernetes resource version passed to the pod watch so it resumes from that point
331
+ :type resource_version: str (typically a numeric string, for example "0")
332
+ :param worker_uuid: A label selector used to locate pods that belong to this executor
333
+ :type worker_uuid: str
334
+ :param kube_config: Configuration for Kubernetes
335
+ :type kube_config: KubeConfig
336
+
337
+ """
323
338
multiprocessing .Process .__init__ (self )
324
339
self .namespace = namespace
325
340
self .worker_uuid = worker_uuid
@@ -339,8 +354,8 @@ def run(self):
339
354
self .log .exception ('Unknown error in KubernetesJobWatcher. Failing' )
340
355
raise
341
356
else :
342
- self .log .warning ( 'Watch died gracefully, starting back up with: '
343
- 'last resource_version: %s' , self .resource_version )
357
+ self .log .info ( 'Watcher will start back up with: '
358
+ 'last resource_version: %s' , self .resource_version )
344
359
345
360
def _run (self , kube_client , resource_version , worker_uuid , kube_config ):
346
361
self .log .info (
@@ -350,15 +365,34 @@ def _run(self, kube_client, resource_version, worker_uuid, kube_config):
350
365
watcher = watch .Watch ()
351
366
352
367
kwargs = {'label_selector' : 'airflow-worker={}' .format (worker_uuid )}
368
+
353
369
if resource_version :
354
370
kwargs ['resource_version' ] = resource_version
371
+
355
372
if kube_config .kube_client_request_args :
356
373
for key , value in kube_config .kube_client_request_args .items ():
357
374
kwargs [key ] = value
358
375
359
- last_resource_version = None
360
- for event in watcher .stream (kube_client .list_namespaced_pod , self .namespace ,
361
- ** kwargs ):
376
+ if resource_version :
377
+ last_resource_version = resource_version
378
+ else :
379
+ last_resource_version = None
380
+
381
+ event_generator = watcher .stream (kube_client .list_namespaced_pod ,
382
+ self .namespace ,
383
+ ** kwargs )
384
+ while True :
385
+ try :
386
+ event = next (event_generator )
387
+ except StopIteration :
388
+ break
389
+ except ReadTimeoutError :
390
+ self .log .info ("Timed out waiting for an event." )
391
+ break
392
+ except HTTPError :
393
+ self .log .info ("Terminating connection to kube-api." )
394
+ break
395
+
362
396
task = event ['object' ]
363
397
self .log .info (
364
398
'Event: %s had an event of type %s' ,
@@ -437,13 +471,11 @@ def _make_kube_watcher(self):
437
471
watcher .start ()
438
472
return watcher
439
473
440
- def _health_check_kube_watcher (self ):
441
- if self .kube_watcher .is_alive ():
442
- pass
443
- else :
474
def _ensure_kube_watcher(self):
    """Restart the Kubernetes job watcher process if it has stopped.

    Does nothing while ``self.kube_watcher.is_alive()`` is true. Otherwise
    it increments the ``executor.kube_watcher.restarts`` stat, logs an
    error, and replaces ``self.kube_watcher`` with the watcher returned by
    ``self._make_kube_watcher()``.
    """
    if self.kube_watcher.is_alive():
        # Watcher is still running; nothing to repair.
        return

    settings.Stats.incr("executor.kube_watcher.restarts")
    self.log.error('Kubernetes job watcher process stopped. Restarting')
    self.kube_watcher = self._make_kube_watcher()
448
480
449
481
def run_next (self , next_job ):
@@ -491,7 +523,7 @@ def sync(self):
491
523
:return:
492
524
493
525
"""
494
- self ._health_check_kube_watcher ()
526
+ self ._ensure_kube_watcher ()
495
527
while True :
496
528
try :
497
529
task = self .watcher_queue .get_nowait ()
@@ -500,6 +532,8 @@ def sync(self):
500
532
finally :
501
533
self .watcher_queue .task_done ()
502
534
except Empty :
535
+ # When self.watcher_queue is empty,
536
+ # this function returns
503
537
break
504
538
505
539
def process_watcher_task (self , task ):
0 commit comments