logs

vamsijilla · vamsijilla · commit 2845a6e5d323 · 2024-11-02T21:40:11.000-04:00
diff --git a/.DS_Store b/.DS_Store
diff --git a/Logs_Anomalies/anomaly-detector.py b/Logs_Anomalies/anomaly-detector.py
@@ -0,0 +1,145 @@
+# anomaly_detector.py
+from typing import Dict, List, Optional
+import numpy as np
+from datetime import datetime
+import tensorflow as tf
+from google.cloud import storage
+import json
+
+class AnomalyDetector:
+    """Data-quality checks for the X-ray dataset held in a GCS bucket."""
+
+    _IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')
+
+    def __init__(self, logger, bucket_name: str):
+        """Store the logger, open the bucket, and define alert thresholds."""
+        self.logger = logger
+        self.bucket_name = bucket_name
+        self.client = storage.Client()
+        self.bucket = self.client.bucket(bucket_name)
+        self.thresholds = {
+            'missing_labels_ratio': 0.05,
+            'corrupt_images_ratio': 0.02,
+            'min_image_size': 100 * 100,  # minimum 100x100 pixels
+            'max_image_size': 1000 * 1000,  # maximum 1000x1000 pixels
+            'class_imbalance_ratio': 0.1  # minimum 10% for any class
+        }
+
+    def _load_labels(self):
+        """Download ``raw/xray/labels.csv`` and parse it into a DataFrame."""
+        # Local imports: the module never imports pandas, so ``pd`` was undefined.
+        import io
+        import pandas as pd
+        raw = self.bucket.blob('raw/xray/labels.csv').download_as_bytes()
+        # read_csv expects a path or file-like object, not a bytes payload.
+        return pd.read_csv(io.BytesIO(raw))
+
+    def check_data_completeness(self) -> Dict:
+        """Compare image files in the bucket against the labels file.
+
+        Returns counts under ``total_images``, ``total_labeled``,
+        ``missing_labels`` and ``extra_labels`` plus an ISO ``timestamp``.
+        Any exception is logged through ``log_error`` and re-raised.
+        """
+        try:
+            self.logger.log_task_start("check_data_completeness")
+            image_files = {
+                blob.name.split('/')[-1]
+                for blob in self.bucket.list_blobs(prefix='raw/xray/')
+                if blob.name.endswith(self._IMAGE_EXTENSIONS)
+            }
+            labeled_files = set(self._load_labels()['image_id'].values)
+            metrics = {
+                'total_images': len(image_files),
+                'total_labeled': len(labeled_files),
+                'missing_labels': len(image_files - labeled_files),
+                'extra_labels': len(labeled_files - image_files),
+                'timestamp': datetime.now().isoformat()
+            }
+            # No images means a 0.0 ratio rather than a ZeroDivisionError.
+            total = metrics['total_images']
+            missing_ratio = metrics['missing_labels'] / total if total else 0.0
+            if missing_ratio > self.thresholds['missing_labels_ratio']:
+                self.logger.log_error(
+                    "data_completeness",
+                    f"High ratio of missing labels: {missing_ratio:.2%}",
+                    alert=True)
+            return metrics
+        except Exception as e:
+            self.logger.log_error("check_data_completeness", e)
+            raise
+
+    def check_image_quality(self, sample_size: int = 100) -> Dict:
+        """Decode up to ``sample_size`` images and count quality problems.
+
+        Returns ``corrupt_images``, ``invalid_dimensions``, ``invalid_format``
+        and ``samples_checked`` counts plus ``corrupt_ratio`` and
+        ``invalid_dim_ratio``. Any exception escaping the per-image loop is
+        logged through ``log_error`` and re-raised.
+        """
+        try:
+            self.logger.log_task_start("check_image_quality")
+            # Sample image blobs only (labels.csv would be decoded too); stop when full.
+            blobs = []
+            for blob in self.bucket.list_blobs(prefix='raw/xray/'):
+                if len(blobs) >= sample_size:
+                    break
+                if blob.name.endswith(self._IMAGE_EXTENSIONS):
+                    blobs.append(blob)
+            metrics = {
+                'corrupt_images': 0,
+                'invalid_dimensions': 0,
+                'invalid_format': 0,  # reported, but nothing increments it yet
+                'samples_checked': len(blobs)
+            }
+            for blob in blobs:
+                try:
+                    image = tf.image.decode_image(blob.download_as_bytes())
+                    image_size = image.shape[0] * image.shape[1]
+                except Exception:
+                    # Best effort: a failed download/decode is only tallied.
+                    metrics['corrupt_images'] += 1
+                    continue
+                if (image_size < self.thresholds['min_image_size'] or
+                        image_size > self.thresholds['max_image_size']):
+                    metrics['invalid_dimensions'] += 1
+            # An empty sample yields 0.0 ratios rather than ZeroDivisionError.
+            checked = metrics['samples_checked'] or 1
+            metrics['corrupt_ratio'] = metrics['corrupt_images'] / checked
+            metrics['invalid_dim_ratio'] = metrics['invalid_dimensions'] / checked
+            if metrics['corrupt_ratio'] > self.thresholds['corrupt_images_ratio']:
+                self.logger.log_error(
+                    "image_quality",
+                    f"High ratio of corrupt images: {metrics['corrupt_ratio']:.2%}",
+                    alert=True)
+            return metrics
+        except Exception as e:
+            self.logger.log_error("check_image_quality", e)
+            raise
+
+    def check_class_distribution(self) -> Dict:
+        """Report per-class counts and ratios from the labels file.
+
+        Returns ``class_distribution`` (label -> count) and ``class_ratios``
+        (label -> share of all rows). Any exception is logged through
+        ``log_error`` and re-raised.
+        """
+        try:
+            self.logger.log_task_start("check_class_distribution")
+            labels_df = self._load_labels()
+            class_dist = labels_df['label'].value_counts()
+            total_samples = len(labels_df)
+            metrics = {
+                'class_distribution': class_dist.to_dict(),
+                'class_ratios': (class_dist / total_samples).to_dict()
+            }
+            for class_name, ratio in metrics['class_ratios'].items():
+                if ratio < self.thresholds['class_imbalance_ratio']:
+                    self.logger.log_error(
+                        "class_distribution",
+                        f"Class {class_name} is underrepresented: {ratio:.2%}",
+                        alert=True)
+            return metrics
+        except Exception as e:
+            self.logger.log_error("check_class_distribution", e)
+            raise
diff --git a/Logs_Anomalies/logging-setup.py b/Logs_Anomalies/logging-setup.py
@@ -0,0 +1,83 @@
+# logging-setup.py
+import logging
+from logging.handlers import RotatingFileHandler
+import os
+from datetime import datetime
+from typing import Dict, Any
+from airflow.hooks.base import BaseHook
+from slack_sdk import WebClient
+from slack_sdk.errors import SlackApiError
+
+class PipelineLogger:
+    """File + console logger for one pipeline, with optional Slack alerts."""
+
+    def __init__(self, pipeline_name: str):
+        """Create ``logs/<pipeline_name>/`` and configure the named logger."""
+        self.pipeline_name = pipeline_name
+        self.log_dir = f"logs/{pipeline_name}"
+        os.makedirs(self.log_dir, exist_ok=True)
+        self.logger = logging.getLogger(pipeline_name)
+        self.logger.setLevel(logging.INFO)
+        # getLogger returns the same object for a given name, so handlers are
+        # attached only once; re-adding them would duplicate every log line.
+        if not self.logger.handlers:
+            log_format = logging.Formatter(
+                '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+            )
+            log_file = f"{self.log_dir}/{datetime.now().strftime('%Y%m%d')}.log"
+            file_handler = RotatingFileHandler(
+                log_file,
+                maxBytes=10485760,  # 10MB
+                backupCount=5
+            )
+            for handler in (file_handler, logging.StreamHandler()):
+                handler.setFormatter(log_format)
+                self.logger.addHandler(handler)
+        self.slack_client = self._setup_slack()
+
+    def _setup_slack(self) -> "WebClient | None":
+        """Build a Slack client from the ``slack_webhook`` Airflow connection;
+        log the failure and return None when that is not possible."""
+        try:
+            slack_conn = BaseHook.get_connection('slack_webhook')
+            return WebClient(token=slack_conn.password)
+        except Exception as e:
+            self.logger.error(f"Failed to setup Slack client: {str(e)}")
+            return None
+
+    def log_task_start(self, task_name: str, params: Dict[str, Any] = None):
+        """Log task start, appending ``params`` when a non-empty dict is given."""
+        msg = f"Starting task: {task_name}"
+        if params:
+            msg += f" with parameters: {params}"
+        self.logger.info(msg)
+
+    def log_task_completion(self, task_name: str, metrics: Dict[str, Any] = None):
+        """Log task completion, appending ``metrics`` when a non-empty dict is given."""
+        msg = f"Completed task: {task_name}"
+        if metrics:
+            msg += f" with metrics: {metrics}"
+        self.logger.info(msg)
+
+    def log_error(self, task_name: str, error: Exception, alert: bool = True):
+        """Log an error for ``task_name`` and optionally post a Slack alert.
+
+        ``error`` may be an exception or a plain message; exceptions are
+        logged with their traceback. Alert delivery is best effort.
+        """
+        error_msg = f"Error in task {task_name}: {str(error)}"
+        exc_info = error if isinstance(error, BaseException) else None
+        self.logger.error(error_msg, exc_info=exc_info)
+        if alert and self.slack_client:
+            try:
+                self.slack_client.chat_postMessage(
+                    channel="#pipeline-alerts",
+                    text=f":red_circle: *Pipeline Error*\n{error_msg}"
+                )
+            except Exception as e:
+                # Broader than SlackApiError: a transport failure must not escape from error reporting.
+                self.logger.error(f"Failed to send Slack alert: {str(e)}")
+
+    def log_metric(self, metric_name: str, value: Any):
+        """Log a single named metric at INFO level."""
+        self.logger.info(f"Metric - {metric_name}: {value}")
diff --git a/Logs_Anomalies/pipeline-optimizer.py b/Logs_Anomalies/pipeline-optimizer.py
@@ -0,0 +1,16 @@
+# pipeline_optimizer.py
+from airflow.models import DagRun
+from airflow.utils.db import provide_session
+from datetime import datetime, timedelta
+import numpy as np
+from typing import Dict, List
+import json
+
+class PipelineOptimizer:
+    def __init__(self, logger, dag_id: str):
+        self.logger = logger
+        self.dag_id = dag_id
+    # NOTE(review): analyze_task_durations shows no body beyond its docstring in this hunk, so as written it returns None rather than a Dict — confirm.
+    @provide_session
+    def analyze_task_durations(self, session=None, lookback_days: int = 7) -> Dict:
+        """Analyze task durations from Airflow's history"""