Merge pull request #3 from dalager/Refactor

dalager · web-flow · commit e117366d6662 · 2024-09-29T09:22:09.000+02:00
Refactoring, cleanup and test improvements
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+max-line-length = 100
diff --git a/README.md b/README.md
@@ -79,24 +79,25 @@ In [graph_visualizer.py](graph_visualizer.py) you will find three configuration
 The following is the output of running the script on the provided `Book1.xlsx` file.
 
 ```bash
-=== Dependency Graph Summary ===
-Cell/Node count                50
-Dependency count               60
-
-=== Nodes with the highest degree ===
-Range!F1                       10
-Base!B5                         4
-Base!B12                        3
-Base!B17                        3
-Base!I21                        3
-Base!G22                        3
-Base!B22                        3
-Base!B28                        3
-Range!B2                        3
-Range!B3                        3
-
-=== Formula functions by count ===
-SUM                             3
+===  Dependency Graph Summary ===
+Cell/Node count                70
+Dependency count              100
+
+
+===  Most connected nodes     ===
+Range Madness!A2:A11           22
+Range Madness!B2:B11           11
+Range Madness!F1               10
+Main Sheet!B5                   4
+Main Sheet!B22                  4
+Detached !A2:A4                 4
+Range Madness!B2                4
+Range Madness!B3                4
+Range Madness!B4                4
+Range Madness!B5                4
+
+===  Most used functions      ===
+SUM                             4
 POWER                           1
 
 Visualizing the graph of dependencies.
diff --git a/excel_parser.py b/excel_parser.py
@@ -1,63 +1,75 @@
 from openpyxl.utils import get_column_letter, range_boundaries
 import re
-
+from typing import List, Tuple, Dict
 
 # Regex to detect cell references like A1, B2, or ranges like A1:B2
 CELL_REF_REGEX = r"('?[A-Za-z0-9_\-\[\] ]+'?![A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)|([A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)"  # noqa
 
 
-def expand_range(range_ref):
+def expand_range(range_reference: str) -> List[str]:
     """
     Expand a range reference (e.g., 'A1:A3') into a list of individual cell references.
-    """
 
+    Args:
+        range_reference (str): The range reference to expand.
+
+    Returns:
+        List[str]: A list of individual cell references.
+    """
     # if there is a sheet name in the range reference, put it away for now
-    if "!" in range_ref:
-        sheet_name, range_ref = range_ref.split("!")
+    if "!" in range_reference:
+        sheet_name, range_reference = range_reference.split("!")
     else:
         sheet_name = None
 
-    min_col, min_row, max_col, max_row = range_boundaries(range_ref)
+    min_col, min_row, max_col, max_row = range_boundaries(range_reference)
     expanded_cells = []
 
     # Loop over rows and columns in the range
     for row in range(min_row, max_row + 1):
         for col in range(min_col, max_col + 1):
-            # if sheetname is set
+            cell_ref = f"{get_column_letter(col)}{row}"
             if sheet_name:
-                expanded_cells.append(f"{sheet_name}!{get_column_letter(col)}{row}")
+                expanded_cells.append(f"{sheet_name}!{cell_ref}")
             else:
-                expanded_cells.append(f"{get_column_letter(col)}{row}")
+                expanded_cells.append(cell_ref)
 
     return expanded_cells
 
 
-def extract_references(formula):
+def extract_references(formula: str) -> Tuple[List[str], List[str], Dict[str, str]]:
     """
     Extract all referenced cells and ranges from a formula using regular expressions.
     This returns a list of both individual cells and range references.
+
+    Args:
+        formula (str): The formula to extract references from.
+
+    Returns:
+        Tuple[List[str], List[str], Dict[str, str]]: Direct (single-cell) references,
+            range references, and a dict mapping each expanded cell to its source range.
     """
     formula = formula.replace("$", "")
     matches = re.findall(CELL_REF_REGEX, formula)
     references = [match[0] if match[0] else match[2] for match in matches]
 
-    # trim the extracted references
+    # Trim the extracted references
     references = [ref.strip() for ref in references]
 
     expanded_references = []
     dependencies = {}
     direct_references = []
     range_references = []
 
-    for ref in references:
-        if ":" in ref:  # it's a range like A1:A3
-            expanded_cells = expand_range(ref)
+    for reference in references:
+        if ":" in reference:  # it's a range like A1:A3
+            expanded_cells = expand_range(reference)
             expanded_references.extend(expanded_cells)
-            range_references.append(ref)
+            range_references.append(reference)
             # Store the range-to-cells relationship
             for cell in expanded_cells:
-                dependencies[cell] = ref
+                dependencies[cell] = reference
         else:  # single cell
-            direct_references.append(ref)
+            direct_references.append(reference)
 
     return direct_references, range_references, dependencies
diff --git a/graph_summarizer.py b/graph_summarizer.py
@@ -0,0 +1,46 @@
+from collections import Counter
+
+
+def print_summary(graph, functionsdict):
+    """
+    Print a summary of a networkx DiGraph of cell dependencies and the most used formula functions.
+    """
+    strpadsize = 28
+    numpadsize = 5
+
+    print_basic_info(graph, strpadsize, numpadsize)
+    print_highest_degree_nodes(graph, strpadsize, numpadsize)
+    print_most_used_functions(functionsdict, strpadsize, numpadsize)
+
+
+def print_basic_info(graph, strpadsize, numpadsize):
+    print("===  Dependency Graph Summary ===")
+    print(
+        "Cell/Node count".ljust(strpadsize, " ")
+        + str(graph.number_of_nodes()).rjust(numpadsize, " ")
+    )
+    print(
+        "Dependency count".ljust(strpadsize, " ")
+        + str(graph.number_of_edges()).rjust(numpadsize, " ")
+    )
+    print()
+
+
+def print_highest_degree_nodes(graph, strpadsize, numpadsize):
+    print("\n===  Most connected nodes     ===")
+    degree_view = graph.degree()
+    degree_counts = Counter(dict(degree_view))
+    max_degree_node = degree_counts.most_common(10)
+
+    for node, degree in max_degree_node:
+        print(f"{node.ljust(strpadsize)}{str(degree).rjust(numpadsize, ' ')} ")
+
+
+def print_most_used_functions(functionsdict, strpadsize, numpadsize):
+    print("\n===  Most used functions      ===")
+    sorted_functions = dict(
+        sorted(functionsdict.items(), key=lambda item: item[1], reverse=True)
+    )
+
+    for function, count in sorted_functions.items():
+        print(f"{function.ljust(strpadsize, ' ')}{str(count).rjust(numpadsize, ' ')}")
diff --git a/graphbuilder.py b/graphbuilder.py
diff --git a/test_cell_reference_extraction.py b/test_cell_reference_extraction.py