2
2
This script extracts formulas from an Excel file and builds a dependency graph.
3
3
"""
4
4
5
- from collections import Counter
6
- import re
7
5
from openpyxl import load_workbook
6
+ from collections import Counter
8
7
import networkx as nx
9
- import matplotlib . pyplot as plt
8
+ import re
10
9
import sys
11
-
12
- # Regex to detect cell references like A1, B2, or ranges like A1:B2
13
- CELL_REF_REGEX = r"('?[A-Za-z0-9_\-\[\] ]+'?![A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)|([A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)" # noqa
10
+ from graph_visualizer import visualize_dependency_graph
11
+ from excel_parser import extract_references
14
12
15
13
# dictionary that stores the unique functions used in the formulas
16
14
# the key will be the function name and the value will be the number of times it was used
@@ -28,10 +26,14 @@ def log(msg):
28
26
29
27
30
28
def stat_functions (cellvalue ):
29
+ """
30
+ Extract the functions used in the formula and store them in a dictionary.
31
+ This will be used to print the most used functions in the formulas.
32
+ """
33
+
31
34
# functions used in the formula
32
35
cellfuncs = re .findall (r"[A-Z]+\(" , cellvalue )
33
36
log (f" Functions used: { functions_dict } " )
34
- # add the functions to the dictionary
35
37
for function in cellfuncs :
36
38
# remove the "(" from the function name
37
39
function = function [:- 1 ]
@@ -63,28 +65,50 @@ def extract_formulas_and_build_dependencies(file_path):
63
65
if isinstance (cell .value , str ) and cell .value .startswith ("=" ):
64
66
stat_functions (cell .value )
65
67
66
- # The formula is found in this cell
67
68
cell_name = f"{ sheet_name } !{ cell .coordinate } "
68
69
log (f"Formula in { cell_name } : { cell .value } " )
69
70
70
- # Extract all referenced cells from the formula
71
- referenced_cells = extract_references (cell .value )
72
- refs = []
71
+ graph .add_node (cell_name , sheet = sheet_name )
72
+
73
+ # Extract all referenced cells and ranges from the formula
74
+ referenced_cells , range_dependencies = extract_references (
75
+ cell .value
76
+ )
73
77
74
78
# Add the cell and its dependencies to the graph
75
79
for ref_cell in referenced_cells :
76
80
if "!" not in ref_cell :
77
- # No sheet specified in the assume current sheet
81
+ # No sheet specified, assume current sheet
78
82
refc = f"{ sheet_name } !{ ref_cell } "
79
83
else :
80
84
refc = ref_cell
81
85
82
- # Add node to refs if not already in refs
83
- if refc not in refs :
84
- log (f" Depends on: { refc } " )
85
- refs .append (refc )
86
- graph .add_edge (cell_name , refc )
86
+ log (f" Depends on: { refc } " )
87
+ graph .add_node (refc , sheet = sheet_name )
88
+ graph .add_edge (cell_name , refc )
89
+
90
+ # Add dependencies for ranges
91
+ for single_cell , range_ref in range_dependencies .items ():
92
+ if "!" not in range_ref :
93
+ range_ref = f"{ sheet_name } !{ range_ref } "
94
+ range_sheet = sheet_name
95
+ else :
96
+ range_ref = range_ref
97
+ range_sheet = range_ref .split ("!" )[0 ]
98
+
99
+ if "!" not in single_cell :
100
+ single_cell = f"{ sheet_name } !{ single_cell } "
101
+ cell_sheet = sheet_name
102
+ else :
103
+ single_cell = single_cell
104
+ cell_sheet = single_cell .split ("!" )[0 ]
105
+
106
+ # this is the single cell that points to the range it belongs to
107
+ graph .add_node (f"{ single_cell } " , sheet = cell_sheet ) # noqa
108
+ graph .add_node (f"{ range_ref } " , sheet = range_sheet )
87
109
110
+ # Then add the edge between the single cell and the range
111
+ graph .add_edge (f"{ single_cell } " , f"{ range_ref } " )
88
112
return graph
89
113
90
114
@@ -127,51 +151,6 @@ def print_summary(graph, functionsdict):
127
151
print (f"{ function .ljust (strpadsize , ' ' )} { str (count ).rjust (numpadsize , ' ' )} " )
128
152
129
153
130
- def extract_references (formula ):
131
- """
132
- Extract all referenced cells from a formula using regular expressions.
133
- This returns a list of cells that are mentioned directly (e.g., A1, B2),
134
- but doesn't handle ranges or external sheets' references.
135
- """
136
- formula = formula .replace ("$" , "" )
137
- matches = re .findall (CELL_REF_REGEX , formula )
138
- references = [match [0 ] if match [0 ] else match [2 ] for match in matches ]
139
-
140
- # trim the extracted references
141
- references = [ref .strip () for ref in references ]
142
-
143
- return references
144
-
145
-
146
- def visualize_dependency_graph (graph , file_path ):
147
- """
148
- Render the dependency graph using matplotlib and networkx.
149
- """
150
-
151
- if "--keep-direction" not in sys .argv :
152
- # Convert the graph to an undirected graph
153
- graph = graph .to_undirected ()
154
-
155
- pos = nx .spring_layout (graph ) # layout for nodes
156
- plt .figure (figsize = (10 , 10 ))
157
- nx .draw (
158
- graph ,
159
- pos ,
160
- with_labels = True ,
161
- node_color = "black" ,
162
- edge_color = "gray" ,
163
- linewidths = 3.5 ,
164
- alpha = 0.8 ,
165
- width = 1 ,
166
- # font_weight="bold",
167
- node_size = 20 ,
168
- )
169
-
170
- filename = f"images/{ file_path } .png"
171
- plt .savefig (filename )
172
- print (f"Graph visualization saved to { filename } " )
173
-
174
-
175
154
if __name__ == "__main__" :
176
155
path_to_excel = "Book1.xlsx"
177
156
0 commit comments