2
2
This script extracts formulas from an Excel file and builds a dependency graph.
3
3
"""
4
4
5
+ from typing import List , Dict
5
6
from openpyxl import load_workbook
6
- from collections import Counter
7
7
import networkx as nx
8
8
import re
9
9
import sys
10
10
from graph_visualizer import visualize_dependency_graph
11
+ from graph_summarizer import print_summary
11
12
from excel_parser import extract_references
12
13
13
# Dictionary that stores the unique functions used in the formulas.
# The key is the function name and the value is the number of times it was used.
# It is module-level state: stat_functions() updates it for every formula it sees.
functions_dict: Dict[str, int] = {}
16
17
17
18
18
def log(msg: str) -> None:
    """
    Print a message to the console when verbose output is requested.

    The message is printed only if the ``--verbose`` flag is present in
    ``sys.argv``; otherwise the call does nothing.
    """
    if "--verbose" in sys.argv:
        print(msg)
26
25
27
26
28
def sanitize_sheetname(sheetname: str) -> str:
    """
    Return the sheet name with every single-quote character removed.

    No other character is altered.
    """
    return sheetname.replace("'", "")
33
32
34
33
35
def sanitize_range(rangestring: str) -> str:
    """
    Remove single quotes from the sheet part of a range reference.

    A reference without a "!" separator carries no sheet part and is
    returned unchanged. Otherwise everything before the last "!" is treated
    as the sheet name, its single quotes are removed, and the reference is
    rebuilt as "<sheet>!<range>".
    """
    # Split on the LAST "!" so a sheet name that itself contains "!" does
    # not break the two-way unpacking (a plain split("!") would raise
    # ValueError for such input).
    sheet, separator, range_ = rangestring.rpartition("!")
    if not separator:
        return rangestring
    sheet = sheet.replace("'", "")
    return f"{sheet}!{range_}"
44
43
45
44
46
def stat_functions(cellvalue: str) -> None:
    """
    Count the functions used in a formula in the module-level functions_dict.

    Every function name found in ``cellvalue`` increments its counter in
    ``functions_dict``. The counts are used to report the most used
    functions in the formulas.
    """
    # A function name is an upper-case letter followed by letters, digits or
    # dots, directly before "(". Digits and dots are allowed so that names
    # such as LOG10 or STDEV.S are counted under their full name.
    cellfuncs = re.findall(r"([A-Z][A-Z0-9.]*)\(", cellvalue)
    log(f" Functions used: {functions_dict}")
    for function in cellfuncs:
        functions_dict[function] = functions_dict.get(function, 0) + 1
62
55
63
56
64
def add_node(graph: nx.DiGraph, node: str, sheet: str) -> None:
    """
    Add a node to the graph and tag it with the given sheet name.

    The sheet name is stored as the node's ``sheet`` attribute.
    """
    graph.add_node(node, sheet=sheet)
71
64
72
65
73
def extract_formulas_and_build_dependencies(file_path: str) -> nx.DiGraph:
    """
    Extract formulas from an Excel file and build a dependency graph.

    The workbook at ``file_path`` is loaded, every sheet is handed to
    process_sheet() together with its sanitized sheet name, and the
    resulting directed graph is returned.

    If the workbook cannot be loaded, an error message is written to stderr
    and the process exits with status 1.
    """
    try:
        wb = load_workbook(file_path, data_only=False)
    except Exception as e:
        # Report on stderr unconditionally: log() prints nothing without
        # --verbose, which would make the script exit with status 1 and no
        # explanation at all.
        print(f"Error loading workbook: {e}", file=sys.stderr)
        sys.exit(1)

    graph = nx.DiGraph()

    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        log(f"========== Analyzing sheet: {sheet_name} ==========")
        # The worksheet is looked up by its real name; only the name used
        # for graph nodes is sanitized.
        sanitized_sheet_name = sanitize_sheetname(sheet_name)
        process_sheet(ws, sanitized_sheet_name, graph)

    return graph
163
85
164
86
165
def process_sheet(ws, sheet_name: str, graph: nx.DiGraph) -> None:
    """
    Process a sheet and add its formula references to the graph.

    Every cell yielded by ``ws.iter_rows()`` whose value is a string
    starting with "=" is passed to process_formula_cell(); all other cells
    are ignored.
    """
    for row in ws.iter_rows():
        for cell in row:
            # Only cells holding a formula are of interest.
            if isinstance(cell.value, str) and cell.value.startswith("="):
                process_formula_cell(cell, sheet_name, graph)
97
def process_formula_cell(cell, sheet_name: str, graph: nx.DiGraph) -> None:
    """
    Process a cell containing a formula.

    The formula's functions are counted, the cell is added to the graph as
    "<sheet_name>!<coordinate>", and the references returned by
    extract_references() are added as nodes and edges.
    """
    stat_functions(cell.value)
    cell_reference = f"{sheet_name}!{cell.coordinate}"
    log(f"Formula in {cell_reference}: {cell.value}")
    add_node(graph, cell_reference, sheet_name)

    direct_references, range_references, range_dependencies = extract_references(
        cell.value
    )
    add_references_to_graph(direct_references, cell_reference, sheet_name, graph)
    add_ranges_to_graph(range_references, cell_reference, sheet_name, graph)
    add_range_dependencies_to_graph(range_dependencies, sheet_name, graph)
112
+
113
+
114
def add_references_to_graph(
    references: List[str], current_cell: str, sheet_name: str, graph: nx.DiGraph
) -> None:
    """
    Add direct cell references to the graph.

    Each reference is qualified with ``sheet_name`` when it names no sheet
    itself, added as a node, and connected by an edge from ``current_cell``
    to the referenced cell.
    """
    for cell_reference in references:
        cell_reference = format_reference(cell_reference, sheet_name)
        # Tag the node with the sheet the reference points at, not the sheet
        # of the formula: the two differ for cross-sheet references.
        referenced_sheet = cell_reference.split("!")[0]
        log(f" Cell: {cell_reference}")
        add_node(graph, cell_reference, referenced_sheet)
        graph.add_edge(current_cell, cell_reference)
127
def add_ranges_to_graph(
    ranges: List[str], current_cell: str, sheet_name: str, graph: nx.DiGraph
) -> None:
    """
    Add range references to the graph.

    Each range is qualified with ``sheet_name`` when it names no sheet
    itself, added as a node, and connected by an edge from ``current_cell``
    to the range.
    """
    for range_reference in ranges:
        # Format first: the sheet name must be read from the sanitized
        # reference, otherwise a quoted sheet name keeps its quotes in the
        # node's sheet attribute while the node name has none.
        range_reference = format_reference(range_reference, sheet_name)
        range_sheet_name = get_range_sheet_name(range_reference, sheet_name)
        log(f" Range: {range_reference}")
        add_node(graph, range_reference, range_sheet_name)
        graph.add_edge(current_cell, range_reference)
139
+
140
+
141
def add_range_dependencies_to_graph(
    range_dependencies: Dict[str, str], sheet_name: str, graph: nx.DiGraph
) -> None:
    """
    Add dependencies between ranges and the cells they are mapped to.

    ``range_dependencies`` maps a cell reference to a range reference. Both
    are qualified with ``sheet_name`` when they name no sheet themselves,
    both are added as nodes, and an edge is added from the range to the
    cell.
    """
    for cell_reference, range_reference in range_dependencies.items():
        range_reference = format_reference(range_reference, sheet_name)
        cell_reference = format_reference(cell_reference, sheet_name)
        # After formatting, both references carry a sheet prefix before "!".
        range_sheet_name = range_reference.split("!")[0]
        cell_sheet_name = cell_reference.split("!")[0]

        add_node(graph, cell_reference, cell_sheet_name)
        add_node(graph, range_reference, range_sheet_name)
        graph.add_edge(range_reference, cell_reference)
157
+
158
def format_reference(reference: str, sheet_name: str) -> str:
    """
    Format a cell or range reference so that it includes a sheet name.

    A reference without "!" is prefixed with ``sheet_name``. A reference
    that already contains "!" is returned with all single quotes removed.
    """
    if "!" not in reference:
        return f"{sheet_name}!{reference}"
    return reference.replace("'", "")
168
+
169
def get_range_sheet_name(range_reference: str, sheet_name: str) -> str:
    """
    Get the sheet name for a range reference.

    Returns ``sheet_name`` when the reference contains no "!". Otherwise
    returns the part before the first "!" with single quotes removed, so the
    result matches the sheet prefix that format_reference() produces.
    """
    if "!" not in range_reference:
        return sheet_name
    return range_reference.split("!")[0].replace("'", "")
202
174
203
175
204
176
if __name__ == "__main__" :
0 commit comments