Skip to content

Commit 947b774

Browse files
authored
Merge pull request #2 from dalager/deranged
Range expansions redefined
2 parents 4be43ac + ff42671 commit 947b774

7 files changed

+195
-112
lines changed

Book1.xlsx

931 Bytes
Binary file not shown.

README.md

+25-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,29 @@ This is a simple tool and maybe even naïve in its approach - it was hacked toge
1717

1818
Single-cell references in a formula such as `=A1+A2`, sitting in cell `A3`, are considered dependencies between the node `A3` and the nodes `A2` and `A1`.
1919

20-
A range defined in a formula like `=SUM(B1:B200)` is semantically handled like a single reference or node in the tree and not 200 individual nodes in the graph.
20+
```mermaid
21+
graph TD
22+
A1 --> A3
23+
A2 --> A3
24+
A3["=A1 + A2"]
25+
```
26+
27+
A range defined in a formula like `=SUM(B1:B3)` is kept as a single node in the graph, but all the cells it contains are expanded as dependencies of the range node.
28+
29+
So when a cell, `C1`, contains `=SUM(B1:B3)`, the graph will look like this:
30+
31+
```mermaid
32+
33+
graph TD
34+
R -->B1
35+
R -->B2
36+
R -->B3
37+
R["B1:B3"]
38+
C1 --> R
39+
40+
C1["C1=SUM(B1:B3)"]
41+
42+
```
2143

2244
The way the graph is built is by iterating over all cells in the spreadsheet and extracting the references in the formula of each cell. The references are then added as edges in the graph.
2345

@@ -40,6 +62,8 @@ python graphbuilder.py <path_to_excel_file> [--verbose] [--no-visualize] [--keep
4062

4163
Depending on the size of the spreadsheet, you might want to adjust the plot configuration in the code to make the graph more readable (remove labels, decrease widths and sizes, etc.).
4264

65+
In [graph_visualizer.py](graph_visualizer.py) you will find three configurations for small, medium and large graphs. You can adjust the configurations to your needs.
66+
4367
### Arguments
4468

4569
`--verbose` will dump formula cell contents during processing (i.e. less quiet output)

excel_parser.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -46,16 +46,18 @@ def extract_references(formula):
4646

4747
expanded_references = []
4848
dependencies = {}
49+
direct_references = []
50+
range_references = []
4951

5052
for ref in references:
5153
if ":" in ref: # it's a range like A1:A3
5254
expanded_cells = expand_range(ref)
5355
expanded_references.extend(expanded_cells)
54-
56+
range_references.append(ref)
5557
# Store the range-to-cells relationship
5658
for cell in expanded_cells:
5759
dependencies[cell] = ref
5860
else: # single cell
59-
expanded_references.append(ref)
61+
direct_references.append(ref)
6062

61-
return expanded_references, dependencies
63+
return direct_references, range_references, dependencies

graph_visualizer.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@
77
small_graph_settings = {
88
"node_size": 50,
99
"edge_color": "black",
10-
"with_labels": True,
11-
"font_size": 7,
10+
"with_labels": False,
11+
"font_size": 10,
1212
"linewidths": 0.8,
1313
"alpha": 0.8,
1414
"width": 0.2,

graphbuilder.py

+58-15
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,24 @@ def log(msg):
2525
print(msg)
2626

2727

28+
def sanitize_sheetname(sheetname):
    """
    Return the sheet name with every single-quote character removed.

    Only the apostrophe (') is stripped; all other characters are kept.
    """
    # Splitting on the quote and re-joining drops every occurrence of it.
    pieces = sheetname.split("'")
    return "".join(pieces)
33+
34+
35+
def sanitize_range(rangestring):
    """
    Strip single quotes from the sheet qualifier of a range reference.

    A reference of the form ``'My Sheet'!A1:B3`` is returned as
    ``My Sheet!A1:B3``. Only the part before the first ``!`` is altered.

    A reference without a sheet qualifier (no ``!``) is returned unchanged.
    Previously this case raised UnboundLocalError because the sheet and
    range parts were only assigned inside the ``if`` branch.
    """
    if "!" not in rangestring:
        # Nothing to sanitize: there is no sheet part in the reference.
        return rangestring

    parts = rangestring.split("!")
    sheet = parts[0].replace("'", "")
    # Named `cells` rather than `range` to avoid shadowing the builtin.
    cells = parts[1]

    return f"{sheet}!{cells}"
44+
45+
2846
def stat_functions(cellvalue):
2947
"""
3048
Extract the functions used in the formula and store them in a dictionary.
@@ -43,6 +61,15 @@ def stat_functions(cellvalue):
4361
functions_dict[function] = 1
4462

4563

64+
def add_node(graph, node, sheet):
    """
    Register a node in the graph, tagged with the sheet it belongs to.

    The sheet name is logged exactly as passed in, then run through
    sanitize_sheetname before being handed to ``graph.add_node`` as the
    ``sheet`` keyword argument.
    """
    log(f"Adding node: {node} in sheet: {sheet}")
    clean_sheet = sanitize_sheetname(sheet)
    graph.add_node(node, sheet=clean_sheet)
71+
72+
4673
def extract_formulas_and_build_dependencies(file_path):
4774
"""
4875
Extract formulas from an Excel file and build a dependency graph.
@@ -57,8 +84,8 @@ def extract_formulas_and_build_dependencies(file_path):
5784
# Iterate over all sheets
5885
for sheet_name in wb.sheetnames:
5986
ws = wb[sheet_name]
60-
log(f"-- Analyzing sheet: {sheet_name} --")
61-
87+
log(f"========== Analyzing sheet: {sheet_name} ==========")
88+
sheet_name = sanitize_sheetname(sheet_name)
6289
# Iterate over all cells in the sheet and extract formulas
6390
for row in ws.iter_rows():
6491
for cell in row:
@@ -67,30 +94,46 @@ def extract_formulas_and_build_dependencies(file_path):
6794
# collect functions usage statistics
6895
stat_functions(cell.value)
6996

70-
cell_name = f"{sheet_name}!{cell.coordinate}"
71-
log(f"Formula in {cell_name}: {cell.value}")
97+
current_cell = f"{sheet_name}!{cell.coordinate}"
98+
log(f"Formula in {current_cell}: {cell.value}")
7299

73-
graph.add_node(cell_name, sheet=sheet_name)
100+
add_node(graph, current_cell, sheet_name)
74101

75102
# Extract all referenced cells and ranges from the formula
76-
referenced_cells, range_dependencies = extract_references(
77-
cell.value
103+
direct_references, range_references, range_dependencies = (
104+
extract_references(cell.value)
78105
)
79106

80107
# all the referenced cells and cells from expanded ranges
81108
# is added to the graph as nodes and edges
82-
for ref_cell in referenced_cells:
109+
for ref_cell in direct_references:
83110
if "!" not in ref_cell:
84111
# No sheet specified, assume current sheet
85112
refc = f"{sheet_name}!{ref_cell}"
86113
else:
114+
# remove ' from sheet name
115+
ref_cell = ref_cell.replace("'", "")
87116
refc = ref_cell
88117

89-
log(f" Depends on: {refc}")
118+
log(f" Cell: {refc}")
90119
# add the node
91-
graph.add_node(refc, sheet=sheet_name)
120+
add_node(graph, refc, sheet_name)
92121
# add the edge
93-
graph.add_edge(cell_name, refc)
122+
graph.add_edge(current_cell, refc)
123+
124+
# If a range like A1:B3 is referenced, add the range definition as a node
125+
for rng in range_references:
126+
log(f" Range: {rng}")
127+
128+
if "!" not in rng:
129+
rng = f"{sheet_name}!{rng}"
130+
range_sheet = sheet_name
131+
else:
132+
rng = sanitize_range(rng)
133+
range_sheet = rng.split("!")[0]
134+
135+
add_node(graph, rng, range_sheet)
136+
graph.add_edge(current_cell, rng)
94137

95138
# If a range like A1:B3 is referenced, add the
96139
# edge between the cells within that range and
@@ -100,7 +143,7 @@ def extract_formulas_and_build_dependencies(file_path):
100143
range_ref = f"{sheet_name}!{range_ref}"
101144
range_sheet = sheet_name
102145
else:
103-
range_ref = range_ref
146+
range_ref = sanitize_range(range_ref)
104147
range_sheet = range_ref.split("!")[0]
105148

106149
if "!" not in single_cell:
@@ -111,11 +154,11 @@ def extract_formulas_and_build_dependencies(file_path):
111154
cell_sheet = single_cell.split("!")[0]
112155

113156
# this is the single cell that points to the range it belongs to
114-
graph.add_node(f"{single_cell}", sheet=cell_sheet) # noqa
115-
graph.add_node(f"{range_ref}", sheet=range_sheet)
157+
add_node(graph, single_cell, cell_sheet)
158+
add_node(graph, range_ref, range_sheet)
116159

117160
# Then add the edge between the single cell and the range
118-
graph.add_edge(f"{single_cell}", f"{range_ref}")
161+
graph.add_edge(range_ref, single_cell)
119162
return graph
120163

121164

images/Book1.xlsx.png

-60.9 KB
Loading

0 commit comments

Comments
 (0)