Skip to content

Commit 0b49304

Browse files
committed
Modularity + 3 visualization themes based on graph size
1 parent 0152f97 commit 0b49304

7 files changed

+292
-81
lines changed

Book1.xlsx

23 Bytes
Binary file not shown.

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ pip install -r requirements.txt
3131
## Usage
3232

3333
```bash
34-
python graphbuilder.py <path_to_excel_file> [--verbose] [--no-visualize] [--keep-direction]
34+
python graphbuilder.py <path_to_excel_file> [--verbose] [--no-visualize] [--keep-direction] [--open-image]
3535
```
3636

3737
Depending on the size of the spreadsheet, you might want to adjust the plot configuration in the code to make the graph more readable (remove labels, decrease widths and sizes, etc.).
@@ -44,6 +44,8 @@ Depending on the size of the spreadsheet you might want to adjust the plot confi
4444

4545
`--keep-direction` will keep the direction of the graph as it is in the excel file, otherwise it will be simplified to an undirected graph (slower)
4646

47+
`--open-image` will open the generated image in the default image viewer (only on Windows)
48+
4749
## Sample output
4850

4951
The following is the output of running the script on the provided `Book1.xlsx` file.

excel_parser.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from openpyxl.utils import get_column_letter, range_boundaries
2+
import re
3+
4+
5+
# Regex to detect cell references like A1, B2, or ranges like A1:B2
6+
CELL_REF_REGEX = r"('?[A-Za-z0-9_\-\[\] ]+'?![A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)|([A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)" # noqa
7+
8+
9+
def expand_range(range_ref):
    """
    Turn a range reference into the list of cell references it covers.

    For example 'A1:A3' becomes ['A1', 'A2', 'A3']. When the reference is
    prefixed with a sheet name ('Sheet1!A1:A3'), every returned cell keeps
    that 'Sheet1!' prefix. Cells are listed row by row, and within each row
    column by column.
    """
    # Detach an optional "<sheet>!" prefix; it is re-attached to each cell below
    sheet_name = None
    if "!" in range_ref:
        sheet_name, range_ref = range_ref.split("!")

    min_col, min_row, max_col, max_row = range_boundaries(range_ref)

    # An empty or missing sheet name yields unprefixed cell references
    prefix = f"{sheet_name}!" if sheet_name else ""

    return [
        f"{prefix}{get_column_letter(col)}{row}"
        for row in range(min_row, max_row + 1)
        for col in range(min_col, max_col + 1)
    ]
33+
34+
35+
def extract_references(formula):
    """
    Find every cell and range reference mentioned in a formula.

    Returns a tuple ``(expanded_references, dependencies)``:

    * ``expanded_references`` lists the referenced cells, with each range
      replaced by the individual cells it covers.
    * ``dependencies`` maps each cell that came from a range to the range
      reference it was expanded from.
    """
    # Absolute markers ($A$1) do not change which cell is referenced
    cleaned = formula.replace("$", "")

    # Group 0 holds a sheet-qualified match, group 2 an unqualified one
    found = [
        (groups[0] or groups[2]).strip()
        for groups in re.findall(CELL_REF_REGEX, cleaned)
    ]

    expanded_references = []
    dependencies = {}

    for reference in found:
        if ":" not in reference:
            # Plain single-cell reference
            expanded_references.append(reference)
            continue

        # Range such as A1:A3: record its cells and remember their origin
        members = expand_range(reference)
        expanded_references += members
        dependencies.update(dict.fromkeys(members, reference))

    return expanded_references, dependencies

graph_visualizer.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import matplotlib.cm as cm
2+
import matplotlib.patches as mpatches
3+
import networkx as nx
4+
import matplotlib.pyplot as plt
5+
import sys
6+
7+
small_graph_settings = {
8+
"node_size": 50,
9+
"edge_color": "black",
10+
"with_labels": True,
11+
"font_size": 7,
12+
"linewidths": 0.8,
13+
"alpha": 0.8,
14+
"width": 0.2,
15+
}
16+
17+
medium_graph_settings = {
18+
"node_size": 30,
19+
"edge_color": "gray",
20+
"with_labels": True,
21+
"font_size": 10,
22+
"linewidths": 0.1,
23+
"alpha": 0.4,
24+
"width": 0.2,
25+
}
26+
27+
large_graph_settings = {
28+
"node_size": 5,
29+
"edge_color": "gray",
30+
"with_labels": False,
31+
"font_size": 12,
32+
"linewidths": 0.5,
33+
"alpha": 0.2,
34+
"width": 0.2,
35+
}
36+
37+
38+
def get_graph_default_settings(graph_size):
    """
    Pick the drawing preset and figure size that match the number of nodes.

    Returns a tuple ``(settings, fig_size)`` where ``settings`` is one of the
    module-level preset dictionaries and ``fig_size`` is the side length
    passed to the figure.
    """
    # (exclusive upper bound on node count, preset, figure side length)
    tiers = (
        (200, small_graph_settings, 10),
        (500, medium_graph_settings, 20),
    )

    for upper_bound, preset, side in tiers:
        if graph_size < upper_bound:
            return preset, side

    # Anything at or above the last bound uses the large-graph preset
    return large_graph_settings, 20
54+
55+
56+
# Function to get colors and generate legend for sheets
57+
def get_node_colors_and_legend(graph):
    """
    Compute a per-node color list and matching legend patches, one color per sheet.

    Each node's sheet is read from its ``sheet`` attribute; nodes without that
    attribute are treated as belonging to "Sheet1".

    Returns a tuple ``(node_colors, legend_patches)``:

    * ``node_colors`` has one color per node, in ``graph.nodes`` order.
    * ``legend_patches`` has one ``Patch`` per sheet, labelled with its name.
    """
    default_sheet = "Sheet1"
    node_sheets = [
        data.get("sheet", default_sheet) for _, data in graph.nodes(data=True)
    ]

    # Sort the distinct sheets so the sheet -> color assignment is the same on
    # every run; iterating a bare set of strings gives an order that can change
    # between interpreter runs.
    sheets = sorted(set(node_sheets), key=str)

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap has been removed
    # from recent Matplotlib releases.
    color_map = plt.get_cmap("tab20c", len(sheets))

    # Map sheet names to colors
    sheet_to_color = {sheet: color_map(i) for i, sheet in enumerate(sheets)}

    # Assign colors to nodes based on their sheet
    node_colors = [sheet_to_color[sheet] for sheet in node_sheets]

    # Create patches for the legend
    legend_patches = [
        mpatches.Patch(color=color, label=sheet)
        for sheet, color in sheet_to_color.items()
    ]

    return node_colors, legend_patches
77+
78+
79+
def visualize_dependency_graph(graph, file_path):
    """
    Render the dependency graph using matplotlib and networkx and save it as a PNG.

    The image is written to ``images/<file_path>.png``; the target directory is
    created when it does not exist yet. Two command-line flags are read from
    ``sys.argv``:

    * ``--keep-direction``: draw the graph as given; without it the graph is
      converted to an undirected graph first.
    * ``--open-image``: on Windows, open the saved image with the application
      associated with it.
    """
    import os

    if "--keep-direction" not in sys.argv:
        # Convert the graph to an undirected graph
        graph = graph.to_undirected()

    # Set the default settings for the graph visualization based on the number of nodes
    graph_settings, fig_size = get_graph_default_settings(len(graph.nodes))

    plt.figure(figsize=(fig_size, fig_size))
    pos = nx.spring_layout(graph)  # layout for nodes

    # Colors and legend entries are derived per sheet; the helper tolerates
    # nodes that carry no "sheet" attribute.
    node_colors, legend_patches = get_node_colors_and_legend(graph)

    nx.draw(
        graph,
        pos,
        node_color=node_colors,
        **graph_settings,
    )

    plt.legend(handles=legend_patches, title="Sheets", loc="upper left")

    filename = f"images/{file_path}.png"
    # savefig does not create missing directories, so make sure the target exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    plt.savefig(filename)
    print(f"Graph visualization saved to {filename}")

    # open the image file in windows
    if sys.platform == "win32" and "--open-image" in sys.argv:
        # os.startfile avoids building a shell command from the file name, so
        # paths containing spaces or shell metacharacters are handled safely.
        os.startfile(os.path.abspath(filename))

graphbuilder.py

+41-62
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,13 @@
22
This script extracts formulas from an Excel file and builds a dependency graph.
33
"""
44

5-
from collections import Counter
6-
import re
75
from openpyxl import load_workbook
6+
from collections import Counter
87
import networkx as nx
9-
import matplotlib.pyplot as plt
8+
import re
109
import sys
11-
12-
# Regex to detect cell references like A1, B2, or ranges like A1:B2
13-
CELL_REF_REGEX = r"('?[A-Za-z0-9_\-\[\] ]+'?![A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)|([A-Z]{1,3}[0-9]+(:[A-Z]{1,3}[0-9]+)?)" # noqa
10+
from graph_visualizer import visualize_dependency_graph
11+
from excel_parser import extract_references
1412

1513
# dictionary that stores the unique functions used in the formulas
1614
# the key will be the function name and the value will be the number of times it was used
@@ -28,10 +26,14 @@ def log(msg):
2826

2927

3028
def stat_functions(cellvalue):
29+
"""
30+
Extract the functions used in the formula and store them in a dictionary.
31+
This will be used to print the most used functions in the formulas.
32+
"""
33+
3134
# functions used in the formula
3235
cellfuncs = re.findall(r"[A-Z]+\(", cellvalue)
3336
log(f" Functions used: {functions_dict}")
34-
# add the functions to the dictionary
3537
for function in cellfuncs:
3638
# remove the "(" from the function name
3739
function = function[:-1]
@@ -63,28 +65,50 @@ def extract_formulas_and_build_dependencies(file_path):
6365
if isinstance(cell.value, str) and cell.value.startswith("="):
6466
stat_functions(cell.value)
6567

66-
# The formula is found in this cell
6768
cell_name = f"{sheet_name}!{cell.coordinate}"
6869
log(f"Formula in {cell_name}: {cell.value}")
6970

70-
# Extract all referenced cells from the formula
71-
referenced_cells = extract_references(cell.value)
72-
refs = []
71+
graph.add_node(cell_name, sheet=sheet_name)
72+
73+
# Extract all referenced cells and ranges from the formula
74+
referenced_cells, range_dependencies = extract_references(
75+
cell.value
76+
)
7377

7478
# Add the cell and its dependencies to the graph
7579
for ref_cell in referenced_cells:
7680
if "!" not in ref_cell:
77-
# No sheet specified in the assume current sheet
81+
# No sheet specified, assume current sheet
7882
refc = f"{sheet_name}!{ref_cell}"
7983
else:
8084
refc = ref_cell
8185

82-
# Add node to refs if not already in refs
83-
if refc not in refs:
84-
log(f" Depends on: {refc}")
85-
refs.append(refc)
86-
graph.add_edge(cell_name, refc)
86+
log(f" Depends on: {refc}")
87+
graph.add_node(refc, sheet=sheet_name)
88+
graph.add_edge(cell_name, refc)
89+
90+
# Add dependencies for ranges
91+
for single_cell, range_ref in range_dependencies.items():
92+
if "!" not in range_ref:
93+
range_ref = f"{sheet_name}!{range_ref}"
94+
range_sheet = sheet_name
95+
else:
96+
range_ref = range_ref
97+
range_sheet = range_ref.split("!")[0]
98+
99+
if "!" not in single_cell:
100+
single_cell = f"{sheet_name}!{single_cell}"
101+
cell_sheet = sheet_name
102+
else:
103+
single_cell = single_cell
104+
cell_sheet = single_cell.split("!")[0]
105+
106+
# this is the single cell that points to the range it belongs to
107+
graph.add_node(f"{single_cell}", sheet=cell_sheet) # noqa
108+
graph.add_node(f"{range_ref}", sheet=range_sheet)
87109

110+
# Then add the edge between the single cell and the range
111+
graph.add_edge(f"{single_cell}", f"{range_ref}")
88112
return graph
89113

90114

@@ -127,51 +151,6 @@ def print_summary(graph, functionsdict):
127151
print(f"{function.ljust(strpadsize, ' ')}{str(count).rjust(numpadsize, ' ')}")
128152

129153

130-
def extract_references(formula):
131-
"""
132-
Extract all referenced cells from a formula using regular expressions.
133-
This returns a list of cells that are mentioned directly (e.g., A1, B2),
134-
but doesn't handle ranges or external sheets' references.
135-
"""
136-
formula = formula.replace("$", "")
137-
matches = re.findall(CELL_REF_REGEX, formula)
138-
references = [match[0] if match[0] else match[2] for match in matches]
139-
140-
# trim the extracted references
141-
references = [ref.strip() for ref in references]
142-
143-
return references
144-
145-
146-
def visualize_dependency_graph(graph, file_path):
147-
"""
148-
Render the dependency graph using matplotlib and networkx.
149-
"""
150-
151-
if "--keep-direction" not in sys.argv:
152-
# Convert the graph to an undirected graph
153-
graph = graph.to_undirected()
154-
155-
pos = nx.spring_layout(graph) # layout for nodes
156-
plt.figure(figsize=(10, 10))
157-
nx.draw(
158-
graph,
159-
pos,
160-
with_labels=True,
161-
node_color="black",
162-
edge_color="gray",
163-
linewidths=3.5,
164-
alpha=0.8,
165-
width=1,
166-
# font_weight="bold",
167-
node_size=20,
168-
)
169-
170-
filename = f"images/{file_path}.png"
171-
plt.savefig(filename)
172-
print(f"Graph visualization saved to {filename}")
173-
174-
175154
if __name__ == "__main__":
176155
path_to_excel = "Book1.xlsx"
177156

images/Book1.xlsx.png

1.06 KB
Loading

0 commit comments

Comments
 (0)