compute_stats.py
"""Module for computing interesting statistics about this phenomenon."""
from audioop import reverse
from collections import defaultdict
import json
from typing import DefaultDict
from distance_to_philosophy import get_reverse_edges
from get_to_philosophy import get_edges
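
# NOTE (assumption, inferred from how the values are used below): get_edges()
# appears to map each article title to the single article it links to first
# ("" when the article has no links), and get_reverse_edges() appears to map
# each title to the collection of titles that link to it.
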
def compute_heat_graph() -> DefaultDict[str, int]:
"""Compute a 'heat graph' of the articles and edges stored in the cache.
In the context of this project, we will define a few concepts to make
syntax and construction easier. Each article has a number of articles
that link to them. Let us define a 'heat graph' as the sum of all
articles that eventually link to it. That is, following a path of
articles, each article is numerically valued based on how many articles
will eventually lead to that article.
A cycle is a series of two or more articles that reference each other
continuously. The only two ways for a chain of articles to terminate is
when it either reaches a cycle, or reaches an article without any links.
Given the nature of Wikipedia, the former is far more common, with the
latter only typically occuring for chains beginning at linkless articles
in the first place.
Our methodology is a little complicated, but is enough to come up with a
fairly accurate, and representative number for each article. To keep an
account of each article's running sum, we will use a dictionary that
links the article's title to how many articles eventually link to it. It
will also help to think of this data set as a set of disconnected graphs,
where nodes are articles, and edges are the directed relation by which
one article links to another.
First, we find every cycle in the data set. That is, create a set of
every article that is part of a cycle. We will use each of these nodes as
'terminating notes.' That is, in our process, if we come across one of
these cycles, we know this is where we will stop our current path. In
treating each cycle as one large node, we can think of this problem less
as a graphing problem, and more as a tree problem, where each cycle is the
root of a new tree.
Each of those 'terminating nodes' will be added to a queue, and they will
each be 'visited' to check their value. A 'visit' will be defined here as
the process of going to a node, getting the value of its children
(computing that first if it has not yet been visited), and setting its
value to the sum of each of its children, plus the number of children
themselves. This will be a recursive process.
Once all 'non-terminating' nodes have been valued, we tackle the nodes in
cycles. Each node in the cycle will be valued similarly as previous nodes,
and then it will traverse down the cycle to all other nodes, adding its
value to theirs.
Lastly, each of these cycle nodes have their values incremented by one,
to account for the fact that they each link to each other. This gives us a
workable, meaningful quantification for how many articles will eventually
reach a given article.
"""
    hmap = defaultdict(int)
    # articles = get_articles()  # we don't really *need* articles, I think
    edges = get_edges()
    reverse_edges = get_reverse_edges(edges)

    # Get terminating nodes
    print("Getting terminating nodes (this may take a while...)")
    terminating_nodes = set()
    TERMINATING_NODES_STORAGE_URL = "cache/terminating_nodes.json"
    seen_nodes = set()
    choice = input("Would you like to use the cached value? (y/n): ").lower()
    if choice == 'n':
        print("Computing manually...")
        count = 0
        for article in edges.keys():
            count += 1
            if count % 100000 == 0: print(f"Checked {count:,d} articles so far...")
            if article in seen_nodes: continue
            # Follow the chain of links starting from this article
            path = [article]
            nodes_seen_this_batch = set()
            while len(path) == len(set(path)):  # keep going until the path repeats
                next_node = edges[path[-1]]
                if next_node == "": break  # no links
                path.append(next_node)
                if next_node in nodes_seen_this_batch: break
                nodes_seen_this_batch.add(next_node)
            next_node = path[-1]
            # Check for terminating nodes
            while next_node not in terminating_nodes and next_node not in seen_nodes and len(path) > 1:
                terminating_nodes.add(path.pop())
                next_node = path[-1]
            # Add to the nodes we've seen
            for node in nodes_seen_this_batch:
                seen_nodes.add(node)
        print("Done!")
        print("Writing to cache...")
        with open(TERMINATING_NODES_STORAGE_URL, "w+") as outfile:
            json.dump({'nodes': list(terminating_nodes)}, outfile, indent=4)
    else:
        print("Using cache...")
        with open(TERMINATING_NODES_STORAGE_URL, 'r') as infile:
            terminating_nodes = set(json.load(infile)['nodes'])
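
    # Continuing the hypothetical toy graph above (A -> B -> C -> B), this pass
    # would mark B and C (the cycle members) as the terminating nodes.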

    # Set up the "visit" process
    def visit(node: str):
        """The process for visiting a node (doesn't work for terminating nodes)."""
        if node in terminating_nodes: return  # don't count looping nodes (yet)
        for child in reverse_edges[node]:
            visit(child)  # visit the child (recursive)
            hmap[node] += hmap[child] + 1  # value = value of child + the child itself
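    # Hypothetical example: if D and E both link to F, F is not in a cycle, and
    # nothing links to D or E (so their reverse_edges entries are empty), then
    # visit(F) sets hmap[F] to 2. Note that visit() recurses once per ancestor,
    # so very long chains could exceed Python's default recursion limit
    # (adjustable via sys.setrecursionlimit).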

    # Do the visiting
    print("Starting to visit nodes...")
    count = 0
    for node in terminating_nodes:
        count += 1
        if count % 1000 == 0: print(f"Visited {count:,d}/{len(terminating_nodes):,d} terminating nodes so far...")
        for child_node in reverse_edges[node]:
            visit(child_node)
    print("Done!")

    # Visit nodes in cycles
    for node in terminating_nodes:
        # Get the value for the node itself
        additional_value = 0  # prevent double-counting
        for child_node in reverse_edges[node]:
            if child_node in terminating_nodes: continue  # don't count terminating nodes (yet)
            additional_value += hmap[child_node] + 1  # value = value of child + the child itself
        hmap[node] += additional_value
        # Share this value with the other nodes in the cycle
        if edges[node] == "": continue  # if the node loops with itself (or has no links), do nothing
        next_node = edges[node]
        while next_node != node and next_node != "":  # go around the cycle, or until you hit a node with no links
            hmap[next_node] += additional_value
            next_node = edges[next_node]

    # Add 1 to each, since the nodes in a cycle each link to one another
    for node in terminating_nodes:
        hmap[node] += 1
    return hmap

if __name__ == '__main__':
    hmap = compute_heat_graph()

    # Reverse the heat map (i.e. index by heat value rather than by article title)
    reverse_hmap = defaultdict(set)
    for article, heat in hmap.items():
        reverse_hmap[heat].add(article)

    # Print the top 100 heat values (and the articles that share each of them)
    heat_values = sorted(reverse_hmap.keys(), reverse=True)
    for i in range(min(100, len(heat_values))):
        print(f"{i+1}: {reverse_hmap[heat_values[i]]} -> {heat_values[i]}")