-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsimilarity.py
88 lines (71 loc) · 2.07 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#document similarity
#by Yang Tian and Xi Chen
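# Input: the query's tf-idf vector and the tf-idf vectors of the candidate documents.
# Output: the candidate document indices ranked by similarity to the query
# (most similar first), together with the matching similarity scores.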
import math
def vectorlength(vec):
    """Return the Euclidean (L2) length of ``vec``.

    The squared components are accumulated into a float that starts at
    0.0, so an empty vector yields 0.0.
    """
    squared_total = 0.0
    for component in vec:
        squared_total = squared_total + component * component
    return math.sqrt(squared_total)
def simDistance(w1, w2):
    """Return the cosine similarity between the vectors ``w1`` and ``w2``.

    The dot product is taken over paired components (``zip`` stops at the
    end of the shorter vector) and divided by the product of the two
    vector lengths as computed by ``vectorlength``.

    Returns 0.0 when either vector has zero length, instead of raising
    ZeroDivisionError.
    """
    dot = float(0)
    for wi1, wi2 in zip(w1, w2):
        dot += wi1 * wi2

    denominator = vectorlength(w1) * vectorlength(w2)
    if denominator == 0:
        # A zero-length (all-zero or empty) vector has no direction, so the
        # cosine is undefined; report "no similarity" rather than crashing
        # the whole ranking on a single degenerate vector.
        return 0.0
    return dot / denominator
## output index
def computerSimilarity(query, listOfVectors, candidateIndice):
    """Rank candidate documents by their similarity to ``query``.

    Args:
        query: the query vector; passed as the first argument to
            ``simDistance`` for every document.
        listOfVectors: the document vectors, each compared with ``query``.
        candidateIndice: document identifiers, positionally aligned with
            ``listOfVectors`` (``candidateIndice[i]`` labels
            ``listOfVectors[i]``).

    Returns:
        An empty list when ``listOfVectors`` is empty (a notice is printed
        first). Otherwise a two-element list ``[docIndices, scores]``:
        the entries of ``candidateIndice`` ordered from most to least
        similar, and the similarity scores in the same order.
    """
    if not listOfVectors:
        # Format the query with %s instead of `query + "..."`: the query is
        # a vector, and concatenating a list with a str raises TypeError.
        # The parenthesized form prints identically on Python 2 and 3.
        print("%s did not match any documents." % (query,))
        return []

    scores = [simDistance(query, vector) for vector in listOfVectors]

    # Stable ascending sort followed by a reversal (rather than
    # reverse=True) keeps the established ordering among equal scores.
    order = sorted(range(len(scores)), key=lambda k: scores[k])
    order.reverse()

    docIndices = [candidateIndice[i] for i in order]
    rankedScores = [scores[i] for i in order]
    return [docIndices, rankedScores]
## output value (the code below is disabled: it is wrapped in string literals)
"""
def computerSimilarity(query, listOfVectors):
indexList= []
if not listOfVectors:
print query + "did not match any documents."
return indexList
sortedResult = listOfVectors
if listOfVectors:
for vector in listOfVectors:
value = simDistance(query,vector)
vector = value
indexList = sorted(range(len(sortedResult)), key = lambda k: sortedResult[k])
return indexList
"""
"""
def computerSimilarity(query, listOfVectors):
indexList= []
if not listOfVectors:
print query + "did not match any documents."
return indexList
if listOfVectors:
for vector in listOfVectors:
value = simDistance(query,vector)
indexList.append(value)
indexList.sort()
return indexList
"""
"""
query = [1,2,3,0]
listOfVectors = [[1,0,0,0],[1,3,1,1],[0.5,0,0,0],[2,90,100,2],[1,1,0,0]]
output = computerSimilarity(query, listOfVectors)
print output
"""