-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetect.py
executable file
·84 lines (68 loc) · 2.15 KB
/
detect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Detects a language based on n-gram cosine similarity.
import sys
import re
import math
import unicodedata
import cPickle
# Both the model file (argv[1]) and the tweet text (argv[2]) are required.
# The tweet is read from sys.argv[2] further down, so anything shorter than
# three argv entries must stop here with the usage message.
if len(sys.argv) < 3:
    sys.stderr.write("usage: %s <model file> <tweet>\n" % (sys.argv[0]))
    sys.stderr.write("\n")
    sys.stderr.write(
        "example: %s langid.model \"Go #Giants! Beat the #Tigers\"\n"
        % (sys.argv[0])
    )
    sys.exit(1)

# Length, in characters, of the n-grams counted by ngram().
n = 3
def fix_newlines(s):
    "Turn every line break (CRLF, lone CR or LF) into one visible arrow marker."
    # Order matters: CRLF is folded to LF first so it yields one marker, not two.
    substitutions = (
        (u"\r\n", u"\n"),
        (u"\r", u"\n"),
        (u"\n", u"↵"),
    )
    for old, new in substitutions:
        s = s.replace(old, new)
    return s
def fix_tabs(s):
    "Make tab characters visible by swapping each one for an arrow marker."
    pieces = s.split(u"\t")
    return u"⟼".join(pieces)
def extract_text(s):
    """Strip Twitter markup from a tweet and make its whitespace visible.

    Removes @mentions, #hashtags and http/https URLs, then passes the result
    through fix_newlines() and fix_tabs().

    s -- the tweet text as a unicode string.
    Returns the cleaned unicode string.
    """
    # Same order as the nested calls this replaces: mentions, hashtags, URLs.
    without_mentions = re.sub(u"@\\w+", u"", s)
    without_hashtags = re.sub(u"#\\S+", u"", without_mentions)
    # "https?" covers both schemes; matching only "http:" would leave every
    # https:// link in the text and pollute the n-gram counts.
    without_urls = re.sub(u"https?:\\S+", u"", without_hashtags)
    return fix_tabs(fix_newlines(without_urls))
def normalize(t):
    "Canonicalize text: NFKC Unicode normalization followed by lowercasing."
    composed = unicodedata.normalize("NFKC", t)
    return composed.lower()
def ngram(t, size=None):
    """Count the overlapping character n-grams of a string.

    t -- the text to slice into n-grams.
    size -- n-gram length; when omitted, the module-level n is used.

    Returns a sparse vector: a dict mapping each n-gram found in t to the
    number of times it occurs. Text shorter than size yields an empty dict.
    """
    if size is None:
        size = n
    gram_vector = {}
    # The last window starts at len(t) - size, hence the +1 on the range bound.
    for start in range(len(t) - size + 1):
        gram = t[start:start + size]
        gram_vector[gram] = gram_vector.get(gram, 0) + 1
    return gram_vector
# Helpers for sparse vectors represented by dictionaries. A vector is a
# string -> number dictionary, where the string is the name of the dimension and
# the number is the magnitude along that dimension. For example:
# {"x":4,"y":8}
def smag(sv):
    "Euclidean length of a sparse vector (square root of the sum of squares)."
    sum_of_squares = sum(component ** 2 for component in sv.values())
    return math.sqrt(sum_of_squares)
def sdot(sv1,sv2):
    """Dot product of two sparse vectors.

    Only the dimensions of sv1 are walked, so when one vector is known to
    have fewer dimensions, pass it as the first argument for speed."""
    total = 0
    for dim in sv1:
        # Dimensions missing from sv2 contribute nothing to the product.
        if dim not in sv2:
            continue
        total += sv1[dim] * sv2[dim]
    return total
def scos(sv1,sv2):
    """Cosine similarity between two sparse vectors.

    sv1, sv2 -- sparse vectors (dimension name -> number dictionaries).

    Returns the dot product divided by the product of the magnitudes, or 0.0
    when either vector has zero magnitude (for example an empty vector), for
    which the cosine is undefined.
    """
    norm_product = smag(sv1) * smag(sv2)
    # Guard the division: an empty n-gram vector has magnitude 0.
    if norm_product == 0:
        return 0.0
    return sdot(sv1, sv2) / norm_product
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# the model file must come from a trusted source.
# The with-block closes the file handle as soon as the models are loaded.
with open(sys.argv[1], "rb") as model_file:
    models = cPickle.load(model_file)

# Build the tweet's n-gram vector from the cleaned, normalized text.
tweet = ngram(normalize(extract_text(sys.argv[2].decode("UTF-8"))))

# Pick the language whose model is most similar to the tweet; "n/a" is printed
# when no model scores above zero.
best = 0.0
best_lang = "n/a"
for lang, model in models.items():
    sim = scos(tweet, model)
    if sim > best:
        best = sim
        best_lang = lang
print(best_lang)