Skip to content

Commit c178e34

Browse files
committed
Fix for Issue 15; reverted root concept to capture matches like a-sa-ra2,sa-ra2; updated usage with examples
1 parent 3f8d48b commit c178e34

File tree

1 file changed

+53
-30
lines changed

1 file changed

+53
-30
lines changed

inflection_finder.py

+53-30
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
#!/usr/bin/python
22
# -*- coding: utf-8 -*-
3-
import sys
4-
import getopt
5-
import logging
6-
import unicodecsv
7-
import jsondoa
3+
import getopt, logging, sys
4+
import jsondoa, unicodecsv
5+
import itertools
86
from collections import defaultdict
97

108
"""
119
An early step in any decipherment process: Is the language inflected?
10+
1211
"walked" and "walking" are inflections of "walk", as is "dogs" of "dog".
13-
Won't catch irregular inflections like tener 'to have' inflected as tiene '(s)he has' and tengo 'i have' in Spanish, but is an organized step in approaching an undeciphered writing system
12+
13+
This won't catch irregular inflections like tener 'to have' inflected as tiene '(s)he has' and
14+
tengo 'i have' in Spanish, but this is an organized step in approaching an undeciphered
15+
writing system.
1416
"""
1517

16-
# Phase I - kiminoa@gmail.com
18+
# Phase I - kiminoa@gmail.com / K.A. Raymoure
1719

1820
def find_intersection(lista, listb):
1921
"""
@@ -57,9 +59,10 @@ def create_inflection_families(candidates):
5759

5860
# avoid processing duplicates like [-ed, -ing] and [-ing, -ed]
5961
if x in inflection_family_candidates: continue
60-
62+
63+
# skip this pair when the two candidate lists share no intersection
6164
intersect_xy = find_intersection(candidates[x], candidates[y])
62-
if intersect_xy == None: continue
65+
if not intersect_xy: continue
6366

6467
inflection_family_key = composite_key(x, y)
6568
for i in list(intersect_xy):
@@ -71,7 +74,8 @@ def create_inflection_families(candidates):
7174

7275
def add_candidate_to_file(root, inflections):
7376
"""
74-
outputs to an interim file: each line is a key-value pair [ { 'root' : ['inflection', 'candidates'] } ]
77+
outputs to an interim file: each line is a key-value pair [ { 'root' : ['inflection',
78+
'candidates'] } ]
7579
"""
7680
root = root.encode('utf-8')
7781
candidate_entry = [ {root : inflections } ]
@@ -143,10 +147,14 @@ def process_inflections():
143147

144148
def get_clusters(raw_file):
145149
"""
146-
[unimplemented] inputs a file of unique morphemes, outputs clusters of potential inflections
150+
[unimplemented] inputs a file of unique morphemes, outputs clusters of potential
151+
inflections
147152
148-
Phase I: let OpenRefine do the heavy lifting and create the file of clusters (process_clusters) from raw data
149-
Phase II: use Python libraries to do the clustering (get_clusters) - move this to its own file for Phase II
153+
Phase I: let OpenRefine do the heavy lifting and create the file of clusters
154+
(process_clusters) from raw data
155+
156+
Phase II: use Python libraries to do the clustering (get_clusters) - move this to its
157+
own file for Phase II
150158
"""
151159
pass
152160

@@ -187,7 +195,8 @@ def longest_substring_syllabary(cluster):
187195

188196
def longest_substring(cluster):
189197
"""
190-
inputs a list of potential inflections, returns the longest common substring shared by all
198+
inputs a list of potential inflections, returns the longest common substring shared
199+
by all
191200
"""
192201
substring = ''
193202
# use cluster[0] to find the longest substring in all cluster elements [1] - [n]
@@ -224,7 +233,8 @@ def strip_delimiter(delimited):
224233

225234
def get_inflections(substring, cluster):
226235
"""
227-
inputs the longest common substring for a cluster and returns the list of what *isn't* common in the cluster
236+
inputs the longest common substring for a cluster and returns the list of what *isn't*
237+
common in the cluster
228238
"""
229239
sublen = len(substring)
230240
inflections = []
@@ -234,7 +244,7 @@ def get_inflections(substring, cluster):
234244
inflection = i.replace(i[startdel:startdel+sublen], '')
235245
# Special case: if one of the elements in the cluster *is* the longest common substring
236246
if inflection == '':
237-
inflection = substring + "-root"
247+
inflection = "root"
238248
LOG.debug("get_inflections: String %s after cutting %s: %s", i, substring, inflection)
239249
inflections.append(inflection.encode('utf-8')) # utf-8 friendly
240250
return inflections
@@ -243,8 +253,10 @@ def inflection_clusters(*args):
243253
"""
244254
inputs a list of potential inflections, outputs a list of potential cases
245255
246-
For example, if we receive the list (ko-no-so, ko-no-si-jo, ko-no-si-ja, ko-no-so-de), we will receive in response (o, i-jo, i-ja, o-de) as ko-no-s is ubiquitous.
247-
How do we handle edge cases where the common ground is a complete set, i.e. (ko-no-so, ko-no-so-de)? should have a way to return root + -de instead of just -de
256+
For example, if we receive the list (ko-no-so, ko-no-si-jo, ko-no-si-ja, ko-no-so-de),
257+
we will receive in response (o, i-jo, i-ja, o-de) as ko-no-s is ubiquitous.
258+
How do we handle edge cases where the common ground is a complete set, e.g. (ko-no-so,
259+
ko-no-so-de)? We should have a way to return root + -de instead of just -de.
248260
"""
249261
if SYLLABARY:
250262
common_substring = longest_substring_syllabary(*args)
@@ -267,17 +279,25 @@ def inflection_clusters(*args):
267279

268280
def process_clusters(cluster_file):
269281
"""
270-
inputs a file with a list of potential inflections in a cluster on each line, processes one line at a time
282+
inputs a file with a list of potential inflections in a cluster on each line,
283+
processes one line at a time
284+
then also processes every pairwise combination of the cluster's members as its own sub-cluster
271285
"""
272286
cfile = open(cluster_file)
273-
cluster_list = []
287+
274288
for line in unicodecsv.reader(cfile, encoding="utf-8"):
275-
for i in line:
276-
LOG.debug(u"process_clusters: From CSV: %s", i)
277-
cluster_list.append(i.strip())
278-
if cluster_list:
279-
inflection_clusters(cluster_list) # discover inflection candidates for each morpheme list
280-
del cluster_list[:] # reinitialize for next cluster
289+
LOG.debug(u"process_clusters: From CSV: %s", repr(line))
290+
291+
# Adds a complete cluster, like "a-sa-ra2,sa-ra2,sa-ra-ra"
292+
inflection_clusters(line) # discover inflection candidates for each morpheme list
293+
294+
# Adds sub-cluster sign group pairs for completeness, like "a-sa-ra2,sa-ra2"
295+
sub_cluster_combos = itertools.combinations(line, 2)
296+
297+
for combo in sub_cluster_combos:
298+
inflection_clusters(combo)
299+
LOG.debug(u"Adding cluster pair %s" % repr(combo))
300+
281301
cfile.close()
282302

283303
JSON_DOA.store() # save interim data to JSON
@@ -291,8 +311,11 @@ def usage():
291311
-l, --loglevel= <loglevel:INFO|DEBUG|WARNING|ERROR|CRITICAL>
292312
-s, --syllabary\t\t\tflags input language as a syllabary
293313
-d, --delimiter= <delimiter>\tsyllabary delimiter
294-
\nDelimiter is used for non-alphabetic representations, like the hypen
295-
separating syllables in alphasyllabaries."""
314+
\nDelimiter is used for non-alphabetic representations, like the hyphen
315+
separating syllables in alphasyllabaries.
316+
\nExamples:
317+
inflection_finder.py -s -d \"-\" -f alphasyllabary.csv
318+
inflection_finder.py -f alphabet.csv"""
296319

297320
if __name__ == "__main__":
298321
"""
@@ -319,7 +342,7 @@ def usage():
319342
loglevel = arg
320343
elif opt in ("-d", "--delimiter"):
321344
DELIMITER = arg
322-
elif opt in ("=s", "--syllabary"):
345+
elif opt in ("-s", "--syllabary"):
323346
SYLLABARY = True
324347
else:
325348
print "Unrecognized option."
@@ -345,4 +368,4 @@ def usage():
345368
# Let's do it.
346369
print "\nFiles with candidate clustered morphemes should be CSV (utf-8 is Ok)."
347370
process_clusters(clustered_file)
348-
process_inflections()
371+
process_inflections()

0 commit comments

Comments
 (0)