pdgmDict-lexemes.py
#!/usr/local/bin/python3
'''
Rebuilds the 'lexemes' section of a language's paradigm file
('LANG-pdgms.json' in aama-data). For each language it creates two
files in pvlists/:
1) 'LANG-lex1.json': a lexeme template keyed by the labels listed
   under schemata.lexeme, with placeholder 'pos', 'lemma', and
   'gloss' values to be filled in;
2) 'LANG-lex2.json': the current 'lexemes' section, sorted by label.
The sorted 'lexemes' section is then spliced back into
'LANG-pdgms.json' (the original is backed up as 'LANG-pdgms-old.json').
'''
import json
import re
from shutil import copy
import sys
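
# A minimal usage sketch (assumptions: run from the directory that
# contains pvlists/, with the aama-data repo checked out one level up,
# and the language name passed as the sole command-line argument):
#   python3 pdgmDict-lexemes.py oromo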

# Helper to pull the 'label' field from a lexeme record (currently
# defined but not called anywhere in this script).
def value_getter(lexemes):
    return lexemes['label']

# For CL argument
language = sys.argv[1]
# For single lang:
#language = input('Type language name: ')
languagenames = (language, )
# For corpus:
#languagenames = ('aari', 'afar', 'alaaba', 'alagwa', 'akkadian-ob', 'arabic', 'arbore', 'awngi', 'bayso', 'beja-alm', 'beja-hud', 'beja-rei', 'beja-rop', 'beja-van', 'beja-wed', 'berber-ghadames', 'bilin', 'boni-jara', 'boni-kijee-bala', 'boni-kilii', 'burji', 'burunge', 'coptic-sahidic', 'dahalo', 'dhaasanac', 'dizi', 'egyptian-middle', 'elmolo', 'gawwada', 'gedeo', 'geez', 'hadiyya', 'hausa', 'hdi', 'hebrew', 'iraqw', 'kambaata', 'kemant', 'khamtanga', 'koorete', 'maale', 'mubi', 'oromo', 'rendille', 'saho', 'shinassha', 'sidaama', 'somali', 'syriac', 'tsamakko', 'wolaytta', 'yaaku', 'yemsa')
# NB: outfile 3 & 4, and lfilename2 are for definitive indices
# in pvlists/, outfile 1&2 and lfilename are for preliminary

for lang in languagenames:
    print(str('LANG: ' + lang))
    lfile = str('../aama-data/data/' + lang + '/' + lang + '-pdgms.json')
    lfilebck = str('../aama-data/data/' + lang + '/' + lang + '-pdgms-old.json')
    copy(lfile, lfilebck)
    with open(lfile) as f:
        jdata = json.load(f)
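    # Structure of LANG-pdgms.json assumed by the accesses below
    # (an illustrative sketch, not the full schema):
    #   { "schemata": { "lexeme": [ <lexeme labels> ], ... },
    #     "lexemes": { <label>: {...}, ... },
    #     "pdgmPropOrder": [ ... ], ... }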
    # edit and turn one of the 'lex' files into 'lexemes'
    outfile1 = str('pvlists/' + lang + '-lex1.json')
    outfile2 = str('pvlists/' + lang + '-lex2.json')
    # list of lexeme labels
    pdgmLexemes = jdata['schemata']['lexeme']
    # output lexDict1: what needs to be in the lexeme file
    lexDict1 = {}
    for lex in pdgmLexemes:
        lexEntry = {}
        lexEntry["pos"] = "Verb"
        lexEntry["lemma"] = "[x]"
        lexEntry["gloss"] = "[y]"
        lexDict1[lex] = lexEntry
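    # Illustrative shape of one template entry ("<lexeme-label>" stands
    # in for an actual label from schemata.lexeme):
    #   "<lexeme-label>": {"pos": "Verb", "lemma": "[x]", "gloss": "[y]"}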
    print("pdgmLexemes:")
    print(lexDict1)
    # actual lexemes collection
    lexSection = jdata['lexemes']
    lexDict2 = {}
    lexSorted = sorted(lexSection)
    for lex in lexSorted:
        lexDict2[lex] = lexSection[lex]
    print("lexemes:")
    #print(lexSection)
    print("lexdict2")
    #print(lexDict2)
    # lexicalMaterial = lexDict | lexSection
    # Keep separate files of lex template (lex1) and current 'lexemes'
    with open(outfile1, "w") as f:
        json.dump(lexDict1, f, ensure_ascii=False, indent=2)
    with open(outfile2, "w") as f:
        json.dump(lexDict2, f, ensure_ascii=False, indent=2)
    # Replace 'lexemes' section in current updated lang file
    # First open the newly made lexemes file
    with open(outfile2) as f:
        ftext1 = f.read()
    # Prepare replacement text
    ftext1a = str('lexemes": ' + ftext1 + ',\n "pdgmPropOrder": ')
    print("ftext1a:")
    # print(ftext1a)
    # Open 'matrix' text
    with open(lfile) as f:
        ftext = f.read()
    ftext2 = ftext.replace('\n', '\\n')
    # print("ftext2")
    # print(ftext2)
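    # Note on the technique below: the real newlines were escaped above
    # so that the greedy '.*' in re.sub can span the whole multi-line
    # 'lexemes' block in one pass (re.DOTALL would be an alternative);
    # the escaped newlines are restored afterwards.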
    # Replace
    ftextnew1 = re.sub('lexemes":.*"pdgmPropOrder": ', ftext1a, ftext2)
    ftextnew2 = ftextnew1.replace('\\n', '\n')
    print('ftextnew2')
    #print(ftextnew2)
    with open(lfile, "w") as f:
        f.write(ftextnew2)