Skip to content

Commit b043ca7

Browse files
authored
Mangling: added support for pluralized initialisms (#107)
Pluralized initialisms are of the simplest form, e.g. ID -> IDs. Initialisms that end with an S (such as DNS) or that conflict with another initialism that ends with an S (such as HTTP) are invariant and do not support a pluralized form.

* fixes #46

Other changes that stem from newly supported initialisms that are not fully upper-cased:

* replaced IPV4 and IPV6 by IPv4 and IPv6
* ToGoName no longer forces names like "IPv4Etc" into "IPV4Etc", but preserves the casing of IPv4.
* if it happens (by adding non-default initialisms) that 2 initialisms differ only in case, the mixed-case one will be favored over the fully capitalized one.

Signed-off-by: Frederic BIDON <fredbi@yahoo.com>
1 parent d627b13 commit b043ca7

File tree

5 files changed

+380
-91
lines changed

5 files changed

+380
-91
lines changed

initialism_index.go

+79-16
Original file line numberDiff line numberDiff line change
@@ -30,43 +30,55 @@ var (
3030
// a copy of initialisms pre-baked as []rune
3131
initialismsRunes [][]rune
3232
initialismsUpperCased [][]rune
33+
initialismsPluralForm []pluralForm // pre-baked indexed support for pluralization
3334

3435
isInitialism func(string) bool
3536

3637
maxAllocMatches int
3738
)
3839

3940
func init() {
40-
// Taken from https://github.com/golang/lint/blob/3390df4df2787994aea98de825b964ac7944b817/lint.go#L732-L769
41+
// List of initialisms taken from https://github.com/golang/lint/blob/3390df4df2787994aea98de825b964ac7944b817/lint.go#L732-L769
42+
//
43+
// Now superseded by: https://github.com/mgechev/revive/blob/master/lint/name.go#L93
44+
//
45+
// Notice that initialisms are not necessarily uppercased.
46+
// In particular, we may find plural forms with mixed case like "IDs" or legit values like "IPv4" or "IPv6".
47+
//
48+
// At this moment, we don't support pluralization of terms that end with an 's' (or 'S').
49+
// We don't want to support pluralization of terms which would otherwise conflict with another one,
50+
// like "HTTPs" vs "HTTPS". All these should be considered invariant. Hence: "Https" matches "HTTPS" and
51+
// "HTTPSS" is "HTTPS" followed by "S".
4152
configuredInitialisms := map[string]bool{
53+
// initialism: true|false = accept a pluralized form 'Xs' - false means invariant plural
4254
"ACL": true,
4355
"API": true,
4456
"ASCII": true,
4557
"CPU": true,
46-
"CSS": true,
47-
"DNS": true,
58+
"CSS": false,
59+
"DNS": false,
4860
"EOF": true,
4961
"GUID": true,
5062
"HTML": true,
51-
"HTTPS": true,
52-
"HTTP": true,
63+
"HTTPS": false,
64+
"HTTP": false,
5365
"ID": true,
5466
"IP": true,
55-
"IPv4": true,
56-
"IPv6": true,
67+
"IPv4": true, // prefer the mixed case outcome IPv4 over the capitalized IPV4
68+
"IPv6": true, // prefer the mixed case outcome
5769
"JSON": true,
5870
"LHS": true,
59-
"OAI": true,
60-
"QPS": true,
71+
"OAI": true, // not in the linter's list, but added for the openapi context
72+
"QPS": false,
6173
"RAM": true,
62-
"RHS": true,
74+
"RHS": false,
6375
"RPC": true,
6476
"SLA": true,
6577
"SMTP": true,
6678
"SQL": true,
6779
"SSH": true,
6880
"TCP": true,
69-
"TLS": true,
81+
"TLS": false,
7082
"TTL": true,
7183
"UDP": true,
7284
"UI": true,
@@ -79,7 +91,7 @@ func init() {
7991
"XML": true,
8092
"XMPP": true,
8193
"XSRF": true,
82-
"XSS": true,
94+
"XSS": false,
8395
}
8496

8597
// a thread-safe index of initialisms
@@ -88,6 +100,7 @@ func init() {
88100
initialismsRunes = asRunes(initialisms)
89101
initialismsUpperCased = asUpperCased(initialisms)
90102
maxAllocMatches = maxAllocHeuristic(initialismsRunes)
103+
initialismsPluralForm = asPluralForms(initialisms, commonInitialisms)
91104

92105
// a test function
93106
isInitialism = commonInitialisms.isInitialism
@@ -112,6 +125,16 @@ func asUpperCased(in []string) [][]rune {
112125
return out
113126
}
114127

128+
// asPluralForms bakes an index of pluralization support.
129+
func asPluralForms(in []string, idx *indexOfInitialisms) []pluralForm {
130+
out := make([]pluralForm, len(in))
131+
for i, initialism := range in {
132+
out[i] = idx.pluralForm(initialism)
133+
}
134+
135+
return out
136+
}
137+
115138
func maxAllocHeuristic(in [][]rune) int {
116139
heuristic := make(map[rune]int)
117140
for _, initialism := range in {
@@ -139,12 +162,14 @@ func maxAllocHeuristic(in [][]rune) int {
139162
func AddInitialisms(words ...string) {
140163
for _, word := range words {
141164
// commonInitialisms[upper(word)] = true
142-
commonInitialisms.add(upper(word))
165+
uword := upper(word)
166+
commonInitialisms.add(uword, !strings.HasSuffix(uword, "S"))
143167
}
144168
// sort again
145169
initialisms = commonInitialisms.sorted()
146170
initialismsRunes = asRunes(initialisms)
147171
initialismsUpperCased = asUpperCased(initialisms)
172+
initialismsPluralForm = asPluralForms(initialisms, commonInitialisms)
148173
}
149174

150175
// indexOfInitialisms is a thread-safe implementation of the sorted index of initialisms.
@@ -175,8 +200,8 @@ func (m *indexOfInitialisms) isInitialism(key string) bool {
175200
return ok
176201
}
177202

178-
func (m *indexOfInitialisms) add(key string) *indexOfInitialisms {
179-
m.index.Store(key, true)
203+
func (m *indexOfInitialisms) add(key string, hasPlural bool) *indexOfInitialisms {
204+
m.index.Store(key, hasPlural)
180205
return m
181206
}
182207

@@ -192,6 +217,40 @@ func (m *indexOfInitialisms) sorted() (result []string) {
192217
return
193218
}
194219

220+
// pluralForm denotes the kind of pluralization to be used for initialisms.
221+
//
222+
// At this moment, initialisms are either invariant or follow a simple plural form with an
223+
// extra (lower case) "s".
224+
type pluralForm uint8
225+
226+
const (
227+
notPlural pluralForm = iota
228+
invariantPlural
229+
simplePlural
230+
)
231+
232+
// pluralForm indicates how we want to pluralize a given initialism.
233+
//
234+
// Besides configured invariant forms (like HTTP and HTTPS),
235+
// an initialism is normally pluralized by adding a single 's', like in IDs.
236+
//
237+
// Initialisms ending with an 'S' or an 's' are configured as invariant (we don't
238+
// support plural forms like CSSes or DNSes, however the mechanism could be extended to
239+
// do just that).
240+
func (m *indexOfInitialisms) pluralForm(key string) pluralForm {
241+
v, ok := m.index.Load(key)
242+
if !ok {
243+
return notPlural
244+
}
245+
246+
acceptsPlural := v.(bool)
247+
if !acceptsPlural {
248+
return invariantPlural
249+
}
250+
251+
return simplePlural
252+
}
253+
195254
type byInitialism []string
196255

197256
func (s byInitialism) Len() int {
@@ -200,10 +259,14 @@ func (s byInitialism) Len() int {
200259
func (s byInitialism) Swap(i, j int) {
201260
s[i], s[j] = s[j], s[i]
202261
}
262+
263+
// Less specifies the order in which initialisms are prioritized:
264+
// 1. match longest first
265+
// 2. when equal length, match in reverse lexicographical order, lower case match comes first
203266
func (s byInitialism) Less(i, j int) bool {
204267
if len(s[i]) != len(s[j]) {
205268
return len(s[i]) < len(s[j])
206269
}
207270

208-
return strings.Compare(s[i], s[j]) > 0
271+
return s[i] < s[j]
209272
}

split.go

+57-9
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ type (
2626
initialisms []string
2727
initialismsRunes [][]rune
2828
initialismsUpperCased [][]rune // initialisms cached in their trimmed, upper-cased version
29+
initialismsPluralForm []pluralForm
2930
postSplitInitialismCheck bool
3031
}
3132

@@ -35,6 +36,7 @@ type (
3536
body []rune
3637
start, end int
3738
complete bool
39+
hasPlural pluralForm
3840
}
3941
initialismMatches []initialismMatch
4042
)
@@ -149,6 +151,7 @@ func newSplitter(options ...splitterOption) splitter {
149151
initialisms: initialisms,
150152
initialismsRunes: initialismsRunes,
151153
initialismsUpperCased: initialismsUpperCased,
154+
initialismsPluralForm: initialismsPluralForm,
152155
}
153156

154157
for _, option := range options {
@@ -242,25 +245,68 @@ func (s splitter) gatherInitialismMatches(nameRunes []rune) *initialismMatches {
242245
for _, match := range *matches {
243246
if keepCompleteMatch := match.complete; keepCompleteMatch {
244247
*newMatches = append(*newMatches, match)
248+
249+
// the match is complete: keep it then move on to next rune
245250
continue
246251
}
247252

248-
// drop failed match
249253
currentMatchRune := match.body[currentRunePosition-match.start]
250254
if currentMatchRune != currentRune {
255+
// failed match, move on to next rune
251256
continue
252257
}
253258

254259
// try to complete ongoing match
255260
if currentRunePosition-match.start == len(match.body)-1 {
256261
// we are close; the next step is to check the symbol ahead
257-
// if it is a small letter, then it is not the end of match
258-
// but beginning of the next word
262+
// if it is a lowercase letter, then it is not the end of match
263+
// but the beginning of the next word.
264+
//
265+
// NOTE(fredbi): this heuristic sometimes leads to counterintuitive splits and
266+
// perhaps (not sure yet) we should check against case _alternation_.
267+
//
268+
// Example:
269+
//
270+
// In the current version, in the sentence "IDS initialism", "ID" is recognized as an initialism,
271+
// leading to a split like "id_s_initialism" (or IDSInitialism),
272+
// whereas in the sentence "IDx initialism", it is not and produces something like
273+
// "i_d_x_initialism" (or IDxInitialism). The generated file name is not great.
274+
//
275+
// Both go identifiers are tolerated by linters.
276+
//
277+
// Notice that the slightly different input "IDs initialism" is correctly detected
278+
// as a pluralized initialism and produces something like "ids_initialism" (or IDsInitialism).
259279

260280
if currentRunePosition < len(nameRunes)-1 {
261281
nextRune := nameRunes[currentRunePosition+1]
282+
283+
// recognize a plural form for this initialism (only simple pluralization is supported)
284+
if nextRune == 's' && match.hasPlural == simplePlural {
285+
// detected a pluralized initialism
286+
match.body = append(match.body, nextRune)
287+
currentRunePosition++
288+
if currentRunePosition < len(nameRunes)-1 {
289+
nextRune = nameRunes[currentRunePosition+1]
290+
if newWord := unicode.IsLower(nextRune); newWord {
291+
// it is the start of a new word.
292+
// Match is only partial and the initialism is not recognized : move on
293+
continue
294+
}
295+
}
296+
297+
// this is a pluralized match: keep it
298+
match.complete = true
299+
match.hasPlural = simplePlural
300+
match.end = currentRunePosition
301+
*newMatches = append(*newMatches, match)
302+
303+
// match is complete: keep it then move on to next rune
304+
continue
305+
}
306+
262307
if newWord := unicode.IsLower(nextRune); newWord {
263-
// oh ok, it was the start of a new word
308+
// it is the start of a new word
309+
// Match is only partial and the initialism is not recognized : move on
264310
continue
265311
}
266312
}
@@ -269,18 +315,20 @@ func (s splitter) gatherInitialismMatches(nameRunes []rune) *initialismMatches {
269315
match.end = currentRunePosition
270316
}
271317

318+
// append the ongoing matching attempt (not necessarily complete)
272319
*newMatches = append(*newMatches, match)
273320
}
274321
}
275322

276323
// check for new initialism matches
277324
for i := range s.initialisms {
278-
initialismRunes := s.initialismsRunes[i]
279-
if initialismRunes[0] == currentRune {
325+
r := s.initialismsRunes[i]
326+
if r[0] == currentRune {
280327
*newMatches = append(*newMatches, initialismMatch{
281-
start: currentRunePosition,
282-
body: initialismRunes,
283-
complete: false,
328+
start: currentRunePosition,
329+
body: r,
330+
complete: false,
331+
hasPlural: s.initialismsPluralForm[i],
284332
})
285333
}
286334
}

0 commit comments

Comments
 (0)