@@ -315,6 +315,12 @@ def self.join(*uris)
315
315
# value is the reserved plus unreserved character classes specified in
316
316
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
317
317
#
318
+ # @param [String] upcase_encoded
319
+ # A string of characters that may already be percent encoded, and whose
320
+ # encodings should be upcased. This allows normalization of percent
321
+ # encodings for characters not included in the
322
+ # <code>character_class</code>.
323
+ #
318
324
# @return [String] The encoded component.
319
325
#
320
326
# @example
@@ -327,7 +333,8 @@ def self.join(*uris)
327
333
# )
328
334
# => "simple%2Fexample"
329
335
def self . encode_component ( component , character_class =
330
- CharacterClasses ::RESERVED + CharacterClasses ::UNRESERVED )
336
+ CharacterClasses ::RESERVED + CharacterClasses ::UNRESERVED ,
337
+ upcase_encoded = '' )
331
338
return nil if component . nil?
332
339
333
340
begin
@@ -356,9 +363,15 @@ def self.encode_component(component, character_class=
356
363
component = component . dup
357
364
component . force_encoding ( Encoding ::ASCII_8BIT )
358
365
end
359
- return component . gsub ( character_class ) do |sequence |
366
+ component . gsub! ( character_class ) do |sequence |
360
367
( sequence . unpack ( 'C*' ) . map { |c | "%" + ( "%02x" % c ) . upcase } ) . join
361
368
end
369
+ if upcase_encoded . length > 0
370
+ component . gsub! ( /%(#{ upcase_encoded . chars . map do |c |
371
+ c . unpack ( 'C*' ) . map { |c | '%02x' % c } . join
372
+ end . join ( '|' ) } )/i) { |s | s . upcase }
373
+ end
374
+ return component
362
375
end
363
376
364
377
class << self
@@ -380,11 +393,15 @@ class << self
380
393
# <code>Addressable::URI</code>. All other values are invalid. Defaults
381
394
# to <code>String</code>.
382
395
#
396
+ # @param [String] leave_encoded
397
+ # A string of characters to leave encoded. If a percent encoded character
398
+ # in this list is encountered then it will remain percent encoded.
399
+ #
383
400
# @return [String, Addressable::URI]
384
401
# The unencoded component or URI.
385
402
# The return type is determined by the <code>return_type</code>
386
403
# parameter.
387
- def self . unencode ( uri , return_type = String )
404
+ def self . unencode ( uri , return_type = String , leave_encoded = '' )
388
405
return nil if uri . nil?
389
406
390
407
begin
@@ -398,7 +415,8 @@ def self.unencode(uri, return_type=String)
398
415
"got #{ return_type . inspect } "
399
416
end
400
417
result = uri . gsub ( /%[0-9a-f]{2}/i ) do |sequence |
401
- sequence [ 1 ..3 ] . to_i ( 16 ) . chr
418
+ c = sequence [ 1 ..3 ] . to_i ( 16 ) . chr
419
+ leave_encoded . include? ( c ) ? sequence : c
402
420
end
403
421
result . force_encoding ( "utf-8" ) if result . respond_to? ( :force_encoding )
404
422
if return_type == String
@@ -433,6 +451,13 @@ class << self
433
451
# value is the reserved plus unreserved character classes specified in
434
452
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
435
453
#
454
+ # @param [String] leave_encoded
455
+ # When <code>character_class</code> is a <code>String</code> then
456
+ # <code>leave_encoded</code> is a string of characters that should remain
457
+ # percent encoded while normalizing the component; if they appear percent
458
+ # encoded in the original component, then they will be upcased ("%2f"
459
+ # normalized to "%2F") but otherwise left alone.
460
+ #
436
461
# @return [String] The normalized component.
437
462
#
438
463
# @example
@@ -447,8 +472,15 @@ class << self
447
472
# Addressable::URI::CharacterClasses::UNRESERVED
448
473
# )
449
474
# => "simple%2Fexample"
475
+ # Addressable::URI.normalize_component(
476
+ # "one%20two%2fthree%26four",
477
+ # "0-9a-zA-Z &/",
478
+ # "/"
479
+ # )
480
+ # => "one two%2Fthree&four"
450
481
def self . normalize_component ( component , character_class =
451
- CharacterClasses ::RESERVED + CharacterClasses ::UNRESERVED )
482
+ CharacterClasses ::RESERVED + CharacterClasses ::UNRESERVED ,
483
+ leave_encoded = '' )
452
484
return nil if component . nil?
453
485
454
486
begin
@@ -462,19 +494,29 @@ def self.normalize_component(component, character_class=
462
494
"Expected String or Regexp, got #{ character_class . inspect } "
463
495
end
464
496
if character_class . kind_of? ( String )
465
- character_class = /[^#{ character_class } ]/
497
+ leave_re = if leave_encoded . length > 0
498
+ character_class << '%'
499
+
500
+ "|%(?!#{ leave_encoded . chars . map do |c |
501
+ seq = c . unpack ( 'C*' ) . map { |c | '%02x' % c } . join
502
+ [ seq . upcase , seq . downcase ]
503
+ end . flatten . join ( '|' ) } )"
504
+ end
505
+
506
+ character_class = /[^#{ character_class } ]#{ leave_re } /
466
507
end
467
508
if component . respond_to? ( :force_encoding )
468
509
# We can't perform regexps on invalid UTF sequences, but
469
510
# here we need to, so switch to ASCII.
470
511
component = component . dup
471
512
component . force_encoding ( Encoding ::ASCII_8BIT )
472
513
end
473
- unencoded = self . unencode_component ( component )
514
+ unencoded = self . unencode_component ( component , String , leave_encoded )
474
515
begin
475
516
encoded = self . encode_component (
476
517
Addressable ::IDNA . unicode_normalize_kc ( unencoded ) ,
477
- character_class
518
+ character_class ,
519
+ leave_encoded
478
520
)
479
521
rescue ArgumentError
480
522
encoded = self . encode_component ( unencoded )
@@ -1391,7 +1433,8 @@ def normalized_query
1391
1433
( self . query . split ( "&" , -1 ) . map do |pair |
1392
1434
Addressable ::URI . normalize_component (
1393
1435
pair ,
1394
- Addressable ::URI ::CharacterClasses ::QUERY . sub ( "\\ &" , "" )
1436
+ Addressable ::URI ::CharacterClasses ::QUERY . sub ( "\\ &" , "" ) ,
1437
+ '+'
1395
1438
)
1396
1439
end ) . join ( "&" )
1397
1440
end )
0 commit comments