Skip to content

Commit 72bf6c0

Browse files
committed
Merge pull request #99 from tps12/leave-%2b-in-query
Make normalize ignore %2B in query strings
2 parents dc1871b + 37131be commit 72bf6c0

File tree

2 files changed

+120
-9
lines changed

2 files changed

+120
-9
lines changed

lib/addressable/uri.rb

+52-9
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,12 @@ def self.join(*uris)
315315
# value is the reserved plus unreserved character classes specified in
316316
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
317317
#
318+
# @param [String] upcase_encoded
319+
# A string of characters that may already be percent encoded, and whose
320+
# encodings should be upcased. This allows normalization of percent
321+
# encodings for characters not included in the
322+
# <code>character_class</code>.
323+
#
318324
# @return [String] The encoded component.
319325
#
320326
# @example
@@ -327,7 +333,8 @@ def self.join(*uris)
327333
# )
328334
# => "simple%2Fexample"
329335
def self.encode_component(component, character_class=
330-
CharacterClasses::RESERVED + CharacterClasses::UNRESERVED)
336+
CharacterClasses::RESERVED + CharacterClasses::UNRESERVED,
337+
upcase_encoded='')
331338
return nil if component.nil?
332339

333340
begin
@@ -356,9 +363,15 @@ def self.encode_component(component, character_class=
356363
component = component.dup
357364
component.force_encoding(Encoding::ASCII_8BIT)
358365
end
359-
return component.gsub(character_class) do |sequence|
366+
component.gsub!(character_class) do |sequence|
360367
(sequence.unpack('C*').map { |c| "%" + ("%02x" % c).upcase }).join
361368
end
369+
if upcase_encoded.length > 0
370+
component.gsub!(/%(#{upcase_encoded.chars.map do |c|
371+
c.unpack('C*').map { |c| '%02x' % c }.join
372+
end.join('|')})/i) { |s| s.upcase }
373+
end
374+
return component
362375
end
363376

364377
class << self
@@ -380,11 +393,15 @@ class << self
380393
# <code>Addressable::URI</code>. All other values are invalid. Defaults
381394
# to <code>String</code>.
382395
#
396+
# @param [String] leave_encoded
397+
# A string of characters to leave encoded. If a percent encoded character
398+
# in this list is encountered then it will remain percent encoded.
399+
#
383400
# @return [String, Addressable::URI]
384401
# The unencoded component or URI.
385402
# The return type is determined by the <code>return_type</code>
386403
# parameter.
387-
def self.unencode(uri, return_type=String)
404+
def self.unencode(uri, return_type=String, leave_encoded='')
388405
return nil if uri.nil?
389406

390407
begin
@@ -398,7 +415,8 @@ def self.unencode(uri, return_type=String)
398415
"got #{return_type.inspect}"
399416
end
400417
result = uri.gsub(/%[0-9a-f]{2}/i) do |sequence|
401-
sequence[1..3].to_i(16).chr
418+
c = sequence[1..3].to_i(16).chr
419+
leave_encoded.include?(c) ? sequence : c
402420
end
403421
result.force_encoding("utf-8") if result.respond_to?(:force_encoding)
404422
if return_type == String
@@ -433,6 +451,13 @@ class << self
433451
# value is the reserved plus unreserved character classes specified in
434452
# <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
435453
#
454+
# @param [String] leave_encoded
455+
# When <code>character_class</code> is a <code>String</code> then
456+
# <code>leave_encoded</code> is a string of characters that should remain
457+
# percent encoded while normalizing the component; if they appear percent
458+
# encoded in the original component, then they will be upcased ("%2f"
459+
# normalized to "%2F") but otherwise left alone.
460+
#
436461
# @return [String] The normalized component.
437462
#
438463
# @example
@@ -447,8 +472,15 @@ class << self
447472
# Addressable::URI::CharacterClasses::UNRESERVED
448473
# )
449474
# => "simple%2Fexample"
475+
# Addressable::URI.normalize_component(
476+
# "one%20two%2fthree%26four",
477+
# "0-9a-zA-Z &/",
478+
# "/"
479+
# )
480+
# => "one two%2Fthree&four"
450481
def self.normalize_component(component, character_class=
451-
CharacterClasses::RESERVED + CharacterClasses::UNRESERVED)
482+
CharacterClasses::RESERVED + CharacterClasses::UNRESERVED,
483+
leave_encoded='')
452484
return nil if component.nil?
453485

454486
begin
@@ -462,19 +494,29 @@ def self.normalize_component(component, character_class=
462494
"Expected String or Regexp, got #{character_class.inspect}"
463495
end
464496
if character_class.kind_of?(String)
465-
character_class = /[^#{character_class}]/
497+
leave_re = if leave_encoded.length > 0
498+
character_class << '%'
499+
500+
"|%(?!#{leave_encoded.chars.map do |c|
501+
seq = c.unpack('C*').map { |c| '%02x' % c }.join
502+
[seq.upcase, seq.downcase]
503+
end.flatten.join('|')})"
504+
end
505+
506+
character_class = /[^#{character_class}]#{leave_re}/
466507
end
467508
if component.respond_to?(:force_encoding)
468509
# We can't perform regexps on invalid UTF sequences, but
469510
# here we need to, so switch to ASCII.
470511
component = component.dup
471512
component.force_encoding(Encoding::ASCII_8BIT)
472513
end
473-
unencoded = self.unencode_component(component)
514+
unencoded = self.unencode_component(component, String, leave_encoded)
474515
begin
475516
encoded = self.encode_component(
476517
Addressable::IDNA.unicode_normalize_kc(unencoded),
477-
character_class
518+
character_class,
519+
leave_encoded
478520
)
479521
rescue ArgumentError
480522
encoded = self.encode_component(unencoded)
@@ -1391,7 +1433,8 @@ def normalized_query
13911433
(self.query.split("&", -1).map do |pair|
13921434
Addressable::URI.normalize_component(
13931435
pair,
1394-
Addressable::URI::CharacterClasses::QUERY.sub("\\&", "")
1436+
Addressable::URI::CharacterClasses::QUERY.sub("\\&", ""),
1437+
'+'
13951438
)
13961439
end).join("&")
13971440
end)

spec/addressable/uri_spec.rb

+68
Original file line numberDiff line numberDiff line change
@@ -3146,6 +3146,10 @@ def to_s
31463146
it "should have query_values of {'q' => 'a b'}" do
31473147
@uri.query_values.should == {'q' => 'a b'}
31483148
end
3149+
3150+
it "should have a normalized query of 'q=a+b'" do
3151+
@uri.normalized_query.should == "q=a+b"
3152+
end
31493153
end
31503154

31513155
describe Addressable::URI, "when parsed from " +
@@ -3161,6 +3165,43 @@ def to_s
31613165
it "should have query_values of {'q' => 'a+b'}" do
31623166
@uri.query_values.should == {'q' => 'a+b'}
31633167
end
3168+
3169+
it "should have a normalized query of 'q=a%2Bb'" do
3170+
@uri.normalized_query.should == "q=a%2Bb"
3171+
end
3172+
end
3173+
3174+
describe Addressable::URI, "when parsed from " +
3175+
"'http://example.com/?v=%7E&w=%&x=%25&y=%2B&z=C%CC%A7'" do
3176+
before do
3177+
@uri = Addressable::URI.parse("http://example.com/?v=%7E&w=%&x=%25&y=%2B&z=C%CC%A7")
3178+
end
3179+
3180+
it "should have a normalized query of 'v=~&w=%25&x=%25&y=%2B&z=%C3%87'" do
3181+
@uri.normalized_query.should == "v=~&w=%25&x=%25&y=%2B&z=%C3%87"
3182+
end
3183+
end
3184+
3185+
describe Addressable::URI, "when parsed from " +
3186+
"'http://example.com/?v=%7E&w=%&x=%25&y=+&z=C%CC%A7'" do
3187+
before do
3188+
@uri = Addressable::URI.parse("http://example.com/?v=%7E&w=%&x=%25&y=+&z=C%CC%A7")
3189+
end
3190+
3191+
it "should have a normalized query of 'v=~&w=%25&x=%25&y=+&z=%C3%87'" do
3192+
@uri.normalized_query.should == "v=~&w=%25&x=%25&y=+&z=%C3%87"
3193+
end
3194+
end
3195+
3196+
describe Addressable::URI, "when parsed from " +
3197+
"'http://example.com/sound%2bvision'" do
3198+
before do
3199+
@uri = Addressable::URI.parse("http://example.com/sound%2bvision")
3200+
end
3201+
3202+
it "should have a normalized path of '/sound+vision'" do
3203+
@uri.normalized_path.should == '/sound+vision'
3204+
end
31643205
end
31653206

31663207
describe Addressable::URI, "when parsed from " +
@@ -4619,6 +4660,19 @@ def to_str
46194660
end
46204661
end
46214662

4663+
describe Addressable::URI, "when normalizing a string but leaving some characters encoded" do
4664+
it "should result in correct percent encoded sequence" do
4665+
Addressable::URI.normalize_component("%58X%59Y%5AZ", "0-9a-zXY", "Y").should ==
4666+
"XX%59Y%5A%5A"
4667+
end
4668+
end
4669+
4670+
describe Addressable::URI, "when encoding a string with existing encodings to upcase" do
4671+
it "should result in correct percent encoded sequence" do
4672+
Addressable::URI.encode_component("JK%4c", "0-9A-IKM-Za-z%", "L").should == "%4AK%4C"
4673+
end
4674+
end
4675+
46224676
describe Addressable::URI, "when encoding a multibyte string" do
46234677
it "should result in correct percent encoded sequence" do
46244678
Addressable::URI.encode_component("günther").should == "g%C3%BCnther"
@@ -4664,6 +4718,20 @@ def to_str
46644718
end
46654719
end
46664720

4721+
describe Addressable::URI, "when partially unencoding a string" do
4722+
it "should unencode all characters by default" do
4723+
Addressable::URI.unencode('%%25~%7e+%2b', String).should == '%%~~++'
4724+
end
4725+
4726+
it "should unencode characters not in leave_encoded" do
4727+
Addressable::URI.unencode('%%25~%7e+%2b', String, '~').should == '%%~%7e++'
4728+
end
4729+
4730+
it "should leave characters in leave_encoded alone" do
4731+
Addressable::URI.unencode('%%25~%7e+%2b', String, '%~+').should == '%%25~%7e+%2b'
4732+
end
4733+
end
4734+
46674735
describe Addressable::URI, "when unencoding a bogus object" do
46684736
it "should raise a TypeError" do
46694737
(lambda do

0 commit comments

Comments
 (0)