diff --git a/ext/cgi/escape/escape.c b/ext/cgi/escape/escape.c index 17a134a..495ad83 100644 --- a/ext/cgi/escape/escape.c +++ b/ext/cgi/escape/escape.c @@ -83,7 +83,7 @@ optimized_unescape_html(VALUE str) unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX : strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 : 128); - long i, len, beg = 0; + long i, j, len, beg = 0; size_t clen, plen; int overflow; const char *cstr; @@ -100,6 +100,7 @@ optimized_unescape_html(VALUE str) plen = i - beg; if (++i >= len) break; c = (unsigned char)cstr[i]; + j = i; #define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \ memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \ (i += rb_strlen_lit(s) - 1, 1)) @@ -112,28 +113,40 @@ optimized_unescape_html(VALUE str) else if (MATCH("mp;")) { c = '&'; } - else continue; + else { + i = j; + continue; + } break; case 'q': ++i; if (MATCH("uot;")) { c = '"'; } - else continue; + else { + i = j; + continue; + } break; case 'g': ++i; if (MATCH("t;")) { c = '>'; } - else continue; + else { + i = j; + continue; + } break; case 'l': ++i; if (MATCH("t;")) { c = '<'; } - else continue; + else { + i = j; + continue; + } break; case '#': if (len - ++i >= 2 && ISDIGIT(cstr[i])) { @@ -142,9 +155,15 @@ optimized_unescape_html(VALUE str) else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) { cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow); } - else continue; + else { + i = j; + continue; + } i += clen; - if (overflow || cc >= charlimit || cstr[i] != ';') continue; + if (overflow || cc >= charlimit || cstr[i] != ';') { + i = j; + continue; + } if (!dest) { dest = rb_str_buf_new(len); } diff --git a/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java b/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java index c34d09c..6412fd7 100644 --- a/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java +++ b/ext/java/org/jruby/ext/cgi/escape/CGIEscape.java @@ -140,7 +140,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) { int charlimit = (enc instanceof UTF8Encoding) ? UNICODE_MAX : (enc instanceof ISO8859_1Encoding) ? 256 : 128; - int i, len, beg = 0; + int i, j, len, beg = 0; int clen = 0, plen; boolean overflow = false; byte[] cstrBytes; @@ -160,6 +160,7 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) { plen = i - beg; if (++i >= len) break; c = cstrBytes[cstr + i] & 0xFF; + j = i; switch (c) { case 'a': ++i; @@ -169,28 +170,40 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) { } else if (MATCH(MPSEMI, len, i, cstrBytes, cstr)) { i += MPSEMI.length - 1; c = '&'; - } else continue; + } else { + i = j; + continue; + } break; case 'q': ++i; if (MATCH(UOTSEMI, len, i, cstrBytes, cstr)) { i += UOTSEMI.length - 1; c = '"'; - } else continue; + } else { + i = j; + continue; + } break; case 'g': ++i; if (MATCH(TSEMI, len, i, cstrBytes, cstr)) { i += TSEMI.length - 1; c = '>'; - } else continue; + } else { + i = j; + continue; + } break; case 'l': ++i; if (MATCH(TSEMI, len, i, cstrBytes, cstr)) { i += TSEMI.length - 1; c = '<'; - } else continue; + } else { + i = j; + continue; + } break; case '#': if (len - ++i >= 2 && Character.isDigit(cstrBytes[cstr + i])) { @@ -203,9 +216,15 @@ static boolean MATCH(byte[] s, int len, int i, byte[] cstrBytes, int cstr) { cc = ruby_scan_digits(cstrBytes, cstr + i, len - i, 16, clenOverflow); clen = clenOverflow[0]; overflow = clenOverflow[1] == 1; - } else continue; + } else { + i = j; + continue; + } i += clen; - if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') continue; + if (overflow || cc >= charlimit || i >= len || cstrBytes[cstr + i] != ';') { + i = j; + continue; + } if (dest == null) { dest = RubyString.newStringLight(runtime, len); } diff --git a/test/cgi/test_cgi_util.rb b/test/cgi/test_cgi_util.rb index 1bdc0e4..b0612fc 100644 --- a/test/cgi/test_cgi_util.rb +++ b/test/cgi/test_cgi_util.rb @@ -186,6 +186,22 @@ def test_cgi_unescapeHTML_invalid assert_equal('&<&>"&abcdefghijklmn', CGI.unescapeHTML('&<&>"&abcdefghijklmn')) end + module UnescapeHTMLTests + def test_cgi_unescapeHTML_following_known_first_letter + assert_equal('&a>&q>&l>&g>', CGI.unescapeHTML('&a>&q>&l>&g>')) + end + + def test_cgi_unescapeHTML_following_number_sign + assert_equal('&#>&#x>', CGI.unescapeHTML('&#>&#x>')) + end + + def test_cgi_unescapeHTML_following_invalid_numeric + assert_equal('�>�>', CGI.unescapeHTML('�>�>')) + end + end + + include UnescapeHTMLTests + Encoding.list.each do |enc| begin escaped = "'&"><".encode(enc) @@ -283,6 +299,8 @@ def teardown end if defined?(CGI::Escape) end + include CGIUtilTest::UnescapeHTMLTests + def test_cgi_escapeHTML_with_invalid_byte_sequence assert_equal("<\xA4??>", CGI.escapeHTML(%[<\xA4??>])) end