/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
// Unit-test GoogleUrl.
#include "pagespeed/kernel/http/google_url.h"
#include "pagespeed/kernel/base/gtest.h"
#include "pagespeed/kernel/base/null_mutex.h"
#include "pagespeed/kernel/base/scoped_ptr.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/util/simple_random.h"
namespace {
const char kUrl[] = "http://a.com/b/c/d.ext?f=g/h";
const char kUrlWithPort[] = "http://a.com:8080/b/c/d.ext?f=g/h";
const char kBadQueryString[] =
"\a\b\t\n\v\f\r !\"$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLM"
"NOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\177#extra#more";
} // namespace
namespace net_instaweb {
class GoogleUrlTest : public testing::Test {
protected:
GoogleUrlTest()
: gurl_(kUrl),
gurl_with_port_(kUrlWithPort)
{}
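// Verifies that CopyAndAddQueryParam(key, value) turns `before` into `after`,
// and that the original URL object is left unmodified.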
void TestCopyAndAddQueryParam(const char* before,
const char* key, const char* value,
const char* after) {
GoogleUrl before_url(before);
StringPiece before_url_original(before_url.UncheckedSpec());
scoped_ptr<GoogleUrl> after_url(
before_url.CopyAndAddQueryParam(key, value));
EXPECT_STREQ(after, after_url->UncheckedSpec());
EXPECT_TRUE(after_url->IsWebValid());
EXPECT_STREQ(before_url_original, before_url.UncheckedSpec());
}
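// Checks that AllExceptQuery() applied to `before` yields `after`.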
void TestAllExceptQueryCase(const char* before, const char* after) {
GoogleUrl before_url(before);
EXPECT_STREQ(after, before_url.AllExceptQuery());
}
void TestAllAfterQueryCase(const char* before, const char* after) {
GoogleUrl before_url(before);
EXPECT_STREQ(after, before_url.AllAfterQuery());
}
// RunMostMethods and RunAllMethods parse a URL and run methods to make
// sure they don't crash.
// None of these methods should CHECK-crash if a bad URL is passed in.
// They will LOG(DFATAL)-crash in debug mode, though, so you should use
// EXPECT_DFATAL(RunMostMethods("..."), "");
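// For instance (a hypothetical sketch; the exact URL string is arbitrary):
//   EXPECT_DFATAL(RunMostMethods("this is not a url"), "");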
void RunMostMethods(StringPiece url_string) {
GoogleUrl url(url_string);
url.AllExceptQuery();
url.AllAfterQuery();
url.AllExceptLeaf();
url.LeafWithQuery();
url.LeafSansQuery();
url.PathAndLeaf();
url.PathSansLeaf();
url.PathSansQuery();
url.ExtractFileName();
url.Host();
url.HostAndPort();
url.Origin();
url.Query();
url.Scheme();
url.UncheckedSpec();
url.IntPort();
url.EffectiveIntPort();
}
// Runs all methods, even ones that will CHECK-crash on invalid URLs.
void RunAllMethods(StringPiece url_string) {
RunMostMethods(url_string);
GoogleUrl url(url_string);
ASSERT_TRUE(url.IsAnyValid()) << url_string << " is invalid";
url.IsWebValid();
url.IsWebOrDataValid();
url.Spec();
url.spec_c_str();
url.Relativize(kAbsoluteUrl, url);
url.Relativize(kNetPath, url);
url.Relativize(kAbsolutePath, url);
url.Relativize(kRelativePath, url);
}
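// Round-trips `value` through EscapeQueryParam() and UnescapeQueryParam() and
// expects the original value back.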
void TestEscapeUnescape(StringPiece value) {
EXPECT_STREQ(value,
GoogleUrl::UnescapeQueryParam(
GoogleUrl::EscapeQueryParam(value)));
}
GoogleUrl gurl_;
GoogleUrl gurl_with_port_;
};
// Document which sorts of strings are and are not valid.
TEST_F(GoogleUrlTest, TestNotValid) {
GoogleUrl empty_url;
EXPECT_FALSE(empty_url.IsWebValid());
EXPECT_FALSE(empty_url.IsWebOrDataValid());
EXPECT_FALSE(empty_url.IsAnyValid());
GoogleUrl invalid_url("Hello, world!");
EXPECT_FALSE(invalid_url.IsWebValid());
GoogleUrl relative_url1("/foo/bar.html");
EXPECT_FALSE(relative_url1.IsWebValid());
GoogleUrl relative_url2("foo/bar.html");
EXPECT_FALSE(relative_url2.IsWebValid());
GoogleUrl relative_url3("bar.html");
EXPECT_FALSE(relative_url3.IsWebValid());
// Only http: and https: are considered WebValid.
GoogleUrl proxy_filename("proxy:http://www.example.com/index.html");
EXPECT_FALSE(proxy_filename.IsWebValid());
EXPECT_FALSE(proxy_filename.IsWebOrDataValid());
// But it's still considered to be a valid URL, oddly enough.
EXPECT_TRUE(proxy_filename.IsAnyValid());
GoogleUrl data_url("data:plain/text,foobar");
EXPECT_FALSE(data_url.IsWebValid());
EXPECT_TRUE(data_url.IsWebOrDataValid());
EXPECT_TRUE(data_url.IsAnyValid());
}
TEST_F(GoogleUrlTest, TestSpec) {
EXPECT_STREQ(kUrl, gurl_.Spec());
EXPECT_STREQ("http://a.com/b/c/", gurl_.AllExceptLeaf());
EXPECT_STREQ("d.ext?f=g/h", gurl_.LeafWithQuery());
EXPECT_STREQ("d.ext", gurl_.LeafSansQuery());
EXPECT_STREQ("http://a.com", gurl_.Origin());
EXPECT_STREQ("/b/c/d.ext?f=g/h", gurl_.PathAndLeaf());
EXPECT_STREQ("/b/c/d.ext", gurl_.PathSansQuery());
}
TEST_F(GoogleUrlTest, TestSpecWithPort) {
EXPECT_STREQ(kUrlWithPort, gurl_with_port_.Spec());
EXPECT_STREQ("http://a.com:8080/b/c/", gurl_with_port_.AllExceptLeaf());
EXPECT_STREQ("d.ext?f=g/h", gurl_with_port_.LeafWithQuery());
EXPECT_STREQ("d.ext", gurl_with_port_.LeafSansQuery());
EXPECT_STREQ("http://a.com:8080", gurl_with_port_.Origin());
EXPECT_STREQ("/b/c/d.ext?f=g/h", gurl_with_port_.PathAndLeaf());
EXPECT_STREQ("/b/c/d.ext", gurl_.PathSansQuery());
EXPECT_STREQ("/b/c/", gurl_.PathSansLeaf());
}
TEST_F(GoogleUrlTest, TestDots) {
GoogleUrl url1("http://www.example.com/foo/../bar/baz/./../index.html");
EXPECT_EQ("http://www.example.com/bar/index.html", url1.Spec());
GoogleUrl url2("http://www.example.com/../../../../..");
EXPECT_EQ("http://www.example.com/", url2.Spec());
GoogleUrl url3("http://www.example.com/foo/./bar/index.html");
EXPECT_EQ("http://www.example.com/foo/bar/index.html", url3.Spec());
}
// Note: These are essentially golden tests, kept for documentation more than
// as strict expectations. If the results change that should be fine; we just
// want to know what is decoded and what is not.
TEST_F(GoogleUrlTest, TestDecode) {
// Not "%20" -> " "
GoogleUrl url1("http://www.example.com/foo%20bar.html");
EXPECT_EQ("http://www.example.com/foo%20bar.html", url1.Spec());
// "%2E" -> "."
GoogleUrl url2("http://www.example.com/foo/%2E%2E/bar.html");
EXPECT_EQ("http://www.example.com/bar.html", url2.Spec());
GoogleUrl url2b("http://www.example.com/foo/%2e%2e/bar.html");
EXPECT_EQ("http://www.example.com/bar.html", url2b.Spec());
GoogleUrl url2c("http://www.example.com/%2e%2e/%2e%2e/some/file/path/");
EXPECT_EQ("http://www.example.com/some/file/path/", url2c.Spec());
// Not "%2F" -> "/"
GoogleUrl url3("http://www.example.com/foo%2Fbar.html");
EXPECT_EQ("http://www.example.com/foo%2Fbar.html", url3.Spec());
GoogleUrl url3b("http://www.example.com/foo%2fbar.html");
EXPECT_EQ("http://www.example.com/foo%2fbar.html", url3b.Spec());
// Other
GoogleUrl url4("http://www.example.com/%53%2D%38%25%32%35");
EXPECT_EQ("http://www.example.com/S-8%2525", url4.Spec());
}
TEST_F(GoogleUrlTest, TestCopyAndAddQueryParam) {
TestCopyAndAddQueryParam("http://a.com/b/c/d.ext", "r", "s",
"http://a.com/b/c/d.ext?r=s");
TestCopyAndAddQueryParam("http://a.com/b/c/d.ext?p=q", "r", "s",
"http://a.com/b/c/d.ext?p=q&r=s");
TestCopyAndAddQueryParam("http://a.com", "r", "s",
"http://a.com/?r=s");
TestCopyAndAddQueryParam("http://a.com?p=q", "r", "s",
"http://a.com/?p=q&r=s");
TestCopyAndAddQueryParam("http://a.com/b/c/d.ext?p=q#ref", "r", "s",
"http://a.com/b/c/d.ext?p=q&r=s#ref");
// Escaping example
TestCopyAndAddQueryParam("http://a.com/", "key/?", "val %\t@ ue",
"http://a.com/?key%2f%3f=val+%25%09%40+ue");
// NULL value
TestCopyAndAddQueryParam("http://a.com/", "key", NULL,
"http://a.com/?key");
}
TEST_F(GoogleUrlTest, TestAllExceptQuery) {
TestAllExceptQueryCase("http://a.com/b/c/d.ext",
"http://a.com/b/c/d.ext");
TestAllExceptQueryCase("http://a.com/b/c/d.ext?p=p&q=q",
"http://a.com/b/c/d.ext");
TestAllExceptQueryCase("http://a.com?p=p&q=q",
"http://a.com/");
}
TEST_F(GoogleUrlTest, TestAllAfterQuery) {
TestAllAfterQueryCase("http://a.com/b/c/d.ext",
"");
TestAllAfterQueryCase("http://a.com/b/c/d.ext?p=p&q=q",
"");
TestAllAfterQueryCase("http://a.com/b/c/d.ext?p=p&q=q#ref",
"#ref");
TestAllAfterQueryCase("http://a.com/b/c/d.ext?p=p&q=q#ref1#ref2",
"#ref1#ref2");
TestAllAfterQueryCase("http://a.com#ref",
"#ref");
}
TEST_F(GoogleUrlTest, TestTrivialAllExceptLeaf) {
GoogleUrl queryless("http://a.com/b/c/d.ext");
EXPECT_STREQ("http://a.com/b/c/", queryless.AllExceptLeaf());
GoogleUrl queryful("http://a.com/b/c/d.ext?p=p&q=q");
EXPECT_STREQ("http://a.com/b/c/", queryful.AllExceptLeaf());
}
TEST_F(GoogleUrlTest, TestAllExceptLeafIsIdempotent) {
// In various places the code takes either a full URL or the URL's
// AllExceptLeaf and depends on the fact that calling AllExceptLeaf on either
// gives you the same thing. This test is to catch any breakage of that.
GoogleUrl queryless("http://a.com/b/c/d.ext");
StringPiece all_except_leaf(queryless.AllExceptLeaf());
GoogleUrl all_except_leaf_url(all_except_leaf);
EXPECT_TRUE(all_except_leaf_url.IsWebValid());
EXPECT_EQ(all_except_leaf, all_except_leaf_url.AllExceptLeaf());
}
TEST_F(GoogleUrlTest, TestTrivialLeafSansQuery) {
GoogleUrl queryless("http://a.com/b/c/d.ext");
EXPECT_STREQ("d.ext", queryless.LeafSansQuery());
}
TEST_F(GoogleUrlTest, ResolveRelative) {
GoogleUrl base(StringPiece("http://www.google.com"));
ASSERT_TRUE(base.IsWebValid());
GoogleUrl resolved(base, "test.html");
ASSERT_TRUE(resolved.IsWebValid());
EXPECT_STREQ("http://www.google.com/test.html", resolved.Spec());
EXPECT_STREQ("/test.html", resolved.PathSansQuery());
}
TEST_F(GoogleUrlTest, ResolveAbsolute) {
GoogleUrl base(StringPiece("http://www.google.com"));
ASSERT_TRUE(base.IsWebValid());
GoogleUrl resolved(base, "http://www.google.com");
ASSERT_TRUE(resolved.IsWebValid());
EXPECT_STREQ("http://www.google.com/", resolved.Spec());
EXPECT_STREQ("/", resolved.PathSansQuery());
}
TEST_F(GoogleUrlTest, TestReset) {
GoogleUrl url("http://www.google.com");
EXPECT_TRUE(url.IsWebValid());
url.Clear();
EXPECT_FALSE(url.IsWebValid());
EXPECT_TRUE(url.is_empty());
EXPECT_TRUE(url.UncheckedSpec().empty());
}
TEST_F(GoogleUrlTest, TestHostAndPort) {
const char kExpected5[] = "example.com:5";
EXPECT_EQ(kExpected5, GoogleUrl("http://example.com:5").HostAndPort());
EXPECT_EQ(kExpected5, GoogleUrl("http://example.com:5/a/b").HostAndPort());
EXPECT_EQ(kExpected5, GoogleUrl("https://example.com:5").HostAndPort());
const char kExpected[] = "example.com";
EXPECT_EQ(kExpected, GoogleUrl("http://example.com").HostAndPort());
EXPECT_EQ(kExpected, GoogleUrl("http://example.com/a/b").HostAndPort());
EXPECT_EQ(kExpected, GoogleUrl("https://example.com").HostAndPort());
}
TEST_F(GoogleUrlTest, TestPort) {
EXPECT_EQ(5, GoogleUrl("http://example.com:5").IntPort());
EXPECT_EQ(5, GoogleUrl("http://example.com:5").EffectiveIntPort());
EXPECT_EQ(5, GoogleUrl("https://example.com:5").IntPort());
EXPECT_EQ(5, GoogleUrl("https://example.com:5").EffectiveIntPort());
EXPECT_EQ(-1, GoogleUrl("http://example.com").IntPort());
EXPECT_EQ(80, GoogleUrl("http://example.com").EffectiveIntPort());
EXPECT_EQ(-1, GoogleUrl("https://example.com").IntPort());
EXPECT_EQ(443, GoogleUrl("https://example.com").EffectiveIntPort());
}
TEST_F(GoogleUrlTest, TestExtraSlash) {
// We always preserve // in URLs.
GoogleUrl no_base("http://example.com//extra_slash/index.html");
EXPECT_STREQ("http://example.com//extra_slash/index.html", no_base.Spec());
GoogleUrl base("http://www.example.com");
GoogleUrl early_slashes(base, "http://a.com//extra_slash/index.html");
EXPECT_STREQ("http://a.com//extra_slash/index.html", early_slashes.Spec());
GoogleUrl late_slashes(base, "http://a.com/extra_slash//index.html");
EXPECT_STREQ("http://a.com/extra_slash//index.html", late_slashes.Spec());
GoogleUrl three_slashes(base, "http://a.com///extra_slash/index.html");
EXPECT_STREQ("http://a.com///extra_slash/index.html", three_slashes.Spec());
}
TEST_F(GoogleUrlTest, SchemeRelativeBase) {
GoogleUrl base("http://www.example.com");
GoogleUrl resolved(base, "//other.com/file.ext");
ASSERT_TRUE(resolved.IsWebValid());
EXPECT_STREQ("http://other.com/file.ext", resolved.Spec());
}
TEST_F(GoogleUrlTest, SchemeRelativeHttpsBase) {
GoogleUrl base("https://www.example.com");
GoogleUrl resolved(base, "//other.com/file.ext");
ASSERT_TRUE(resolved.IsWebValid());
EXPECT_STREQ("https://other.com/file.ext", resolved.Spec());
}
TEST_F(GoogleUrlTest, SchemeRelativeNoBase) {
GoogleUrl gurl("//other.com/file.ext");
EXPECT_FALSE(gurl.IsWebValid());
}
TEST_F(GoogleUrlTest, FindRelativity) {
EXPECT_EQ(kAbsoluteUrl, GoogleUrl::FindRelativity(
"http://example.com/foo/bar/file.ext?k=v#f"));
EXPECT_EQ(kNetPath, GoogleUrl::FindRelativity(
"//example.com/foo/bar/file.ext?k=v#f"));
EXPECT_EQ(kAbsolutePath, GoogleUrl::FindRelativity(
"/foo/bar/file.ext?k=v#f"));
EXPECT_EQ(kRelativePath, GoogleUrl::FindRelativity(
"bar/file.ext?k=v#f"));
}
TEST_F(GoogleUrlTest, Relativize) {
GoogleUrl url("http://example.com/foo/bar/file.ext?k=v#f");
GoogleUrl good_base_url("http://example.com/foo/index.html");
EXPECT_EQ("http://example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kAbsoluteUrl, good_base_url));
EXPECT_EQ("//example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kNetPath, good_base_url));
EXPECT_EQ("/foo/bar/file.ext?k=v#f",
url.Relativize(kAbsolutePath, good_base_url));
EXPECT_EQ("bar/file.ext?k=v#f",
url.Relativize(kRelativePath, good_base_url));
GoogleUrl bad_base_url("https://www.example.com/other/path.html");
EXPECT_EQ("http://example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kAbsoluteUrl, bad_base_url));
EXPECT_EQ("http://example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kNetPath, bad_base_url));
EXPECT_EQ("http://example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kAbsolutePath, bad_base_url));
EXPECT_EQ("http://example.com/foo/bar/file.ext?k=v#f",
url.Relativize(kRelativePath, bad_base_url));
GoogleUrl double_slash("http://example.com//index.html");
GoogleUrl double_base("http://example.com/");
EXPECT_EQ("http://example.com//index.html",
double_slash.Relativize(kAbsoluteUrl, double_base));
// Safe to use net path.
EXPECT_EQ("//example.com//index.html",
double_slash.Relativize(kNetPath, double_base));
// We cannot shorten to "//index.html", that looks like a net path.
// Perhaps we could shorten to "/.//index.html" instead.
EXPECT_EQ("http://example.com//index.html",
double_slash.Relativize(kAbsolutePath, double_base));
// We cannot shorten to "/index.html", that looks like an absolute path.
// Perhaps we could shorten to ".//index.html" instead.
EXPECT_EQ("http://example.com//index.html",
double_slash.Relativize(kRelativePath, double_base));
GoogleUrl query_url("http://example.com/?bar");
GoogleUrl query_base("http://example.com/foo.html");
// We cannot shorten to "?bar", because that would refer to foo.html?bar
EXPECT_EQ("http://example.com/?bar",
query_url.Relativize(kRelativePath, query_base));
GoogleUrl fragment_url("http://example.com/#bar");
GoogleUrl fragment_base("http://example.com/foo.html");
// We cannot shorten to "#bar", because that would refer to foo.html#bar
EXPECT_EQ("http://example.com/#bar",
fragment_url.Relativize(kRelativePath, fragment_base));
GoogleUrl perverse_url("http://example.com/http://otherdomain.com/");
GoogleUrl perverse_base("http://example.com/");
// We cannot shorten to "http://otherdomain.com/" ... obviously.
EXPECT_EQ("http://example.com/http://otherdomain.com/",
perverse_url.Relativize(kRelativePath, perverse_base));
GoogleUrl no_slash_base("http://example.com");
GoogleUrl no_slash_url("http://example.com/index.html");
// Make sure we don't strip this to "/index.html".
EXPECT_EQ("index.html",
no_slash_url.Relativize(kRelativePath, no_slash_base));
}
TEST_F(GoogleUrlTest, RelativeUrls) {
const StringPiece base_urls[] = {
"http://example.com/",
"https://example.com/foo/bar/file.ext?k=v#f",
"file:///dir/sub/foo.html",
"file://",
};
// URLs which will be reproduced by Relativize().
const StringPiece reproducible_urls[] = {
"http://example.com/", "/", "/foo.html", "foo.html", "dir/foo.html",
"//example.com/foo.html",
};
for (int i = 0; i < arraysize(reproducible_urls); ++i) {
UrlRelativity url_relativity =
GoogleUrl::FindRelativity(reproducible_urls[i]);
for (int j = 0; j < arraysize(base_urls); ++j) {
GoogleUrl base(base_urls[j]);
GoogleUrl url(base, reproducible_urls[i]);
EXPECT_EQ(reproducible_urls[i], url.Relativize(url_relativity, base));
}
}
// URLs which will change after Relativize().
// Format: { (original url), (after relativized w/ base_urls[0]),
// (after relativized w/ base_urls[1]), ... }
const StringPiece non_reproducible_urls[][arraysize(base_urls) + 1] = {
{ "../file.html",
"file.html", "https://example.com/foo/file.html",
"file:///dir/file.html", "file.html", },
{ "../../file.html",
"file.html", "https://example.com/file.html",
"file:///file.html", "file.html", },
{ "./file.html",
"file.html", "file.html", "file.html", "file.html", },
{ "../bar/file.html",
"bar/file.html", "file.html",
"file:///dir/bar/file.html", "bar/file.html", },
{ "",
"", "file.ext?k=v", "foo.html", "", },
{ "?a=b",
"?a=b", "file.ext?a=b", "foo.html?a=b", "?a=b", },
{ "#f2",
"#f2", "file.ext?k=v#f2", "foo.html#f2", "#f2", },
{ ".",
"", "https://example.com/foo/bar/", "file:///dir/sub/", "", },
};
for (int i = 0; i < arraysize(non_reproducible_urls); ++i) {
StringPiece in_url = non_reproducible_urls[i][0];
UrlRelativity url_relativity = GoogleUrl::FindRelativity(in_url);
for (int j = 0; j < arraysize(base_urls); ++j) {
GoogleUrl base(base_urls[j]);
GoogleUrl url(base, in_url);
StringPiece expected_url = non_reproducible_urls[i][j+1];
EXPECT_EQ(expected_url, url.Relativize(url_relativity, base))
<< in_url << " " << base_urls[j];
}
}
}
// Make sure weird URLs don't crash our system.
TEST_F(GoogleUrlTest, TestNoCrash) {
RunAllMethods("http://www.example.com/");
RunAllMethods("http:foo.css");
RunAllMethods("data:");
RunAllMethods("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAA"
"FCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljN"
"BAAO9TXL0Y4OHwAAAABJRU5ErkJggg==");
RunAllMethods("https://secure.example.org/foo/bar.html?blah=t#frag");
RunAllMethods("file:");
RunAllMethods("file:foobar");
RunAllMethods("file:foo/bar/baz.rtf");
RunAllMethods("file:///var/log/");
RunAllMethods("ftp://ftp.example.com/");
}
TEST_F(GoogleUrlTest, Query) {
// First try very simple names and values.
GoogleUrl gurl("http://example.com/a?b=c&d=e");
ASSERT_TRUE(gurl.IsWebValid());
EXPECT_STREQ("b=c&d=e", gurl.Query());
// Now use a URL that will require escaping.
gurl.Reset("http://example.com/a?b=<value requiring escapes>");
ASSERT_TRUE(gurl.IsWebValid());
EXPECT_STREQ("b=%3Cvalue%20requiring%20escapes%3E", gurl.Query());
EXPECT_STREQ("b=<value requiring escapes>",
GoogleUrl::UnescapeQueryParam(gurl.Query()));
}
TEST_F(GoogleUrlTest, URLQuery) {
// Test that the result of Query() is encoded as we expect (and rely on in
// QueryParams), so that we know if our assumptions become invalid:
// 1. HASHes ('#') cannot be in the result as that terminates the component.
// 2. TABs, NLs, & CRs are removed completely.
// 3. Control characters are % encoded.
// 4. SPACE (' '), DOUBLE QUOTE ('"'), LESS THAN ('<'), GREATER THAN ('>'),
// and DEL ('\177') are % encoded.
// 5. In the open source build, which uses chromium's version of the URL
// libraries, single-quote ("'") is also %-encoded.
// 6. HASH ('#') would be encoded but it cannot be in the result per 1 above.
// The code that does all this is:
// * ParsePath in url_parse.cc extracts the query part, starting at the
// first '?' and ending at the end of the string or the first '#'.
// * url::RemoveURLWhitespace strips tabs, newlines, & carriage returns
// per the url::IsRemovableURLWhitespace method.
// * url::IsQueryChar results in the encoding of control characters,
// the characters in [ "#<>], & DEL (& "'" in open source), however since
// '#' terminates the query part it cannot be in the result.
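// Illustrative sketch of assumption 2 above (not asserted in this test): a
// query such as "a=\tb\nc" is expected to come back from Query() as "a=bc",
// with the tab and newline stripped rather than %-encoded.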
GoogleString good_query_param1(kBadQueryString);
ASSERT_EQ(1, GlobalReplaceSubstring("#more", "", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("#extra", "", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\t", "", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\n", "", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\r", "", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\a", "%07", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\b", "%08", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\v", "%0B", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\f", "%0C", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring(" ", "%20", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\"", "%22", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("<", "%3C", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring(">", "%3E", &good_query_param1));
ASSERT_EQ(1, GlobalReplaceSubstring("\177", "%7F", &good_query_param1));
GoogleString good_query_param2 = good_query_param1;
ASSERT_EQ(1, GlobalReplaceSubstring("'", "%27", &good_query_param2));
// Despite all the ugliness in the query parameter, it's [now] a valid URL.
GoogleUrl gurl(StrCat("http://example.com/?", kBadQueryString));
ASSERT_TRUE(gurl.IsAnyValid());
ASSERT_TRUE(!gurl.Query().empty());
if (gurl.Query() != good_query_param1 && gurl.Query() != good_query_param2) {
EXPECT_TRUE(false)
<< "gurl.Query() does not equal either of the expected values:\n"
<< " Actual: " << gurl.Query() << "\n"
<< "Expected: " << good_query_param1 << "\n"
<< " Or: " << good_query_param2;
}
}
TEST_F(GoogleUrlTest, UnescapeQueryParam) {
EXPECT_STREQ("", GoogleUrl::UnescapeQueryParam(""));
EXPECT_STREQ("noescaping", GoogleUrl::UnescapeQueryParam("noescaping"));
EXPECT_STREQ("http://example.com:8080/src/example.html?a=b&a=c,d",
GoogleUrl::UnescapeQueryParam(
"http%3A%2f%2Fexample.com%3A8080%2Fsrc%2Fexample.html"
"%3Fa%3Db%26a%3dc%2Cd"));
EXPECT_STREQ("%:%1z%zZ%a%", GoogleUrl::UnescapeQueryParam("%%3a%1z%zZ%a%"));
}
TEST_F(GoogleUrlTest, EscapeQueryParam) {
EXPECT_STREQ("Hello1234-5678_910~",
GoogleUrl::EscapeQueryParam("Hello1234-5678_910~"));
// Note, even commas are escaped :(.
EXPECT_STREQ("Hello%2c+World%21",
GoogleUrl::EscapeQueryParam("Hello, World!"));
TestEscapeUnescape("Hello, World!");
TestEscapeUnescape("Hello1234-5678_910~");
TestEscapeUnescape("noescaping");
TestEscapeUnescape("http://example.com:8080/src/example.html?a=b&a=c,d");
TestEscapeUnescape("%:%1z%zZ%a%");
// Ensure that we correctly encode/decode all characters.
TestEscapeUnescape(kBadQueryString);
// Ensure that we correctly re-encode/decode an already-encoded string.
TestEscapeUnescape(GoogleUrl::EscapeQueryParam(kBadQueryString));
}
TEST_F(GoogleUrlTest, Sanitize) {
// Test URL-looking example.
EXPECT_STREQ(
"http://example.com/messy,file:name%20with%25lots%20of%22punctuation",
GoogleUrl::Sanitize(
"http://example.com/messy,file:name with%25lots%20of\"punctuation"));
// Note: We currently do not escape % -> %25. This is because the purpose of
// GoogleUrl::Sanitize() is to guarantee that the result does not contain
// certain chars (like ' ', '"', etc.), not to make sure that we have the
// canonical escaping.
EXPECT_STREQ("%", GoogleUrl::Sanitize("%"));
// Note: We do not unescape %67%64%62%67 -> gdbg even though these are
// unreserved chars and we could. As noted above, the purpose of this
// function is to sanitize, not to produce a canonical version.
EXPECT_STREQ("%67%64%62%67", GoogleUrl::Sanitize("%67%64%62%67"));
// Test all special chars.
const char symbols[] = "~`!@#$%^&*()-_=+[{]}\\|;:'\",<.>/?";
const char escaped_symbols[] =
"~%60!@#$%%5E&*()-_=+[%7B]%7D%5C%7C;:'%22,%3C.%3E/?";
EXPECT_STREQ(escaped_symbols, GoogleUrl::Sanitize(symbols));
// Test that escaping is idempotent.
EXPECT_STREQ(escaped_symbols, GoogleUrl::Sanitize(escaped_symbols));
// Test idempotence for all chars.
for (int c = 0; c < 0x80; ++c) {
GoogleString s;
s.push_back(static_cast<char>(c));
GoogleString escaped_s = GoogleUrl::Sanitize(s);
EXPECT_STREQ(escaped_s, GoogleUrl::Sanitize(escaped_s));
}
// Test idempotence on a large random example.
SimpleRandom random_generator(new NullMutex);
GoogleString random = random_generator.GenerateHighEntropyString(1000);
GoogleString escaped_random = GoogleUrl::Sanitize(random);
EXPECT_STREQ(escaped_random, GoogleUrl::Sanitize(escaped_random));
}
} // namespace net_instaweb