| /* |
| * Copyright 2012 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmarantz@google.com (Joshua Marantz) |
| |
| #include "pagespeed/kernel/html/canonical_attributes.h" |
| |
| #include "pagespeed/kernel/base/gtest.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/html/html_parse.h" |
| #include "pagespeed/kernel/html/html_parse_test_base.h" |
| |
| |
| namespace net_instaweb { |
| |
| class CanonicalAttributesTest : public HtmlParseTestBase { |
| protected: |
| CanonicalAttributesTest() : canonical_attributes_(&html_parse_) { |
| html_parse_.AddFilter(&canonical_attributes_); |
| } |
| |
| virtual bool AddBody() const { return false; } |
| |
| GoogleString Image(const StringPiece& image) { |
| return StrCat("<img src='", image, "'>"); |
| } |
| |
| CanonicalAttributes canonical_attributes_; |
| }; |
| |
| TEST_F(CanonicalAttributesTest, Unescaped) { |
| ValidateExpected( |
| "unescaped", |
| Image("a.png?a=b&c=d"), |
| Image("a.png?a=b&c=d")); |
| EXPECT_EQ(1, canonical_attributes_.num_changes()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Escaped) { |
| ValidateNoChanges( |
| "escaped", |
| Image("a.png?a=b&c=d")); |
| } |
| |
| TEST_F(CanonicalAttributesTest, QueryWithEncodedAmpersand) { |
| // Mixed usage of unterminated & followed by a well-formed &. We |
| // correct the usage here. |
| ValidateExpected( |
| "ampersand", |
| Image("discuss/a.php?&action=vtopic&forum=2"), |
| Image("discuss/a.php?&action=vtopic&forum=2")); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Numeric) { |
| ValidateNoChanges( |
| "numeric_escape", |
| "<a title='’'>b</a>"); |
| } |
| |
| TEST_F(CanonicalAttributesTest, NonUtf8) { |
| // The input ú is transformed to its symbolic form &ucacute;. |
| ValidateExpected( |
| "non_utf8", |
| "<a title='ú'>b</a>", |
| "<a title='ú'>b</a>"); |
| EXPECT_EQ(1, canonical_attributes_.num_changes()); |
| EXPECT_EQ(0, canonical_attributes_.num_errors()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Spanish) { |
| // The 8-bit input cannot be processed; we consider it a decoding error, so |
| // we live the text alone. |
| ValidateNoChanges("spanish", "<a title='muñecos'>b</a>"); |
| EXPECT_EQ(0, canonical_attributes_.num_changes()); |
| EXPECT_EQ(1, canonical_attributes_.num_errors()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, AccentedSingleByteEscape) { |
| // We can transfer single-byte escapes into 8-bit and back without loss. |
| ValidateNoChanges("spanish", "<a title='ãÝ'>b</a>"); |
| EXPECT_EQ(1, canonical_attributes_.num_changes()); |
| EXPECT_EQ(0, canonical_attributes_.num_errors()); |
| } |
| |
| // |
| // TODO(jmarantz): fix handling of empty attribute names. |
| // TEST_F(CanonicalAttributesTest, EmptyAttrName) { |
| // ValidateNoChanges("empty_attr_name", "<img ='109'/>"); |
| // } |
| |
| TEST_F(CanonicalAttributesTest, Nasa) { |
| ValidateNoChanges("nasa", "<a title='NASA’s Budget'>b</a>"); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Retronaut) { |
| ValidateNoChanges("retronaut", |
| "<link title='Retronaut » Feed'/>"); |
| } |
| |
| TEST_F(CanonicalAttributesTest, SingleQuoteInAttr) { |
| // This is fully valid, and we rewrite the attribute, but no textual |
| // change takes place. |
| ValidateNoChanges("squote", "<link title=\"a's b » Feed\">"); |
| EXPECT_EQ(1, canonical_attributes_.num_changes()); |
| EXPECT_EQ(0, canonical_attributes_.num_errors()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Hellip) { |
| // … exists, but would need to be unescaped as multi-byte so we do |
| // not process it. |
| ValidateNoChanges("hellip", "<input value='Search this website …'/>"); |
| EXPECT_EQ(0, canonical_attributes_.num_changes()); |
| EXPECT_EQ(1, canonical_attributes_.num_errors()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Yuml) { |
| // Ÿ & ÿ both exist, but Ÿ is multi-byte, so we error out. |
| // ÿ is single-byte, so we process it properly. |
| ValidateNoChanges("Yuml", "<input value='Search this website Ÿ'/>"); |
| EXPECT_EQ(0, canonical_attributes_.num_changes()); |
| EXPECT_EQ(1, canonical_attributes_.num_errors()); |
| ValidateNoChanges("yuml", "<input value='Search this website ÿ'/>"); |
| EXPECT_EQ(1, canonical_attributes_.num_changes()); |
| EXPECT_EQ(0, canonical_attributes_.num_errors()); |
| } |
| |
| TEST_F(CanonicalAttributesTest, Truncated) { |
| // Here we "correct" the missing ";" in the input. |
| ValidateExpected("truncated", |
| "<link href='foo.css?user=z&'/>", |
| "<link href='foo.css?user=z&'/>"); |
| } |
| |
| TEST_F(CanonicalAttributesTest, EndsWithAmpersand) { |
| // Here we "correct" the missing "amp;" in the input. |
| ValidateExpected("ends_with_ampersand", |
| "<link href='foo.css?user=z&'/>", |
| "<link href='foo.css?user=z&'/>"); |
| } |
| |
| TEST_F(CanonicalAttributesTest, EndsWithValue) { |
| // Here we "correct" the input, transforming & to &. |
| ValidateExpected("ends", |
| "<link href='a/b?c=d&e=a&t'>", |
| "<link href='a/b?c=d&e=a&t'>"); |
| } |
| |
| } // namespace net_instaweb |